SCRAPING EDUCATION AND EXPERIENCE HISTORY FROM LINKEDIN

GOAL:

This project aims to scrape education and experience history information from LinkedIn profile URLs, using BeautifulSoup, linkedin_scraper, and Selenium.

linkedin_data

In [ ]:

from linkedin_scraper import Person, actions
from selenium import webdriver
from bs4 import BeautifulSoup
from random import randint
import numpy as np
import time
import pandas as pd

In [ ]:

# Load the spreadsheet of profiles; replace 'PATH OF FILE' with the real path.
df = pd.read_excel('PATH OF FILE')
# `to_list` is a method and must be *called*; without the parentheses `urls`
# would be a bound method object and could not be iterated over later.
# Empty cells are dropped so that no NaN is handed to the browser as a URL.
urls = df['Personal LinkedIn Profile'].dropna().to_list()

In [ ]:

# Notebook display cell: show `urls` to inspect what was loaded above.
urls

In [ ]:

import pandas as pd

def convert_dataframe(x):
    """Flatten a list of experience/education dicts into a one-row DataFrame.

    Args:
        x: List of dicts. A dict containing a ``company_name`` key is treated
            as an experience entry (``profile_title``, ``company_name``,
            ``timeframe``); a dict containing a ``school_name`` key is treated
            as an education entry (``school_name``, ``degree_name``). Dicts
            with neither key are ignored.

    Returns:
        A DataFrame with a single row. Experience columns come first, named
        ``profile_title_<i>``, ``company_name_<i>``, ``timeframe_<i>``,
        followed by education columns named ``school_name<i>`` and
        ``degree_name<i>`` (no underscore before the index; kept as-is so
        existing column names do not change). ``<i>`` counts entries from 1 in
        input order. An empty DataFrame is returned when there is nothing to
        flatten.
    """
    # Number columns per *entry*, not per unique company/school name: two
    # roles at the same company must still get two sets of columns.
    experience_items = [item for item in x if 'company_name' in item]
    education_items = [item for item in x if 'school_name' in item]

    row = {}
    for i, item in enumerate(experience_items, start=1):
        row[f"profile_title_{i}"] = item.get('profile_title')
        row[f"company_name_{i}"] = item.get('company_name')
        row[f"timeframe_{i}"] = item.get('timeframe')
    for i, item in enumerate(education_items, start=1):
        row[f"school_name{i}"] = item.get('school_name')
        row[f"degree_name{i}"] = item.get('degree_name')

    if not row:
        return pd.DataFrame()
    # Dict insertion order defines the column order of the resulting frame.
    return pd.DataFrame([row])

In [ ]:

# Module-level accumulators: search_experiences_education() appends every
# scraped experience / education entry to these lists.
experiences = []
education = []

# Pause for a random 5-15 seconds (randint is inclusive on both ends).
# NOTE(review): presumably to make the traffic look less automated - confirm.
time.sleep(randint(5,15))

# Start a Chrome browser session controlled through Selenium.
driver = webdriver.Chrome()
time.sleep(randint(5,15))

# Placeholder credentials: replace with real values before running.
# NOTE(review): avoid committing real credentials in the notebook.
email = "YOUR E-MAIL"
password = "YOUR PASSWORD"
# Log the browser session in using linkedin_scraper's `actions.login` helper.
actions.login(driver, email, password)

In [ ]:

def _clean_text(text):
    """Remove newlines and surrounding whitespace from scraped text."""
    return text.replace('\n', '').strip()


def _section_entries(soup, section_id):
    """Return the <li> entries of the section anchored at div#section_id, or []."""
    anchor = soup.find('div', {'id': section_id})
    if anchor is None:
        # The profile has no such section (e.g. no education listed).
        return []
    try:
        container = anchor.find_next('div').find_next(
            'div', {"class": "pvs-list__outer-container"})
        return container.find('ul').find_all('li')
    except AttributeError:
        # Page layout differs from what the selectors expect.
        return []


def _parse_experience(entry):
    """Extract title, company and timeframe from one experience <li>."""
    col = entry.find_next("div", {"class": "display-flex flex-column full-width"})
    profile_title = col.find_next('div').find_next('span').find_next('span').text
    company_name = col.find_next('span', {"class": "t-14 t-normal"}).find_next('span').text
    timeframe = col.find_all('span', {"class": "t-14 t-normal t-black--light"})[0].find_next('span').text
    return {
        "profile_title": _clean_text(profile_title),
        "company_name": _clean_text(company_name),
        "timeframe": _clean_text(timeframe),
    }


def _parse_education(entry):
    """Extract school and degree from one education <li>."""
    col = entry.find_next("div", {"class": "display-flex flex-wrap align-items-center full-height"})
    school_name = col.find_next('div').find_next('span').find_next('span').text
    degree_name = col.find_next('span', {"class": "t-14 t-normal"}).find_next('span').text
    return {
        "school_name": _clean_text(school_name),
        "degree_name": _clean_text(degree_name),
    }


def _parse_entries(entries, parser):
    """Apply `parser` to each entry, skipping entries that lack the expected markup."""
    parsed = []
    for entry in entries:
        try:
            parsed.append(parser(entry))
        except (AttributeError, IndexError):
            # A lookup returned None or an empty list: not a regular entry.
            continue
    return parsed


def _drop_duplicates(items):
    """Return `items` without repeats, comparing on all five scraped fields."""
    seen = set()
    unique = []
    for item in items:
        key = (
            item.get('profile_title', ''),
            item.get('company_name', ''),
            item.get('timeframe', ''),
            item.get('school_name', ''),
            item.get('degree_name', ''),
        )
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


def search_experiences_education(url):
    """Scrape one LinkedIn profile and return its history as a one-row DataFrame.

    Loads `url` in the module-level Selenium `driver` (which must already be
    logged in), parses the experience and education sections of the rendered
    page, removes duplicate entries and flattens the result with
    `convert_dataframe`.

    Args:
        url: Profile URL to open.

    Returns:
        The DataFrame produced by `convert_dataframe` for this profile only,
        or an empty DataFrame when neither section yielded any entry.

    Side effects:
        Sleeps a random 5-15 seconds before and after loading the page, and
        appends this profile's entries to the module-level `experiences` and
        `education` lists.
    """
    time.sleep(randint(5,15))
    driver.get(url)
    # Second pause before reading the page source, as in the original flow.
    time.sleep(randint(5,15))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Collect into per-call lists so the returned frame describes only this
    # profile; building it from the shared module-level lists would mix in
    # every profile scraped before.
    profile_experiences = _parse_entries(
        _section_entries(soup, 'experience'), _parse_experience)
    profile_education = _parse_entries(
        _section_entries(soup, 'education'), _parse_education)

    # Keep the module-level lists as a running log of everything scraped.
    experiences.extend(profile_experiences)
    education.extend(profile_education)

    unique_data = _drop_duplicates(profile_experiences + profile_education)
    if not unique_data:
        return pd.DataFrame()
    return convert_dataframe(unique_data)

In [ ]:

# Shut the browser down: close the current window, then end the WebDriver session.
# NOTE(review): this cell appears before the scraping loop in the file, but the
# loop still needs `driver`; run it only after all URLs have been processed.
driver.close()
driver.quit()

In [ ]:

import numpy as np
# Fallback columns used when no profile yields any data.
columns = ['profile_title_1', 'company_name_1', 'timeframe_1']

# Scrape every profile and keep each result instead of discarding it.
profile_frames = []
for url in urls:
    profile_frames.append(search_experiences_education(url))

# Stack the results, one row per profile. Profiles have different numbers of
# entries, so columns are aligned by name and missing values become NaN.
non_empty_frames = [frame for frame in profile_frames if not frame.empty]
if non_empty_frames:
    merged_df = pd.concat(non_empty_frames, ignore_index=True, sort=False)
else:
    merged_df = pd.DataFrame(columns=columns)