SCRAPING EDUCATION AND EXPERIENCE HISTORY FROM LINKEDIN
GOAL:
This project aims to scrape education and experience history from LinkedIn profile URLs using BeautifulSoup, linkedin_scraper, and Selenium.
In [ ]:
from linkedin_scraper import Person, actions
from selenium import webdriver
from bs4 import BeautifulSoup
from random import randint
import numpy as np
import time
import pandas as pd
In [ ]:
# Load the spreadsheet that lists the profiles to scrape.
# 'PATH OF FILE' is a placeholder; replace it with the real workbook path.
df = pd.read_excel('PATH OF FILE')

# `to_list` has to be *called*: without the parentheses `urls` would be a bound
# method object, not a list, and could not be iterated later on.
# Empty cells are read as NaN, so they are dropped before building the list.
urls = df['Personal LinkedIn Profile'].dropna().to_list()
In [ ]:
# Bare expression: when run as a notebook cell this displays the value of
# `urls` for a quick visual check. It has no other effect.
urls
In [ ]:
import pandas as pd
def convert_dataframe(x):
    """Flatten one profile's records into a single-row DataFrame.

    Parameters
    ----------
    x : list of dict
        Records belonging to one profile. A record that contains the key
        ``'company_name'`` is treated as an experience entry; a record that
        contains the key ``'school_name'`` is treated as an education entry.

    Returns
    -------
    pandas.DataFrame
        One row. The i-th experience entry fills the columns
        ``profile_title_i``, ``company_name_i`` and ``timeframe_i``; the i-th
        education entry fills ``school_namei`` and ``degree_namei``.
        Experience columns come first, then education columns. Fields missing
        from a record are stored as an empty string. An empty list yields a
        frame without any columns.
    """
    # Number the columns by *entry*, not by unique company/school name:
    # two positions at the same company are still two entries, and counting
    # unique names would produce fewer labels than there are values.
    experience_items = [item for item in x if 'company_name' in item]
    education_items = [item for item in x if 'school_name' in item]

    # Build label -> value pairs together so they can never get out of step.
    row = {}
    for i, item in enumerate(experience_items, start=1):
        row[f"profile_title_{i}"] = item.get('profile_title', '')
        row[f"company_name_{i}"] = item['company_name']
        row[f"timeframe_{i}"] = item.get('timeframe', '')
    for i, item in enumerate(education_items, start=1):
        # Education labels keep their historical form without an underscore
        # before the index, so existing consumers of these names keep working.
        row[f"school_name{i}"] = item['school_name']
        row[f"degree_name{i}"] = item.get('degree_name', '')

    return pd.DataFrame([row])
In [ ]:
# Login credentials (placeholders - fill in before running this cell).
email = "YOUR E-MAIL"
password = "YOUR PASSWORD"

# Module-level lists for scraped experience and education records.
experiences, education = [], []

# Start Chrome with a random 5-15 second pause before and after,
# then sign in with the credentials above.
time.sleep(randint(5, 15))
driver = webdriver.Chrome()
time.sleep(randint(5, 15))
actions.login(driver, email, password)
In [ ]:
def search_experiences_education(url):
    """Scrape the experience and education sections of one profile page.

    Loads ``url`` in the module-level Selenium ``driver``, parses the rendered
    page with BeautifulSoup, and hands the de-duplicated records to
    ``convert_dataframe``.

    Parameters
    ----------
    url : str
        Address of the profile page to open.

    Returns
    -------
    pandas.DataFrame
        Whatever ``convert_dataframe`` builds from this profile's records.

    Notes
    -----
    Records are collected in lists local to this call, so the result of one
    profile never contains entries scraped from a previously visited profile.
    A section that is missing from the page simply contributes no records.
    """
    # Random 5-15 s pauses around the navigation (presumably to pace requests
    # and to give the page time to render before its source is read).
    time.sleep(randint(5, 15))
    driver.get(url)
    time.sleep(randint(5, 15))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    experiences = _parse_experiences(soup)
    education = _parse_education(soup)

    # Drop exact repeats while keeping first-seen order.
    fields = ('profile_title', 'company_name', 'timeframe',
              'school_name', 'degree_name')
    seen = set()
    unique_data = []
    for item in experiences + education:
        key = tuple(item.get(field, '') for field in fields)
        if key not in seen:
            seen.add(key)
            unique_data.append(item)

    return convert_dataframe(unique_data)


def _clean_text(text):
    """Remove newlines and surrounding whitespace from scraped text."""
    return text.replace('\n', '').strip()


def _section_entries(soup, section_id):
    """Return the <li> tags of the section anchored by ``section_id``, or []."""
    anchor = soup.find('div', {'id': section_id})
    if anchor is None:
        return []
    try:
        container = anchor.find_next('div').find_next(
            'div', {"class": "pvs-list__outer-container"})
        return container.find('ul').find_all('li')
    except AttributeError:
        # Some step of the lookup returned None: the list is not on the page.
        return []


def _parse_experiences(soup):
    """Collect title, company and timeframe for each experience entry."""
    records = []
    for entry in _section_entries(soup, 'experience'):
        try:
            col = entry.find_next(
                "div", {"class": "display-flex flex-column full-width"})
            profile_title = col.find_next('div').find_next('span').find_next('span').text
            company_name = col.find_next(
                'span', {"class": "t-14 t-normal"}).find_next('span').text
            timeframe = col.find_all(
                'span', {"class": "t-14 t-normal t-black--light"})[0].find_next('span').text
        except (AttributeError, IndexError):
            # Entry does not have the expected markup; skip it instead of
            # aborting the whole profile.
            continue
        records.append({
            "profile_title": _clean_text(profile_title),
            "company_name": _clean_text(company_name),
            "timeframe": _clean_text(timeframe),
        })
    return records


def _parse_education(soup):
    """Collect school and degree for each education entry."""
    records = []
    for entry in _section_entries(soup, 'education'):
        try:
            col = entry.find_next(
                "div", {"class": "display-flex flex-wrap align-items-center full-height"})
            school_name = col.find_next('div').find_next('span').find_next('span').text
            degree_name = col.find_next(
                'span', {"class": "t-14 t-normal"}).find_next('span').text
        except AttributeError:
            # Entry does not have the expected markup; skip it.
            continue
        records.append({
            "school_name": _clean_text(school_name),
            "degree_name": _clean_text(degree_name),
        })
    return records
In [ ]:
# Shut down the browser driver that was created for scraping.
# NOTE(review): in file order this cell sits *before* the loop that calls
# search_experiences_education(url), which still needs `driver`. Confirm the
# intended execution order; this presumably should run after scraping.
driver.close()
driver.quit()
In [ ]:
import numpy as np
# Fallback column layout used when there is nothing to scrape.
columns = ['profile_title_1', 'company_name_1', 'timeframe_1']

# Scrape every profile; each call returns that profile's DataFrame.
profile_frames = [search_experiences_education(url) for url in urls]

if profile_frames:
    # Stack the per-profile frames into one table. Profiles have different
    # numbers of entries, so concat aligns on column names and leaves NaN
    # where a profile has no value for a column.
    merged_df = pd.concat(profile_frames, ignore_index=True)
else:
    merged_df = pd.DataFrame(columns=columns)
