Skip to content

Added jobs and people in company_scraper #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 87 additions & 13 deletions scrape_linkedin/Company.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@
class Company(ResultsObject):
    """LinkedIn Company Profile object.

    Parses the raw HTML scraped from each tab of a LinkedIn company page
    into BeautifulSoup trees; the matching properties (overview, jobs,
    life, insights, people) extract structured data from those trees.
    """

    # Tabs exposed by this object; each name maps to a property below.
    attributes = ['overview', 'jobs', 'life', 'insights', 'people']
    # KD adds insights attribute

    def __init__(self, overview, jobs, life, insights, people):
        """Build soup trees from the per-tab HTML strings.

        Each argument is the outerHTML scraped from the corresponding
        company tab, or '' when that tab was not scraped.
        """
        # KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work
        self.overview_soup = BeautifulSoup(overview, 'html.parser')
        self.jobs_soup = BeautifulSoup(jobs, 'html.parser')
        self.life_soup = BeautifulSoup(life, 'html.parser')
        self.insights_soup = BeautifulSoup(insights, 'html.parser')
        self.people_soup = BeautifulSoup(people, 'html.parser')
        # KD adds insights soup

@property
Expand All @@ -29,25 +30,25 @@ def overview(self):

# Banner containing company Name + Location
banner = one_or_default(
self.overview_soup, '.org-top-card')

self.overview_soup, 'section.org-top-card')
# Main container with company overview info
container = one_or_default(
self.overview_soup, '.org-grid__core-rail--wide')

self.overview_soup, 'section.artdeco-card.p4.mb3')
overview = {}
overview['description'] = container.select_one(
'section > p').get_text().strip()

metadata_keys = container.select('.org-page-details__definition-term')
print(metadata_keys)
# print(metadata_keys)
metadata_keys = [
x for x in metadata_keys if "Company size" not in x.get_text()]
print(metadata_keys)
# print(metadata_keys)
metadata_values = container.select(
'.org-page-details__definition-text')
overview.update(
get_info(banner, {'name': '.org-top-card-summary__title'})) # A fix to the name selector
get_info(banner, {'name': '.t-24.t-black.t-bold'})) # A fix to the name selector
overview.update(
get_info(container, {'company_size': '.org-about-company-module__company-size-definition-text'})) # Manually added Company size

Expand All @@ -56,10 +57,10 @@ def overview(self):
dict_val = val.get_text().strip()
if "company_size" not in dict_key:
overview[dict_key] = dict_val
print(overview)
# print(overview)

all_employees_links = all_or_default(
banner, '.mt2 > a > span') # A fix to locate "See all ### employees on LinkedIn"
banner, '.mt1 > div > a:nth-of-type(2) > span') # A fix to locate "See all ### employees on LinkedIn"

if all_employees_links:
all_employees_text = all_employees_links[-1].text
Expand All @@ -80,7 +81,51 @@ def overview(self):

@property
def jobs(self):
    """Return a dict of job postings parsed from the jobs-tab HTML.

    Keys are role titles; values are dicts with keys: company, location,
    remote, posted, applicants, job_type, employees, recruiting — each
    None when the corresponding element is missing from the card.
    Cards with no role title element are skipped.

    NOTE(review): two postings sharing the same role title collide —
    the later card overwrites the earlier one.
    """
    def _text(el):
        # Missing elements stay None, mirroring how absent card fields
        # are reported in the result dict.
        return el.get_text().strip() if el else None

    jobs = {}
    containers = self.jobs_soup.select(
        'div.jobs-unified-top-card__content--two-pane')
    for container in containers:
        role_el = container.select_one('h2.t-24.t-bold')
        if role_el is None:
            # No role title -> no usable dict key; skip this card.
            continue
        role = role_el.get_text().strip()

        # Location/remote live in a shared grouping span; guard against
        # cards that lack it (previously an AttributeError crash).
        l_r_container = container.select_one(
            'span.jobs-unified-top-card__subtitle-primary-grouping')
        location = remote = None
        if l_r_container is not None:
            location = _text(l_r_container.select_one('span:nth-of-type(2)'))
            remote = _text(l_r_container.select_one('span:nth-of-type(3)'))

        # Insight spans: [0]=job type, [1]=employee count, [3]=recruiting
        # info when present (index 2 is skipped in the original logic).
        job_details = container.select(
            'div.jobs-unified-top-card__job-insight > span')
        recruiting = job_type = employees = None
        if len(job_details) > 1:
            job_type = job_details[0].get_text().strip()
            employees = job_details[1].get_text().strip()
        if len(job_details) > 3:
            recruiting = job_details[3].get_text().strip()

        jobs[role] = {
            "company": _text(container.select_one('a.ember-view.t-black.t-normal')),
            "location": location,
            "remote": remote,
            "posted": _text(container.select_one('span.jobs-unified-top-card__posted-date')),
            "applicants": _text(container.select_one('span.jobs-unified-top-card__applicant-count')),
            "job_type": job_type,
            "employees": employees,
            "recruiting": recruiting,
        }
    return jobs

@property
def life(self):
Expand All @@ -103,3 +148,32 @@ def insights(self):

}))
return insights

@property
def people(self):
    """Return people-tab data parsed from the people-tab HTML.

    Result shape:
        {'Stats': {heading: [bar_text, ...], ...},
         'People_you_may_know': {name: info_text, ...}}
    """
    people = {
        'Stats': {},
        'People_you_may_know': {}
    }

    # Each carousel item holds one bar-graph stat (e.g. "Where they
    # live") with a heading and percentage-bar entries.
    stats_containers = self.people_soup.select('div.artdeco-carousel__item-container')
    for container in stats_containers:
        heading_el = container.select_one('div > div > div > h4')
        if heading_el is None:
            # Guard: previously an AttributeError crash on headless items.
            continue
        heading = heading_el.get_text().strip()
        elements = container.select('.org-people-bar-graph-element__percentage-bar-info')
        people['Stats'][heading] = [el.get_text().strip() for el in elements]

    # Profile cards: name + short blurb.
    people_containers = self.people_soup.select('div.org-people-profile-card__profile-info')
    for container in people_containers:
        name = container.select_one(
            'div.org-people-profile-card__profile-title')
        if name:
            name = name.get_text().strip()
        info_el = container.select_one(
            'div.lt-line-clamp.lt-line-clamp--multi-line')
        # Guard: blurb div can be absent; report None instead of crashing.
        info = info_el.get_text().strip() if info_el else None
        people['People_you_may_know'][name] = info
    return people
125 changes: 103 additions & 22 deletions scrape_linkedin/CompanyScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,32 @@
from .Scraper import Scraper
from .utils import AnyEC

import time

logger = logging.getLogger(__name__)


class CompanyScraper(Scraper):
def scrape(self, company, overview=True, jobs=True, life=False, insights=False, people=True):
    """Scrape the selected tabs of a LinkedIn company page.

    :param company: company slug as it appears in the LinkedIn URL.
    :param overview/jobs/life/insights/people: flags selecting which
        tabs to fetch; skipped tabs contribute '' to the Company object.
    :return: Company built from the scraped per-tab HTML strings.
    """
    self.load_initial(company)
    # Remembered so click_on_tab can build tab URLs for this company.
    self.company = company

    jobs_html = life_html = insights_html = overview_html = people_html = ''

    if overview:
        overview_html = self.get_overview()
    if life:
        life_html = self.get_life()
    if insights:
        insights_html = self.get_insights()
    if people:
        people_html = self.get_people()
    if jobs:
        # NOTE(review): jobs is scraped last — presumably because
        # get_jobs navigates away into the job-search UI; confirm.
        jobs_html = self.get_jobs()

    return Company(overview_html, jobs_html, life_html, insights_html, people_html)

def load_initial(self, company):
url = 'https://www.linkedin.com/company/{}'.format(company)
Expand All @@ -52,24 +58,29 @@ def load_initial(self, company):
except:
raise ValueError(
'Company Unavailable: Company link does not match any companies on LinkedIn')

def click_on_tab(self, tab_name):
    """Navigate the driver straight to a tab of the current company page.

    :param tab_name: URL suffix of the tab ('about', 'jobs', 'people', ...).

    On navigation failure, logs a warning and returns; callers treat the
    tab as unavailable.
    """
    main_url = "https://www.linkedin.com/company/{}/".format(self.company)
    try:
        self.driver.get(main_url + tab_name)
    except Exception:
        # Use the module's logging style rather than printing to stdout.
        logging.getLogger(__name__).warning(
            "Tab %s cannot be found.", tab_name)
        return


def get_overview(self):
    """Return the About tab's main content outerHTML, or '' on failure.

    Navigates to the 'about' tab, waits for the overview card to render,
    then grabs the surrounding layout row so Company.overview can parse it.
    """
    try:
        self.click_on_tab('about')
        self.wait_for_el(
            'section.artdeco-card.p4.mb3')
        return self.driver.find_element_by_css_selector(
            'div.scaffold-layout__row.scaffold-layout__content').get_attribute('outerHTML')
    except Exception:
        # Best-effort: any scraping failure yields an empty overview.
        return ''

def get_life(self):
try:
tab_link = self.driver.find_element_by_css_selector(
'a[data-control-name="page_member_main_nav_life_tab"]')
tab_link.click()
self.click_on_tab('life')
self.wait_for_el(
'a[data-control-name="page_member_main_nav_life_tab"].active')
return self.driver.find_element_by_css_selector('.org-life').get_attribute('outerHTML')
Expand All @@ -78,22 +89,92 @@ def get_life(self):

def get_jobs(self):
    """Collect the top-card outerHTML of every posted job, or '' on failure.

    Opens the jobs tab, follows the "see all jobs" link into the job
    search UI, then visits each job card (across all pagination pages)
    and concatenates the card HTML for Company.jobs to parse.
    """
    try:
        self.click_on_tab('jobs')
        self.wait_for_el(
            'a.link-without-hover-visited.mt5.ember-view')
        # Click the jobs link via JS — NOTE(review): presumably because a
        # plain .click() is intercepted by overlays; confirm.
        self.driver.execute_script(
            "document.getElementsByClassName('link-without-hover-visited mt5 ember-view')[0].click()")
        time.sleep(5)
        job_html = ''

        def click_on_job():
            # Visit each card on the current results page; return the
            # concatenated top-card HTML.
            html = ''
            containers = self.driver.find_elements_by_css_selector(
                'li.jobs-search-results__list-item.occludable-update.p0.relative')
            for container in containers:
                self.driver.execute_script(
                    "arguments[0].scrollIntoView(true);", container)
                container.find_element_by_css_selector(
                    'div.job-card-container.relative').click()
                time.sleep(1)
                self.wait_for_el(
                    'div.jobs-unified-top-card__job-insight')
                html += self.driver.find_element_by_css_selector(
                    'div.jobs-unified-top-card__content--two-pane').get_attribute('outerHTML')
            return html

        # Pagination: the last numbered indicator tells how many pages
        # exist; advance by clicking the button labelled "Page N+1".
        buttons = self.driver.find_elements_by_css_selector(
            'li.artdeco-pagination__indicator.artdeco-pagination__indicator--number')
        if buttons:
            last_button = int(buttons[-1].get_attribute('data-test-pagination-page-btn'))
            for page in range(1, last_button + 1):
                job_html += click_on_job()
                buttons = self.driver.find_elements_by_css_selector(
                    'li.artdeco-pagination__indicator.artdeco-pagination__indicator--number > button')
                for next_page in buttons:
                    # aria-label is of the form "Page N".
                    if int(next_page.get_attribute('aria-label').split()[1]) == page + 1:
                        next_page.click()
                        time.sleep(2)
                        break
        else:
            # Single page of results: no pagination controls at all.
            job_html += click_on_job()

        return job_html
    except Exception:
        # Best-effort: any scraping failure yields no jobs HTML.
        return ''

def get_insights(self):
    """Return the premium-insights module outerHTML, or '' on failure.

    NOTE(review): after navigating by URL this still waits for the old
    nav-tab '.active' selector — it may always time out and fall through
    to ''; confirm against the live page.
    """
    try:
        self.click_on_tab('home')
        self.wait_for_el(
            'a[data-control-name="page_member_main_nav_insights_tab"].active')
        return self.driver.find_element_by_css_selector(
            '.org-premium-insights-module').get_attribute('outerHTML')
    except Exception:
        # Best-effort: insights are premium-only and frequently absent.
        return ''

def get_people(self):
    """Return stats-carousel HTML plus the profile-card list HTML, or '' on failure.

    Pages through the people-tab stats carousel (two items visible per
    page), then scrolls the page repeatedly to force lazy loading of the
    "people you may know" cards before grabbing the whole card container.
    """
    try:
        self.click_on_tab('people')
        self.wait_for_el(
            'div.artdeco-card.pv5.pl5.pr1.mt4')

        # Carousel shows two items per page: collect the visible pair,
        # then advance with the "next" button.
        stats = self.driver.find_elements_by_css_selector(
            '.artdeco-carousel__item-container')
        stats_container = ''
        for index in range(0, len(stats), 2):
            stats_container += stats[index].get_attribute('outerHTML')
            if index == len(stats) - 1:
                # Odd item count: no second item on the final page.
                break
            stats_container += stats[index + 1].get_attribute('outerHTML')
            self.driver.find_element_by_css_selector(
                'button.artdeco-pagination__button.artdeco-pagination__button--next').click()
            time.sleep(1)

        # Scroll down then slightly up, repeatedly, to trigger lazy
        # loading of profile cards.
        # NOTE(review): fixed 100 iterations * 2s sleep is ~200s per
        # company — consider stopping once scrollHeight stops growing.
        for _ in range(100):
            height = self.driver.execute_script('return document.body.scrollHeight')
            self.driver.execute_script('window.scrollTo(0, ' + str(height) + ');')
            time.sleep(1)
            self.driver.execute_script('window.scrollTo(0, ' + str(height - 100) + ');')
            time.sleep(1)

        people_container = self.driver.find_element_by_css_selector(
            '.artdeco-card.pv5.pl5.pr1.mt4').get_attribute('outerHTML')
        return stats_container + people_container
    except Exception:
        # Best-effort: any scraping failure yields no people HTML.
        return ''
5 changes: 3 additions & 2 deletions scrape_linkedin/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,9 @@ def scrape(url, user, company, attribute, input_file, headless, output_file, dri
output = profile.to_dict()

if output_file:
with open(output_file, 'w') as outfile:
json.dump(output, outfile)
with open(output_file, 'w', encoding='utf-8') as outfile:
# outfile.write(str(output))
json.dump(output, outfile, ensure_ascii=False)
else:
pprint(output)

Expand Down