From addc1b8715fa67be81c2e24db61b44851494d72a Mon Sep 17 00:00:00 2001 From: Anilabha Datta Date: Tue, 24 Aug 2021 15:33:20 +0530 Subject: [PATCH 1/2] added jobs and people in company_scraper --- scrape_linkedin/Company.py | 100 +++++++++++++++++++--- scrape_linkedin/CompanyScraper.py | 132 ++++++++++++++++++++++++------ scrape_linkedin/cli.py | 3 +- 3 files changed, 198 insertions(+), 37 deletions(-) diff --git a/scrape_linkedin/Company.py b/scrape_linkedin/Company.py index 26f1b6f..e716e34 100644 --- a/scrape_linkedin/Company.py +++ b/scrape_linkedin/Company.py @@ -12,15 +12,16 @@ class Company(ResultsObject): """Linkedin User Profile Object""" - attributes = ['overview', 'jobs', 'life', 'insights'] + attributes = ['overview', 'jobs', 'life', 'insights', 'people'] # KD adds insights attribute - def __init__(self, overview, jobs, life, insights): + def __init__(self, overview, jobs, life, insights, people): # KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work self.overview_soup = BeautifulSoup(overview, 'html.parser') self.jobs_soup = BeautifulSoup(jobs, 'html.parser') self.life_soup = BeautifulSoup(life, 'html.parser') self.insights_soup = BeautifulSoup(insights, 'html.parser') + self.people_soup = BeautifulSoup(people, 'html.parser') # KD adds insights soup @property @@ -29,25 +30,25 @@ def overview(self): # Banner containing company Name + Location banner = one_or_default( - self.overview_soup, '.org-top-card') - + self.overview_soup, 'section.org-top-card') + # Main container with company overview info container = one_or_default( - self.overview_soup, '.org-grid__core-rail--wide') - + self.overview_soup, 'section.artdeco-card.p4.mb3') + overview = {} overview['description'] = container.select_one( 'section > p').get_text().strip() - + metadata_keys = container.select('.org-page-details__definition-term') - print(metadata_keys) + # print(metadata_keys) metadata_keys = [ x for x in metadata_keys if "Company size" not in x.get_text()] - print(metadata_keys) + # print(metadata_keys) metadata_values = container.select( '.org-page-details__definition-text') overview.update( - get_info(banner, {'name': '.org-top-card-summary__title'})) # A fix to the name selector + get_info(banner, {'name': '.t-24.t-black.t-bold'})) # A fix to the name selector overview.update( get_info(container, {'company_size': '.org-about-company-module__company-size-definition-text'})) # Manually added Company size @@ -56,10 +57,10 @@ def overview(self): dict_val = val.get_text().strip() if "company_size" not in dict_key: overview[dict_key] = dict_val - print(overview) + # print(overview) all_employees_links = all_or_default( - banner, '.mt2 > a > span') # A fix to locate "See all ### employees on LinkedIn" + banner, '.mt1 > div > a:nth-of-type(2) > span') # A fix to locate "See all ### employees on LinkedIn" if all_employees_links: all_employees_text = all_employees_links[-1].text @@ -80,7 +81,51 @@ def overview(self): @property def jobs(self): - return None + jobs = {} + containers = self.jobs_soup.select( + 'div.jobs-unified-top-card__content--two-pane') + for container in containers: + role = container.select_one( + 'h2.t-24.t-bold') + company = container.select_one( + 'a.ember-view.t-black.t-normal') + l_r_container = container.select_one( + 'span.jobs-unified-top-card__subtitle-primary-grouping') + location = l_r_container.select_one( + 'span:nth-of-type(2)') + remote = l_r_container.select_one( + 'span:nth-of-type(3)') + posted = container.select_one( + 'span.jobs-unified-top-card__posted-date') + applicants = container.select_one( + 'span.jobs-unified-top-card__applicant-count') + job_details = container.select( + 'div.jobs-unified-top-card__job-insight > span') + + recruiting, job_type, employees = None, None, None + if len(job_details) > 1: + job_type = job_details[0].get_text().strip() + employees = job_details[1].get_text().strip() + if len(job_details) > 3: + recruiting = job_details[3].get_text().strip() + if company: + company = company.get_text().strip() + if location: + location = location.get_text().strip() + if remote: + remote = remote.get_text().strip() + if posted: + posted = posted.get_text().strip() + if applicants: + applicants = applicants.get_text().strip() + if role: + role = role.get_text().strip() + elif not role: + continue + jobs[role] = {"company": company, "location": location, + "remote": remote, "posted": posted, "applicants": applicants, + "job_type": job_type, "employees": employees, "recruiting": recruiting} + return jobs @property def life(self): @@ -103,3 +148,32 @@ def insights(self): })) return insights + + @property + def people(self): + people = { + 'Stats': {}, + 'People_you_may_know': {} + } + stats_containers = self.people_soup.select('div.artdeco-carousel__item-container') + for container in stats_containers: + heading = container.select_one( + 'div > div > div > h4').get_text().strip() + people['Stats'].update({heading: []}) + elements = container.select('.org-people-bar-graph-element__percentage-bar-info') + for element in elements: + text = element.get_text().strip() + people['Stats'][heading].append(text) + + people_containers = self.people_soup.select('div.org-people-profile-card__profile-info') + for container in people_containers: + name = container.select_one( + 'div.org-people-profile-card__profile-title') + if name: + name = name.get_text().strip() + info = container.select_one( + 'div.lt-line-clamp.lt-line-clamp--multi-line').get_text().strip() + # image_url = container.select_one('div > div > a > img')['src'] + # image_url = None + people['People_you_may_know'][name] = info + return people diff --git a/scrape_linkedin/CompanyScraper.py b/scrape_linkedin/CompanyScraper.py index 9013c38..433d42c 100644 --- a/scrape_linkedin/CompanyScraper.py +++ b/scrape_linkedin/CompanyScraper.py @@ -9,26 +9,31 @@ from .Scraper import Scraper from .utils import AnyEC +import time + logger = logging.getLogger(__name__) class CompanyScraper(Scraper): - def scrape(self, company, overview=True, jobs=False, life=False, insights=False): + def scrape(self, company, overview=True, jobs=True, life=False, insights=False, people=True): # Get Overview self.load_initial(company) - jobs_html = life_html = insights_html = overview_html = '' + jobs_html = life_html = insights_html = overview_html = people_html = '' if overview: overview_html = self.get_overview() if life: life_html = self.get_life() - if jobs: - jobs_html = self.get_jobs() if insights: insights_html = self.get_insights() + if people: + people_html = self.get_people() + if jobs: + jobs_html = self.get_jobs() + #print("JOBS", jobs_html, "\n\n\n\n\nLIFE", life_html) - return Company(overview_html, jobs_html, life_html, insights_html) + return Company(overview_html, jobs_html, life_html, insights_html, people_html) def load_initial(self, company): url = 'https://www.linkedin.com/company/{}'.format(company) @@ -52,24 +57,31 @@ def load_initial(self, company): except: raise ValueError( 'Company Unavailable: Company link does not match any companies on LinkedIn') - + + def click_on_tab(self, tab_name): + try: + tabs = self.driver.find_elements_by_css_selector( + 'li.org-page-navigation__item.m0 > a') + for tab in tabs: + if tab.text == tab_name: + tab.click() + return + except: + return + def get_overview(self): try: - tab_link = self.driver.find_element_by_css_selector( - 'a[data-control-name="page_member_main_nav_about_tab"]') - tab_link.click() + self.click_on_tab('About') self.wait_for_el( - 'a[data-control-name="page_member_main_nav_about_tab"].active') + 'section.artdeco-card.p4.mb3') return self.driver.find_element_by_css_selector( - '.organization-outlet').get_attribute('outerHTML') + 'div.scaffold-layout__row.scaffold-layout__content').get_attribute('outerHTML') except: return '' def get_life(self): try: - tab_link = self.driver.find_element_by_css_selector( - 'a[data-control-name="page_member_main_nav_life_tab"]') - tab_link.click() + self.click_on_tab('Life') self.wait_for_el( 'a[data-control-name="page_member_main_nav_life_tab"].active') return self.driver.find_element_by_css_selector('.org-life').get_attribute('outerHTML') @@ -78,22 +90,96 @@ def get_life(self): def get_jobs(self): try: - tab_link = self.driver.find_element_by_css_selector( - 'a[data-control-name="page_member_main_nav_jobs_tab"]') - tab_link.click() + self.click_on_tab('Jobs') self.wait_for_el( - 'a[data-control-name="page_member_main_nav_jobs_tab"].active') - return self.driver.find_element_by_css_selector('.org-jobs-container').get_attribute('outerHTML') + 'a.link-without-hover-visited.mt5.ember-view') + self.driver.execute_script( + "document.getElementsByClassName('link-without-hover-visited mt5 ember-view')[0].click()") + time.sleep(5) + job_html = '' + + def click_on_job(): + html = '' + containers = self.driver.find_elements_by_css_selector( + 'li.jobs-search-results__list-item.occludable-update.p0.relative') + for container in containers: + self.driver.execute_script( + "arguments[0].scrollIntoView(true);", container) + container.find_element_by_css_selector( + 'div.job-card-container.relative').click() + time.sleep(1) + self.wait_for_el( + 'div.jobs-unified-top-card__job-insight') + html += self.driver.find_element_by_css_selector( + 'div.jobs-unified-top-card__content--two-pane').get_attribute('outerHTML') + return html + + buttons = self.driver.find_elements_by_css_selector( + 'li.artdeco-pagination__indicator.artdeco-pagination__indicator--number') + if buttons: + last_button = int(buttons[-1].get_attribute('data-test-pagination-page-btn')) + for page in range(1, last_button + 1): + job_html += click_on_job() + buttons = self.driver.find_elements_by_css_selector( + 'li.artdeco-pagination__indicator.artdeco-pagination__indicator--number > button') + for next_page in buttons: + if int(next_page.get_attribute('aria-label').split()[1]) == page + 1: + next_page.click() + time.sleep(2) + break + else: + job_html += click_on_job() + + with open('output.html', 'w', encoding = "utf-8") as output: + output.write(str(job_html)) + return job_html except: return '' def get_insights(self): try: - tab_link = self.driver.find_element_by_css_selector( - 'a[data-control-name="page_member_main_nav_insights_tab"]') - tab_link.click() + self.click_on_tab('Home') self.wait_for_el( 'a[data-control-name="page_member_main_nav_insights_tab"].active') - return self.driver.find_element_by_css_selector('.org-premium-insights-module').get_attribute('outerHTML') + return self.driver.find_element_by_css_selector( + '.org-premium-insights-module').get_attribute('outerHTML') except: return '' + + def get_people(self): + try: + self.click_on_tab('People') + self.wait_for_el( + 'div.artdeco-card.pv5.pl5.pr1.mt4') + + + + + stats = self.driver.find_elements_by_css_selector( + '.artdeco-carousel__item-container') + stats_container = '' + for index in range(0, len(stats), 2): + stats_container += stats[index].get_attribute('outerHTML') + if index == len(stats)-1: + break + stats_container += stats[index+1].get_attribute('outerHTML') + self.driver.find_element_by_css_selector( + 'button.artdeco-pagination__button.artdeco-pagination__button--next').click() + time.sleep(1) + + for _ in range(100): + height = self.driver.execute_script('return document.body.scrollHeight') + scroll = 'window.scrollTo(0, ' + str(height) + ');' + self.driver.execute_script(scroll) + time.sleep(1) + scroll_2 = 'window.scrollTo(0, ' + str(height-100) + ');' + self.driver.execute_script(scroll_2) + time.sleep(1) + people_container = self.driver.find_element_by_css_selector( + '.artdeco-card.pv5.pl5.pr1.mt4').get_attribute('outerHTML') + + with open('new_out.html','w',encoding = "utf-8") as out: + out.write(str(stats_container + people_container)) + return stats_container + people_container + except: + return '' \ No newline at end of file diff --git a/scrape_linkedin/cli.py b/scrape_linkedin/cli.py index ff37253..d03b15e 100644 --- a/scrape_linkedin/cli.py +++ b/scrape_linkedin/cli.py @@ -89,7 +89,8 @@ def scrape(url, user, company, attribute, input_file, headless, output_file, dri output = profile.to_dict() if output_file: - with open(output_file, 'w') as outfile: + with open(output_file, 'w', encoding='utf-8') as outfile: + # outfile.write(str(output)) json.dump(output, outfile) else: pprint(output) From e94d7334b234d3b545fd4df58d5d14e5e4ec3617 Mon Sep 17 00:00:00 2001 From: Anilabha Datta Date: Tue, 31 Aug 2021 17:43:18 +0530 Subject: [PATCH 2/2] supports all languages and fixed json output of cyrillic characters --- scrape_linkedin/CompanyScraper.py | 33 +++++++++++++------------------ scrape_linkedin/cli.py | 2 +- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/scrape_linkedin/CompanyScraper.py b/scrape_linkedin/CompanyScraper.py index 433d42c..a7a03b8 100644 --- a/scrape_linkedin/CompanyScraper.py +++ b/scrape_linkedin/CompanyScraper.py @@ -18,6 +18,7 @@ class CompanyScraper(Scraper): def scrape(self, company, overview=True, jobs=True, life=False, insights=False, people=True): # Get Overview self.load_initial(company) + self.company = company jobs_html = life_html = insights_html = overview_html = people_html = '' @@ -59,19 +60,17 @@ def load_initial(self, company): 'Company Unavailable: Company link does not match any companies on LinkedIn') def click_on_tab(self, tab_name): + main_url = "https://www.linkedin.com/company/{}/".format(self.company) try: - tabs = self.driver.find_elements_by_css_selector( - 'li.org-page-navigation__item.m0 > a') - for tab in tabs: - if tab.text == tab_name: - tab.click() - return + self.driver.get(main_url + tab_name) except: + print("Tab cannot be found.") return + def get_overview(self): try: - self.click_on_tab('About') + self.click_on_tab('about') self.wait_for_el( 'section.artdeco-card.p4.mb3') return self.driver.find_element_by_css_selector( @@ -81,7 +80,7 @@ def get_overview(self): def get_life(self): try: - self.click_on_tab('Life') + self.click_on_tab('life') self.wait_for_el( 'a[data-control-name="page_member_main_nav_life_tab"].active') return self.driver.find_element_by_css_selector('.org-life').get_attribute('outerHTML') @@ -90,7 +89,7 @@ def get_life(self): def get_jobs(self): try: - self.click_on_tab('Jobs') + self.click_on_tab('jobs') self.wait_for_el( 'a.link-without-hover-visited.mt5.ember-view') self.driver.execute_script( @@ -130,15 +129,15 @@ def click_on_job(): else: job_html += click_on_job() - with open('output.html', 'w', encoding = "utf-8") as output: - output.write(str(job_html)) + # with open('output.html', 'w', encoding = "utf-8") as output: + # output.write(str(job_html)) return job_html except: return '' def get_insights(self): try: - self.click_on_tab('Home') + self.click_on_tab('home') self.wait_for_el( 'a[data-control-name="page_member_main_nav_insights_tab"].active') return self.driver.find_element_by_css_selector( @@ -148,13 +147,9 @@ def get_insights(self): def get_people(self): try: - self.click_on_tab('People') + self.click_on_tab('people') self.wait_for_el( 'div.artdeco-card.pv5.pl5.pr1.mt4') - - - - stats = self.driver.find_elements_by_css_selector( '.artdeco-carousel__item-container') stats_container = '' @@ -178,8 +173,8 @@ def get_people(self): people_container = self.driver.find_element_by_css_selector( '.artdeco-card.pv5.pl5.pr1.mt4').get_attribute('outerHTML') - with open('new_out.html','w',encoding = "utf-8") as out: - out.write(str(stats_container + people_container)) + # with open('new_out.html','w',encoding = "utf-8") as out: + # out.write(str(stats_container + people_container)) return stats_container + people_container except: return '' \ No newline at end of file diff --git a/scrape_linkedin/cli.py b/scrape_linkedin/cli.py index d03b15e..2318167 100644 --- a/scrape_linkedin/cli.py +++ b/scrape_linkedin/cli.py @@ -91,7 +91,7 @@ def scrape(url, user, company, attribute, input_file, headless, output_file, dri if output_file: with open(output_file, 'w', encoding='utf-8') as outfile: # outfile.write(str(output)) - json.dump(output, outfile) + json.dump(output, outfile, ensure_ascii=False) else: pprint(output)