From addc1b8715fa67be81c2e24db61b44851494d72a Mon Sep 17 00:00:00 2001
From: Anilabha Datta <anilabhadatta@gmail.com>
Date: Tue, 24 Aug 2021 15:33:20 +0530
Subject: [PATCH 1/2] added jobs and people in company_scraper

---
 scrape_linkedin/Company.py        | 100 +++++++++++++++++++---
 scrape_linkedin/CompanyScraper.py | 132 ++++++++++++++++++++++++------
 scrape_linkedin/cli.py            |   3 +-
 3 files changed, 198 insertions(+), 37 deletions(-)

diff --git a/scrape_linkedin/Company.py b/scrape_linkedin/Company.py
index 26f1b6f..e716e34 100644
--- a/scrape_linkedin/Company.py
+++ b/scrape_linkedin/Company.py
@@ -12,15 +12,16 @@
 class Company(ResultsObject):
     """Linkedin User Profile Object"""
 
-    attributes = ['overview', 'jobs', 'life', 'insights']
+    attributes = ['overview', 'jobs', 'life', 'insights', 'people']
     # KD adds insights attribute
 
-    def __init__(self, overview, jobs, life, insights):
+    def __init__(self, overview, jobs, life, insights, people):
         # KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work
         self.overview_soup = BeautifulSoup(overview, 'html.parser')
         self.jobs_soup = BeautifulSoup(jobs, 'html.parser')
         self.life_soup = BeautifulSoup(life, 'html.parser')
         self.insights_soup = BeautifulSoup(insights, 'html.parser')
+        self.people_soup = BeautifulSoup(people, 'html.parser')
         # KD adds insights soup
 
     @property
@@ -29,25 +30,25 @@ def overview(self):
 
         # Banner containing company Name + Location
         banner = one_or_default(
-            self.overview_soup, '.org-top-card')
-
+            self.overview_soup, 'section.org-top-card')
+        
         # Main container with company overview info
         container = one_or_default(
-            self.overview_soup, '.org-grid__core-rail--wide')
-
+            self.overview_soup, 'section.artdeco-card.p4.mb3')
+        
         overview = {}
         overview['description'] = container.select_one(
             'section > p').get_text().strip()
-
+        
         metadata_keys = container.select('.org-page-details__definition-term')
-        print(metadata_keys)
+        # print(metadata_keys)
         metadata_keys = [
             x for x in metadata_keys if "Company size" not in x.get_text()]
-        print(metadata_keys)
+        # print(metadata_keys)
         metadata_values = container.select(
             '.org-page-details__definition-text')
         overview.update(
-            get_info(banner, {'name': '.org-top-card-summary__title'}))  # A fix to the name selector
+            get_info(banner, {'name': '.t-24.t-black.t-bold'}))  # A fix to the name selector
         overview.update(
             get_info(container, {'company_size': '.org-about-company-module__company-size-definition-text'}))  # Manually added Company size
 
@@ -56,10 +57,10 @@ def overview(self):
             dict_val = val.get_text().strip()
             if "company_size" not in dict_key:
                 overview[dict_key] = dict_val
-        print(overview)
+        # print(overview)
 
         all_employees_links = all_or_default(
-            banner, '.mt2 > a > span')  # A fix to locate "See all ### employees on LinkedIn"
+            banner, '.mt1 > div > a:nth-of-type(2) > span')  # A fix to locate "See all ### employees on LinkedIn"
 
         if all_employees_links:
             all_employees_text = all_employees_links[-1].text
@@ -80,7 +81,51 @@ def overview(self):
 
     @property
     def jobs(self):
-        return None
+        """Return a dict mapping each scraped job title to its details.
+
+        Each value has company, location, remote, posted, applicants,
+        job_type, employees and recruiting; a field missing from the card
+        is None. Cards without a title are skipped, and cards sharing a
+        title overwrite each other (last one wins).
+        """
+        def text_of(node, selector):
+            # Stripped text of the first match, or None on a miss.
+            found = node.select_one(selector) if node is not None else None
+            return found.get_text().strip() if found is not None else None
+
+        jobs = {}
+        containers = self.jobs_soup.select(
+            'div.jobs-unified-top-card__content--two-pane')
+        for container in containers:
+            role = text_of(container, 'h2.t-24.t-bold')
+            if role is None:
+                continue
+            # The grouping span may be absent; text_of() tolerates None
+            # where the previous code raised AttributeError.
+            l_r_container = container.select_one(
+                'span.jobs-unified-top-card__subtitle-primary-grouping')
+            job_details = [
+                span.get_text().strip() for span in container.select(
+                    'div.jobs-unified-top-card__job-insight > span')]
+            recruiting, job_type, employees = None, None, None
+            if len(job_details) > 1:
+                job_type, employees = job_details[0], job_details[1]
+            if len(job_details) > 3:
+                recruiting = job_details[3]
+            jobs[role] = {
+                "company": text_of(
+                    container, 'a.ember-view.t-black.t-normal'),
+                "location": text_of(l_r_container, 'span:nth-of-type(2)'),
+                "remote": text_of(l_r_container, 'span:nth-of-type(3)'),
+                "posted": text_of(
+                    container, 'span.jobs-unified-top-card__posted-date'),
+                "applicants": text_of(
+                    container,
+                    'span.jobs-unified-top-card__applicant-count'),
+                "job_type": job_type,
+                "employees": employees,
+                "recruiting": recruiting,
+            }
+        return jobs
 
     @property
     def life(self):
@@ -103,3 +148,32 @@ def insights(self):
 
         }))
         return insights
+    
+    @property
+    def people(self):
+        """Return carousel stats and suggested people from the people tab.
+
+        'Stats' maps each heading to its bar-graph texts;
+        'People_you_may_know' maps a profile title to its description or
+        None. Nodes without a heading or title are skipped, not raised on.
+        """
+        people = {'Stats': {}, 'People_you_may_know': {}}
+        for container in self.people_soup.select(
+                'div.artdeco-carousel__item-container'):
+            heading = container.select_one('div > div > div > h4')
+            if heading is None:
+                continue
+            people['Stats'][heading.get_text().strip()] = [
+                element.get_text().strip() for element in container.select(
+                    '.org-people-bar-graph-element__percentage-bar-info')]
+        for container in self.people_soup.select(
+                'div.org-people-profile-card__profile-info'):
+            name = container.select_one(
+                'div.org-people-profile-card__profile-title')
+            if name is None:
+                continue
+            info = container.select_one(
+                'div.lt-line-clamp.lt-line-clamp--multi-line')
+            people['People_you_may_know'][name.get_text().strip()] = (
+                info.get_text().strip() if info is not None else None)
+        return people
diff --git a/scrape_linkedin/CompanyScraper.py b/scrape_linkedin/CompanyScraper.py
index 9013c38..433d42c 100644
--- a/scrape_linkedin/CompanyScraper.py
+++ b/scrape_linkedin/CompanyScraper.py
@@ -9,26 +9,31 @@
 from .Scraper import Scraper
 from .utils import AnyEC
 
+import time
+
 logger = logging.getLogger(__name__)
 
 
 class CompanyScraper(Scraper):
-    def scrape(self, company, overview=True, jobs=False, life=False, insights=False):
+    def scrape(self, company, overview=True, jobs=True, life=False, insights=False, people=True):
         # Get Overview
         self.load_initial(company)
 
-        jobs_html = life_html = insights_html = overview_html = ''
+        jobs_html = life_html = insights_html = overview_html = people_html = ''
 
         if overview:
             overview_html = self.get_overview()
         if life:
             life_html = self.get_life()
-        if jobs:
-            jobs_html = self.get_jobs()
         if insights:
             insights_html = self.get_insights()
+        if people:
+            people_html = self.get_people()
+        if jobs:
+            jobs_html = self.get_jobs()
+
         #print("JOBS", jobs_html, "\n\n\n\n\nLIFE", life_html)
-        return Company(overview_html, jobs_html, life_html, insights_html)
+        return Company(overview_html, jobs_html, life_html, insights_html, people_html)
 
     def load_initial(self, company):
         url = 'https://www.linkedin.com/company/{}'.format(company)
@@ -52,24 +57,31 @@ def load_initial(self, company):
         except:
             raise ValueError(
                 'Company Unavailable: Company link does not match any companies on LinkedIn')
-
+    
+    def click_on_tab(self, tab_name):
+        try:
+            tabs = self.driver.find_elements_by_css_selector(
+                'li.org-page-navigation__item.m0 > a')
+            for tab in tabs:
+                if tab.text == tab_name:
+                    tab.click()
+                    return
+        except:
+            return
+    
     def get_overview(self):
         try:
-            tab_link = self.driver.find_element_by_css_selector(
-                'a[data-control-name="page_member_main_nav_about_tab"]')
-            tab_link.click()
+            self.click_on_tab('About')
             self.wait_for_el(
-                'a[data-control-name="page_member_main_nav_about_tab"].active')
+                'section.artdeco-card.p4.mb3')
             return self.driver.find_element_by_css_selector(
-                '.organization-outlet').get_attribute('outerHTML')
+                'div.scaffold-layout__row.scaffold-layout__content').get_attribute('outerHTML')
         except:
             return ''
 
     def get_life(self):
         try:
-            tab_link = self.driver.find_element_by_css_selector(
-                'a[data-control-name="page_member_main_nav_life_tab"]')
-            tab_link.click()
+            self.click_on_tab('Life')
             self.wait_for_el(
                 'a[data-control-name="page_member_main_nav_life_tab"].active')
             return self.driver.find_element_by_css_selector('.org-life').get_attribute('outerHTML')
@@ -78,22 +90,96 @@ def get_life(self):
 
     def get_jobs(self):
         try:
-            tab_link = self.driver.find_element_by_css_selector(
-                'a[data-control-name="page_member_main_nav_jobs_tab"]')
-            tab_link.click()
+            self.click_on_tab('Jobs')
             self.wait_for_el(
-                'a[data-control-name="page_member_main_nav_jobs_tab"].active')
-            return self.driver.find_element_by_css_selector('.org-jobs-container').get_attribute('outerHTML')
+                'a.link-without-hover-visited.mt5.ember-view')
+            self.driver.execute_script(
+                "document.getElementsByClassName('link-without-hover-visited mt5 ember-view')[0].click()")
+            time.sleep(5)
+            job_html = ''
+            
+            def click_on_job():
+                html = ''
+                containers = self.driver.find_elements_by_css_selector(
+                    'li.jobs-search-results__list-item.occludable-update.p0.relative')
+                for container in containers:
+                    self.driver.execute_script(
+                        "arguments[0].scrollIntoView(true);", container)
+                    container.find_element_by_css_selector(
+                        'div.job-card-container.relative').click()
+                    time.sleep(1)
+                    self.wait_for_el(
+                        'div.jobs-unified-top-card__job-insight')
+                    html += self.driver.find_element_by_css_selector(
+                        'div.jobs-unified-top-card__content--two-pane').get_attribute('outerHTML')
+                return html
+
+            buttons = self.driver.find_elements_by_css_selector(
+                'li.artdeco-pagination__indicator.artdeco-pagination__indicator--number')
+            if buttons:
+                last_button = int(buttons[-1].get_attribute('data-test-pagination-page-btn'))
+                for page in range(1, last_button + 1):
+                    job_html += click_on_job()
+                    buttons = self.driver.find_elements_by_css_selector(
+                        'li.artdeco-pagination__indicator.artdeco-pagination__indicator--number > button')
+                    for next_page in buttons:
+                        if int(next_page.get_attribute('aria-label').split()[1]) == page + 1:
+                            next_page.click()
+                            time.sleep(2)
+                            break
+            else:
+                job_html += click_on_job()
+
+            with open('output.html', 'w', encoding = "utf-8") as output:
+                output.write(str(job_html))
+            return job_html
         except:
             return ''
 
     def get_insights(self):
         try:
-            tab_link = self.driver.find_element_by_css_selector(
-                'a[data-control-name="page_member_main_nav_insights_tab"]')
-            tab_link.click()
+            self.click_on_tab('Home')
             self.wait_for_el(
                 'a[data-control-name="page_member_main_nav_insights_tab"].active')
-            return self.driver.find_element_by_css_selector('.org-premium-insights-module').get_attribute('outerHTML')
+            return self.driver.find_element_by_css_selector(
+                '.org-premium-insights-module').get_attribute('outerHTML')
         except:
             return ''
+
+    def get_people(self):
+        try:
+            self.click_on_tab('People')
+            self.wait_for_el(
+                'div.artdeco-card.pv5.pl5.pr1.mt4')
+            
+            
+                
+            
+            stats = self.driver.find_elements_by_css_selector(
+                '.artdeco-carousel__item-container')
+            stats_container = ''
+            for index in range(0, len(stats), 2):
+                stats_container += stats[index].get_attribute('outerHTML')
+                if index == len(stats)-1:
+                    break
+                stats_container += stats[index+1].get_attribute('outerHTML')
+                self.driver.find_element_by_css_selector(
+                    'button.artdeco-pagination__button.artdeco-pagination__button--next').click()
+                time.sleep(1)
+            
+            for _ in range(100):
+                height = self.driver.execute_script('return document.body.scrollHeight')
+                scroll = 'window.scrollTo(0, ' + str(height) + ');'
+                self.driver.execute_script(scroll)
+                time.sleep(1)
+                scroll_2 = 'window.scrollTo(0, ' + str(height-100) + ');'
+                self.driver.execute_script(scroll_2)
+                time.sleep(1)
+            people_container = self.driver.find_element_by_css_selector(
+                '.artdeco-card.pv5.pl5.pr1.mt4').get_attribute('outerHTML')
+            
+            with open('new_out.html','w',encoding = "utf-8") as out:
+                out.write(str(stats_container + people_container))
+            return stats_container + people_container
+        except:
+            return ''
\ No newline at end of file
diff --git a/scrape_linkedin/cli.py b/scrape_linkedin/cli.py
index ff37253..d03b15e 100644
--- a/scrape_linkedin/cli.py
+++ b/scrape_linkedin/cli.py
@@ -89,7 +89,8 @@ def scrape(url, user, company, attribute, input_file, headless, output_file, dri
         output = profile.to_dict()
 
     if output_file:
-        with open(output_file, 'w') as outfile:
+        with open(output_file, 'w', encoding='utf-8') as outfile:
+            # outfile.write(str(output))
             json.dump(output, outfile)
     else:
         pprint(output)

From e94d7334b234d3b545fd4df58d5d14e5e4ec3617 Mon Sep 17 00:00:00 2001
From: Anilabha Datta <anilabhadatta@gmail.com>
Date: Tue, 31 Aug 2021 17:43:18 +0530
Subject: [PATCH 2/2] supports all languages and fixed json output of cyrillic
 characters

---
 scrape_linkedin/CompanyScraper.py | 33 +++++++++++++------------------
 scrape_linkedin/cli.py            |  2 +-
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/scrape_linkedin/CompanyScraper.py b/scrape_linkedin/CompanyScraper.py
index 433d42c..a7a03b8 100644
--- a/scrape_linkedin/CompanyScraper.py
+++ b/scrape_linkedin/CompanyScraper.py
@@ -18,6 +18,7 @@ class CompanyScraper(Scraper):
     def scrape(self, company, overview=True, jobs=True, life=False, insights=False, people=True):
         # Get Overview
         self.load_initial(company)
+        self.company = company
 
         jobs_html = life_html = insights_html = overview_html = people_html = ''
 
@@ -59,19 +60,17 @@ def load_initial(self, company):
                 'Company Unavailable: Company link does not match any companies on LinkedIn')
     
     def click_on_tab(self, tab_name):
+        main_url = "https://www.linkedin.com/company/{}/".format(self.company)
         try:
-            tabs = self.driver.find_elements_by_css_selector(
-                'li.org-page-navigation__item.m0 > a')
-            for tab in tabs:
-                if tab.text == tab_name:
-                    tab.click()
-                    return
+            self.driver.get(main_url + tab_name)
         except:
+            print("Tab cannot be found.")
             return
+
     
     def get_overview(self):
         try:
-            self.click_on_tab('About')
+            self.click_on_tab('about')
             self.wait_for_el(
                 'section.artdeco-card.p4.mb3')
             return self.driver.find_element_by_css_selector(
@@ -81,7 +80,7 @@ def get_overview(self):
 
     def get_life(self):
         try:
-            self.click_on_tab('Life')
+            self.click_on_tab('life')
             self.wait_for_el(
                 'a[data-control-name="page_member_main_nav_life_tab"].active')
             return self.driver.find_element_by_css_selector('.org-life').get_attribute('outerHTML')
@@ -90,7 +89,7 @@ def get_life(self):
 
     def get_jobs(self):
         try:
-            self.click_on_tab('Jobs')
+            self.click_on_tab('jobs')
             self.wait_for_el(
                 'a.link-without-hover-visited.mt5.ember-view')
             self.driver.execute_script(
@@ -130,15 +129,15 @@ def click_on_job():
             else:
                 job_html += click_on_job()
 
-            with open('output.html', 'w', encoding = "utf-8") as output:
-                output.write(str(job_html))
+            # with open('output.html', 'w', encoding = "utf-8") as output:
+            #     output.write(str(job_html))
             return job_html
         except:
             return ''
 
     def get_insights(self):
         try:
-            self.click_on_tab('Home')
+            self.click_on_tab('home')
             self.wait_for_el(
                 'a[data-control-name="page_member_main_nav_insights_tab"].active')
             return self.driver.find_element_by_css_selector(
@@ -148,13 +147,9 @@ def get_insights(self):
 
     def get_people(self):
         try:
-            self.click_on_tab('People')
+            self.click_on_tab('people')
             self.wait_for_el(
                 'div.artdeco-card.pv5.pl5.pr1.mt4')
-            
-            
-                
-            
             stats = self.driver.find_elements_by_css_selector(
                 '.artdeco-carousel__item-container')
             stats_container = ''
@@ -178,8 +173,8 @@ def get_people(self):
             people_container = self.driver.find_element_by_css_selector(
                 '.artdeco-card.pv5.pl5.pr1.mt4').get_attribute('outerHTML')
             
-            with open('new_out.html','w',encoding = "utf-8") as out:
-                out.write(str(stats_container + people_container))
+            # with open('new_out.html','w',encoding = "utf-8") as out:
+            #     out.write(str(stats_container + people_container))
             return stats_container + people_container
         except:
             return ''
\ No newline at end of file
diff --git a/scrape_linkedin/cli.py b/scrape_linkedin/cli.py
index d03b15e..2318167 100644
--- a/scrape_linkedin/cli.py
+++ b/scrape_linkedin/cli.py
@@ -91,7 +91,7 @@ def scrape(url, user, company, attribute, input_file, headless, output_file, dri
     if output_file:
         with open(output_file, 'w', encoding='utf-8') as outfile:
             # outfile.write(str(output))
-            json.dump(output, outfile)
+            json.dump(output, outfile, ensure_ascii=False)
     else:
         pprint(output)