
Commit 0259b6d

scraping
1 parent dd932fd commit 0259b6d

6 files changed: +183 additions, -3 deletions

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.15
#
# title: Trouble outputting data with Scrapy
# url: https://stackoverflow.com/questions/68386890/trouble-outputting-data-with-scrapy/68387811#68387811


import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)

            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()

        category = response.xpath('//p[@class="in-category"]//a/text()').extract()

        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()

        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }


from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.23
#
# title: How to properly extract data that returns None in Python Beautifulsoup
# url: https://stackoverflow.com/questions/68493228/how-to-properly-extract-data-that-returns-none-in-python-beautifulsoup/68493491?noredirect=1#comment121049699_68493491

from bs4 import BeautifulSoup
from urllib import request
from urllib.request import Request, urlopen


url = 'https://bscscan.com/tx/0x1b6f00c8cd99e0daac5718c743ef9a51af40f95feae23bf29960ae1f66a1cff7'
#url = 'https://bscscan.com/tx/0xc54d83b870a1b4159f12bff092c8a24dfa045e133b07d3a3a41898293ac86c71'
headers = {'User-Agent': 'Mozilla/5.0'}

req = Request(url, headers=headers)
html = urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')

val = soup.find('span', class_='u-label u-label--value u-label--secondary text-dark rounded mr-1').text
transfee = soup.find('span', id='ContentPlaceHolder1_spanTxFee').text
fromaddr = soup.find('span', id='spanFromAdd').text
token = soup.find('span', class_='hash-tag text-truncate hash-tag-custom-from tooltip-address').text

print("From: ", fromaddr)
print("Value: ", val)
print("Transaction Fee: ", transfee)
print("Tokens:")

main_data = soup.select("ul#wrapperContent div.media-body")

for item in main_data:
    all_span = item.find_all("span", class_='mr-1')
    #for number, span in enumerate(all_span):
    #    print(number, span.get_text(strip=True))
    last_span = all_span[-1]

    all_a = item.find_all("a")
    last_a = all_a[-1]

    print("{:>35} | {:18} | https://bscscan.com{}".format(last_span.get_text(strip=True), last_a.get_text(strip=True), last_a['href']))
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.23
#
# title: How to iterate through web table with Selenium?
# url: https://stackoverflow.com/questions/68493382/how-to-iterate-through-web-table-with-selenium/68494220#68494220

# DOESN'T WORK WITH FIREFOX BECAUSE SERVER SHOWS CAPTCHA


from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

#paths
#PATH = "C:/Program Files (x86)\chromedriver.exe"
#driver = webdriver.Chrome(PATH)
driver = webdriver.Chrome()
#driver = webdriver.Firefox()

url = "https://www.crunchbase.com/search/organizations/field/organization.companies/categories/electric-vehicle"
driver.get(url)
driver.maximize_window()
time.sleep(5)

print('title:', driver.title)

WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located(
        (By.XPATH, '//grid-body//identifier-formatter/a/div/div')
    )
)

all_rows = driver.find_elements_by_css_selector("grid-row")

all_companies = []

for row in all_rows:
    company = {
        'name': row.find_element_by_xpath('.//*[@class="identifier-label"]').text.strip(),
        'industry': row.find_element_by_xpath('.//*[@data-columnid="categories"]//span').text.strip(),
        'hq': row.find_element_by_xpath('.//*[@data-columnid="location_identifiers"]//span').text.strip(),
    }
    all_companies.append(company)

#create dataframe
df = pd.DataFrame(all_companies)
print(df)
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.15
#
# title: How to scrap string that does not have unique ID for data extraction? [closed]
# url: https://stackoverflow.com/questions/68394176/how-to-scrap-string-that-does-not-have-unique-id-for-data-extraction

from bs4 import BeautifulSoup as BS
import requests

url = 'https://www.magicbricks.com/property-for-sale-in-namakkal-pppfs'
r = requests.get(url)

soup = BS(r.text, 'html.parser')

all_items = soup.find_all('span', class_='m-srp-card__title')
for item in all_items:
    print('for Sale in', item.text.split('for Sale in')[1].strip())

__scraping__/pixabay.com - requests, BS/README.md

Lines changed: 10 additions & 0 deletions
@@ -4,3 +4,13 @@
 # author: Bartłomiej "furas" Burek (https://blog.furas.pl)
 
 # https://stackoverflow.com/questions/63767927/cant-scrape-some-static-image-links-from-a-webpage-using-requests
+
+---
+
+Update: 2021.07.15
+
+The web page changed some class names and attributes, so the code needed updates:
+
+- `.results--efirA` instead of `.search_results`
+- `data-lazy-src` instead of `data-lazy`

__scraping__/pixabay.com - requests, BS/main.py

Lines changed: 6 additions & 3 deletions
@@ -2,6 +2,8 @@
 # date: 2020.09.07
 # author: Bartłomiej "furas" Burek (https://blog.furas.pl)
 # https://stackoverflow.com/questions/63767927/cant-scrape-some-static-image-links-from-a-webpage-using-requests
+#
+# update: 2021.07.15
 
 import requests
 from bs4 import BeautifulSoup
@@ -42,14 +44,15 @@
 
 soup = BeautifulSoup(r.text, "lxml")
 
-for item in soup.select(".search_results a > img[src]"):
+#for item in soup.select("[data-hid='photo_list_results'] a > img[src]"):
+for item in soup.select(".results--efirA a > img[src]"):
     src = item.get("src")
     if src is not None and 'blank.gif' not in src:
         print('src:', src)
         results.append(src)
     else:
-        src = item.get("data-lazy")
-        print('data-lazy:', src)
+        src = item.get("data-lazy-src")
+        print('data-lazy-src:', src)
         results.append(src)
 
 print('len:', len(results))
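
Since the fix above only swaps in the current attribute name, a minimal defensive sketch (not part of this commit; the search URL and request headers are assumptions) could check `src`, the new `data-lazy-src`, and the old `data-lazy` in turn, so the scraper keeps working across such renames:

# minimal sketch, not from the repository: fall back through several
# possible image attributes because pixabay.com has renamed them before
import requests
from bs4 import BeautifulSoup

# example search page; the original script may build this URL differently
r = requests.get('https://pixabay.com/images/search/dogs/',
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, 'lxml')

results = []
for img in soup.select('.results--efirA a > img'):
    # take the first attribute that holds a real image URL (skip the blank.gif placeholder)
    for attr in ('src', 'data-lazy-src', 'data-lazy'):
        src = img.get(attr)
        if src and 'blank.gif' not in src:
            results.append(src)
            break

print('len:', len(results))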
