Skip to content

Commit 7a9edfe

Browse files
committed
scraping
1 parent 7d59790 commit 7a9edfe

File tree

4 files changed

+148
-0
lines changed
  • __scraping__
    • fda.gov - pandas
    • flashscore.com (2) - selenium, BS
    • fussballdaten.de - selenium, SVG
    • investagrams.com - requests, JSON

4 files changed

+148
-0
lines changed

__scraping__/fda.gov - pandas/main.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.05.28
#
# title: HTTP Error 400 Bad request calling api with python
# url: https://stackoverflow.com/questions/67730460/http-error-400-bad-request-calling-api-with-python/67740975#67740975

# Based on [documentation](https://open.fda.gov/apis/query-parameters/)
# you should use `skip` instead of `limit` - and use always `limit=100`
# like `limit=100&skip=0`, `limit=100&skip=100`, `limit=100&skip=200`, `limit=100&skip=300`, etc.

import pandas as pd

limit = 100
# three placeholders: dea_schedule value, page size (limit), page offset (skip)
url = 'https://api.fda.gov/drug/ndc.json?search=dea_schedule:"{}"&limit={}&skip={}'


def build_query(schedule, limit, skip):
    """Return the full openFDA NDC query URL for one page of results.

    schedule: DEA schedule string (e.g. 'CII'), limit: page size,
    skip: record offset of the page.
    """
    return url.format(schedule, limit, skip)


def main():
    """Fetch every CII page (100 records each) and print the collected results."""
    all_data_df = []

    # 2321 total matching records, so step through offsets 0, 100, ..., 2300
    for skip in range(0, 2321, 100):
        # BUG FIX: original passed the undefined name `limitskip`;
        # the URL template needs both `limit` and `skip` separately.
        query = build_query('CII', limit, skip)
        print('query:', query)
        data = pd.read_json(query, orient='values', typ='series', convert_dates=False)
        # keep only the `results` list from the API response envelope
        data = data['results']
        all_data_df.append(data)

    print(all_data_df)


if __name__ == '__main__':
    main()
__scraping__/flashscore.com (2) - selenium, BS/main.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.05.29
#
# title: Scraping a section of webpage based on text
# url: https://stackoverflow.com/questions/67754320/scraping-a-section-of-webpage-based-on-text/67756231#67756231

import selenium.webdriver
from bs4 import BeautifulSoup as BS
import time

url = 'https://www.flashscore.com/football/chile/primera-division/'

driver = selenium.webdriver.Firefox()
driver.get(url)

# give the page's JavaScript a moment to render the live table
time.sleep(5)

page = BS(driver.page_source, 'html.parser')


def show_match_ids(container):
    # every match row carries this exact tooltip; its `id` attribute
    # identifies the match
    for row in container.find_all('div', title='Click for match detail!'):
        print(row.get('id'))


print('--- version 1 ---')

# reach the section through the wrapping `live-table` div
show_match_ids(page.find('div', id='live-table').find('section'))

print('--- version 2 ---')

# reach the same section directly via its own class
show_match_ids(page.find('section', class_='event--live'))
__scraping__/fussballdaten.de - selenium, SVG/main.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.04.26
# https://stackoverflow.com/questions/67254893/scraping-text-of-class-with-selenium-and-with-whitespaces-between-different-text/

# Demonstrates several ways to read the <text> nodes inside an SVG group
# (class "tore-dots") on fussballdaten.de.
# NOTE(review): uses the Selenium 3 `find_element_by_*` API, which was
# removed in Selenium 4 — pin selenium<4 or port to `find_element(By...)`.

from selenium import webdriver

#driver = webdriver.Firefox()
#driver = webdriver.Chrome()
driver = webdriver.Edge()

driver.get('https://www.fussballdaten.de/vereine/fc-bayern-muenchen/2019/')

# close popup window with cookie-consent message
driver.find_element_by_xpath('//button[@aria-label="Einwilligen"]').click()

print('--- FIND ---')

dots_graph = driver.find_element_by_class_name("tore-dots")
all_items = dots_graph.find_elements_by_tag_name("text")

dot_vals = [item.text for item in all_items]
print(dot_vals)

print('--- XPATH 1 ---')

# doesn't work with `g` and `text` - maybe because it is inside `<SVG>`
# (SVG elements live in their own XML namespace, so a plain `text` tag
# test in XPath matches nothing)
all_items = driver.find_elements_by_xpath('//g[@class="tore-dots"]//text')

dot_vals = [item.text for item in all_items]
print(dot_vals)

# BUG FIX: the next two section labels were swapped relative to the
# XPath functions they demonstrate (`local-name()` vs `name()`).
print('--- XPATH (*, local-name) ---')

# local-name() ignores the namespace prefix, so it matches SVG <text>
all_items = driver.find_elements_by_xpath('//*[@class="tore-dots"]//*[local-name()="text"]')

dot_vals = [item.text for item in all_items]
print(dot_vals)

print('--- XPATH (*, name) ---')

all_items = driver.find_elements_by_xpath('//*[@class="tore-dots"]//*[name()="text"]')

dot_vals = [item.text for item in all_items]
print(dot_vals)

print('--- CSS ---')

# CSS selectors are namespace-agnostic here, so this also works for SVG
all_items = driver.find_elements_by_css_selector('.tore-dots text')

dot_vals = [item.text for item in all_items]
print(dot_vals)
__scraping__/investagrams.com - requests, JSON/main.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.05.29
#
# title: Web scraping with bs4 does not return number value
# url: https://stackoverflow.com/questions/67751314/web-scraping-with-bs4-does-not-return-number-value/67751732#67751732

# Fetches stock data for code `ac` straight from the investagrams JSON API
# and prints the fields of the latest stock history entry.

import requests

# presumably the API rejects requests without a browser-like
# User-Agent and a matching Referer — confirm against the site
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.investagrams.com/'
}

params = {
    'stockCode': 'ac',
    'defaultExchangeType': '1',
    # `cv` looks like a cache-buster token copied from browser traffic —
    # TODO confirm whether it can be omitted or must be refreshed
    'cv': '1622292000-0-',
}

url = 'https://webapi.investagrams.com/InvestaApi/Stock/ViewStock'
r = requests.get(url, params=params, headers=headers)

# FIX: fail loudly on HTTP errors (4xx/5xx) instead of crashing later
# with a cryptic JSON decode error inside r.json()
r.raise_for_status()

data = r.json()
print('Open:', data['LatestStockHistory']['Open'])

for key, value in data['LatestStockHistory'].items():
    print(key, '=', value)

0 commit comments

Comments
 (0)