Skip to content

Commit 7a9edfe

Browse files
committed
scraping
1 parent 7d59790 commit 7a9edfe

File tree

4 files changed

+148
-0
lines changed
  • __scraping__
    • fda.gov - pandas
    • flashscore.com (2) - selenium, BS
    • fussballdaten.de - selenium, SVG
    • investagrams.com - requests, JSON

4 files changed

+148
-0
lines changed

__scraping__/fda.gov - pandas/main.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.05.28
#
# title: HTTP Error 400 Bad request calling api with python
# url: https://stackoverflow.com/questions/67730460/http-error-400-bad-request-calling-api-with-python/67740975#67740975

# Based on [documentation](https://open.fda.gov/apis/query-parameters/)
# you should use `skip` instead of `limit` - and use always `limit=100`
# like `limit=100&skip=0`, `limit=100&skip=100`, `limit=100&skip=200`, `limit=100&skip=300`, etc.

import pandas as pd

limit = 100
# three placeholders: dea_schedule value, page size (limit), page offset (skip)
url = 'https://api.fda.gov/drug/ndc.json?search=dea_schedule:"{}"&limit={}&skip={}'


def build_query(schedule, limit, skip):
    """Return the full openFDA NDC query URL for one page of results.

    schedule: DEA schedule string (e.g. 'CII'), limit: page size,
    skip: record offset of the page.
    """
    return url.format(schedule, limit, skip)


def main():
    """Fetch every CII page (100 records each) and print the collected results."""
    all_data_df = []

    # 2321 total matching records, so step through offsets 0, 100, ..., 2300
    for skip in range(0, 2321, 100):
        # BUG FIX: original passed the undefined name `limitskip`;
        # the URL template needs both `limit` and `skip` separately.
        query = build_query('CII', limit, skip)
        print('query:', query)
        data = pd.read_json(query, orient='values', typ='series', convert_dates=False)
        # keep only the `results` list from the API response envelope
        data = data['results']
        all_data_df.append(data)

    print(all_data_df)


if __name__ == '__main__':
    main()
__scraping__/flashscore.com (2) - selenium, BS/main.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.05.29
#
# title: Scraping a section of webpage based on text
# url: https://stackoverflow.com/questions/67754320/scraping-a-section-of-webpage-based-on-text/67756231#67756231

import selenium.webdriver
from bs4 import BeautifulSoup as BS
import time

url = 'https://www.flashscore.com/football/chile/primera-division/'

driver = selenium.webdriver.Firefox()
driver.get(url)

# give the page's JavaScript a moment to render the live table
time.sleep(5)

page = BS(driver.page_source, 'html.parser')


def show_match_ids(container):
    # every match row carries this exact tooltip; its `id` attribute
    # identifies the match
    for row in container.find_all('div', title='Click for match detail!'):
        print(row.get('id'))


print('--- version 1 ---')

# reach the section through the wrapping `live-table` div
show_match_ids(page.find('div', id='live-table').find('section'))

print('--- version 2 ---')

# reach the same section directly via its own class
show_match_ids(page.find('section', class_='event--live'))
__scraping__/fussballdaten.de - selenium, SVG/main.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.04.26
# https://stackoverflow.com/questions/67254893/scraping-text-of-class-with-selenium-and-with-whitespaces-between-different-text/

# Demonstrates several ways to read the <text> nodes inside an SVG group
# (class "tore-dots") on fussballdaten.de.
# NOTE(review): uses the Selenium 3 `find_element_by_*` API, which was
# removed in Selenium 4 — pin selenium<4 or port to `find_element(By...)`.

from selenium import webdriver

#driver = webdriver.Firefox()
#driver = webdriver.Chrome()
driver = webdriver.Edge()

driver.get('https://www.fussballdaten.de/vereine/fc-bayern-muenchen/2019/')

# close popup window with cookie-consent message
driver.find_element_by_xpath('//button[@aria-label="Einwilligen"]').click()

print('--- FIND ---')

dots_graph = driver.find_element_by_class_name("tore-dots")
all_items = dots_graph.find_elements_by_tag_name("text")

dot_vals = [item.text for item in all_items]
print(dot_vals)

print('--- XPATH 1 ---')

# doesn't work with `g` and `text` - maybe because it is inside `<SVG>`
# (SVG elements live in their own XML namespace, so a plain `text` tag
# test in XPath matches nothing)
all_items = driver.find_elements_by_xpath('//g[@class="tore-dots"]//text')

dot_vals = [item.text for item in all_items]
print(dot_vals)

# BUG FIX: the next two section labels were swapped relative to the
# XPath functions they demonstrate (`local-name()` vs `name()`).
print('--- XPATH (*, local-name) ---')

# local-name() ignores the namespace prefix, so it matches SVG <text>
all_items = driver.find_elements_by_xpath('//*[@class="tore-dots"]//*[local-name()="text"]')

dot_vals = [item.text for item in all_items]
print(dot_vals)

print('--- XPATH (*, name) ---')

all_items = driver.find_elements_by_xpath('//*[@class="tore-dots"]//*[name()="text"]')

dot_vals = [item.text for item in all_items]
print(dot_vals)

print('--- CSS ---')

# CSS selectors are namespace-agnostic here, so this also works for SVG
all_items = driver.find_elements_by_css_selector('.tore-dots text')

dot_vals = [item.text for item in all_items]
print(dot_vals)
__scraping__/investagrams.com - requests, JSON/main.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.05.29
#
# title: Web scraping with bs4 does not return number value
# url: https://stackoverflow.com/questions/67751314/web-scraping-with-bs4-does-not-return-number-value/67751732#67751732

# Fetches stock data for code `ac` straight from the investagrams JSON API
# and prints the fields of the latest stock history entry.

import requests

# presumably the API rejects requests without a browser-like
# User-Agent and a matching Referer — confirm against the site
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.investagrams.com/'
}

params = {
    'stockCode': 'ac',
    'defaultExchangeType': '1',
    # `cv` looks like a cache-buster token copied from browser traffic —
    # TODO confirm whether it can be omitted or must be refreshed
    'cv': '1622292000-0-',
}

url = 'https://webapi.investagrams.com/InvestaApi/Stock/ViewStock'
r = requests.get(url, params=params, headers=headers)

# FIX: fail loudly on HTTP errors (4xx/5xx) instead of crashing later
# with a cryptic JSON decode error inside r.json()
r.raise_for_status()

data = r.json()
print('Open:', data['LatestStockHistory']['Open'])

for key, value in data['LatestStockHistory'].items():
    print(key, '=', value)

0 commit comments

Comments
 (0)