Commit 60b7216

scraping
1 parent 0c27ff6 commit 60b7216

File tree

26 files changed: +8894 -1 lines changed

__scraping__/blockchain.info/main.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
#!/usr/bin/env python3

# date: 2020.05.18
# https://stackoverflow.com/questions/61858764/is-there-an-easy-way-to-access-all-transactions-recorded-in-a-bitcoin-block-with/
#
# https://www.blockchain.com/api/blockchain_api

import requests

r = requests.get('https://blockchain.info/block-height/100?format=json')
data = r.json()

#print(r.text)
#print(data)
print(data['blocks'][0]['hash'])
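
The linked question asks for all transactions in a block, not only its hash; a minimal follow-up sketch, assuming the block JSON lists transactions under the `tx` key as described in the Blockchain Data API docs:

# a sketch: print every transaction hash in the block
# (assumes each entry in data['blocks'][0]['tx'] carries a 'hash' field)
for tx in data['blocks'][0]['tx']:
    print(tx['hash'])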
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
#!/usr/bin/env python3

# date: 2020.05.18
# https://stackoverflow.com/questions/61876744/scraper-returns-null-result/

import requests

url = 'https://store-site-backend-static.ak.epicgames.com/freeGamesPromotions?locale=en-US&country=PL&allowCountries=PL'

r = requests.get(url)

data = r.json()

#print(r.text)

for item in data['data']['Catalog']['searchStore']['elements']:
    print(item['title'])
    offers = item['promotions']['promotionalOffers']
    for offer in offers:
        print(offer['promotionalOffers'][0]['startDate'])
        print(offer['promotionalOffers'][0]['endDate'])
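
Items without an active promotion may carry a null `promotions` field (an assumption about this API, though the question's "null result" title suggests exactly that). A defensive variant of the loop:

for item in data['data']['Catalog']['searchStore']['elements']:
    print(item['title'])
    promotions = item.get('promotions')  # assumed None for items with no offers
    if not promotions:
        continue
    for offer in promotions['promotionalOffers']:
        print(offer['promotionalOffers'][0]['startDate'])
        print(offer['promotionalOffers'][0]['endDate'])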
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

# date: 2020.05.28
#

from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# --- functions ---

def get_data(start_date, end_date, product):

    # select `Variation Report`
    driver.find_element_by_id('ctl00_MainContent_Rbl_Rpt_type_1').click()

    # select `Daily Variation`
    element_variation = driver.find_element_by_id('ctl00_MainContent_Ddl_Rpt_Option1')
    drop_variation = Select(element_variation)
    drop_variation.select_by_visible_text('Daily Variation')

    # select `product` before `date` because `end_date` opens a calendar which blocks the `product` list
    element_commodity = driver.find_element_by_id('ctl00_MainContent_Lst_Commodity')
    drop_commodity = Select(element_commodity)
    drop_commodity.select_by_visible_text(product)

    # select `start_date` and `end_date`
    driver.find_element_by_id('ctl00_MainContent_Txt_FrmDate').send_keys(start_date)
    driver.find_element_by_id('ctl00_MainContent_Txt_ToDate').send_keys(end_date)

    # click button `Get Data`
    driver.find_element_by_id('ctl00_MainContent_btn_getdata1').click()

    time.sleep(3)  # sometimes the page needs time to load

    # the table at index 2 is the one we want
    table = pd.read_html(driver.page_source)[2]

    print(len(table))
    print(table)

    # go back
    driver.find_element_by_id('btn_back').click()

    time.sleep(3)  # sometimes the page needs time to load

    return table

# --- main ---

driver = webdriver.Firefox()

driver.get('https://fcainfoweb.nic.in/Reports/Report_Menu_Web.aspx')

start_date = '01/05/2020'
end_date = '27/05/2020'

for number, product in enumerate(('Rice', 'Wheat', 'Tomato', 'Sugar')):
    table = get_data(start_date, end_date, product)
    # for the first product create the file, for other products append to the existing file
    if number == 0:
        mode = 'w'
    else:
        mode = 'a'
    # the standard engine `xlsxwriter` can't append, so I had to use `openpyxl`
    with pd.ExcelWriter('output.xlsx', engine='openpyxl', mode=mode) as writer:
        table.to_excel(writer, sheet_name=product, index=False)
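
The fixed `time.sleep(3)` pauses are fragile; a sketch replacing them with an explicit wait (assuming the `btn_back` element only appears once the report page has loaded):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the report page to render its `Back` button,
# instead of sleeping a fixed 3 seconds
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'btn_back'))
)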
45.8 KB
Binary file not shown.
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# date: 2020.05.26
# https://stackoverflow.com/questions/61994836/bs4-web-scraping-searching-div-class/

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
r = requests.get('https://www.google.com/search?q=titanic+movie', headers=headers)

soup = BeautifulSoup(r.content, 'html.parser')
item = soup.find('div', class_="srBp4 Vrkhme")
print(item.get_text(strip=True, separator=' '))
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# date: 2020.05.26
# https://stackoverflow.com/questions/61994836/bs4-web-scraping-searching-div-class/

import selenium.webdriver

url = 'https://www.google.com/search?q=titanic+movie'

driver = selenium.webdriver.Firefox()
driver.get(url)

# both classes joined with `.` (class-name lookups are translated to CSS selectors)
item = driver.find_element_by_class_name('srBp4.Vrkhme')
print(item.text.strip())
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
#!/usr/bin/env python3

# date: 2020.05.25
# https://stackoverflow.com/questions/62003463/web-scraping-hedge-fund-data-with-beautifulsoup

import selenium.webdriver
import time

url = 'https://hedgefollow.com/funds/Duquesne+Family+Office'

driver = selenium.webdriver.Firefox()
driver.get(url)

time.sleep(3)

table = driver.find_element_by_id('dgtopHolders')

print('--- headers ---')

row = table.find_elements_by_tag_name('tr')[0]
for cell in row.find_elements_by_tag_name('th'):
    print(cell.text)

print('--- data ---')

for row in table.find_elements_by_tag_name('tr')[1:]:
    for cell in row.find_elements_by_tag_name('td'):
        print(cell.text)
    print('---')
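
Since `dgtopHolders` is a plain HTML table, pandas could likely grab it in one call once Selenium has the rendered source; a sketch using `read_html`'s `attrs` filter:

import pandas as pd

# read_html returns a list of DataFrames; pick the holders table by its id
df = pd.read_html(driver.page_source, attrs={'id': 'dgtopHolders'})[0]
print(df)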
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

# date: 2020.05.24
# https://stackoverflow.com/questions/61981006/extracting-p-from-div-class-python-to-get-addresses/

import requests
from bs4 import BeautifulSoup
import urllib.parse
import csv
import time

initial_url = "https://www.lifetime.life"

response = requests.get("https://www.lifetime.life/view-all-locations.html")
soup = BeautifulSoup(response.text, 'html.parser')

with open('gyms2.csv', 'w', newline='') as gf:
    gymwriter = csv.writer(gf)
    for a in soup.findAll('a'):
        if '/life-time-locations/' in a.get('href', ''):
            gymurl = urllib.parse.urljoin(initial_url, a.get('href'))
            print(gymurl)

            response = requests.get(gymurl)
            sub_soup = BeautifulSoup(response.text, 'html.parser')

            try:
                address_line = sub_soup.find('p', {'class': 'small m-b-sm p-t-1'}).find('span', {'class': 'btn-icon-text'})

                gymrow = [gymurl, address_line.text]
                print(gymrow)
                gymwriter.writerow(gymrow)
                time.sleep(3)
            except Exception as ex:
                print(ex)
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

# date: 2020.05.24
# https://stackoverflow.com/questions/61981006/extracting-p-from-div-class-python-to-get-addresses/

import requests
from bs4 import BeautifulSoup
import urllib.parse
import csv
import time

initial_url = "https://www.lifetime.life"

response = requests.get("https://www.lifetime.life/view-all-locations.html")
soup = BeautifulSoup(response.text, 'html.parser')

with open('gyms2.csv', 'w', newline='') as gf:
    gymwriter = csv.writer(gf)
    for a in soup.findAll('a'):
        if '/life-time-locations/' in a.get('href', ''):
            gymurl = urllib.parse.urljoin(initial_url, a.get('href'))
            print(gymurl)

            response = requests.get(gymurl)
            sub_soup = BeautifulSoup(response.text, 'html.parser')

            try:
                address_line = sub_soup.select('p.small.m-b-sm.p-t-1 span.btn-icon-text')
                gymrow = [gymurl, address_line[0].text.strip()]
                print(gymrow)
                gymwriter.writerow(gymrow)
                time.sleep(3)
            except Exception as ex:
                print(ex)
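
The loop fetches dozens of location pages one by one; reusing a single pooled connection with `requests.Session` may speed this up (a sketch, same endpoints as above):

import requests

session = requests.Session()

# the pooled connection is reused for every location page
response = session.get('https://www.lifetime.life/view-all-locations.html')
# ... and later in the loop: session.get(gymurl) instead of requests.get(gymurl)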
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
#!/usr/bin/env python3

# date: 2020.05.25
# https://stackoverflow.com/questions/62000520/extracting-html-data-using-python/

import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.marketscreener.com/MICROSOFT-CORPORATION-4835/company/'

r = requests.get(url)  #, headers={'user-agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.content, 'html.parser')

all_tables = []

for table in soup.select("table table.nfvtTab"):
    table_rows = []
    for tr in table.select('tr'):
        row = []
        for td in tr.select('td'):
            #print(td)
            item = td.get_text(strip=True, separator=' ')
            #print(item)
            row.append(item)
        table_rows.append(row)
    all_tables.append(table_rows)

# add headers for nested columns

# Sales per Business
all_tables[0][0].insert(2, '2018')
all_tables[0][0].insert(4, '2019')
all_tables[0][1].insert(0, '')
all_tables[0][1].insert(5, '')

# create one row with headers
headers = [f'{a} {b}'.strip() for a, b in zip(all_tables[0][0], all_tables[0][1])]
print('new:', headers)
all_tables[0][0] = headers  # set new headers in first row
del all_tables[0][1]  # remove second row

# Sales per region
all_tables[1][0].insert(2, '2018')
all_tables[1][0].insert(4, '2019')
all_tables[1][1].insert(0, '')
all_tables[1][1].insert(5, '')

# create one row with headers
headers = [f'{a} {b}'.strip() for a, b in zip(all_tables[1][0], all_tables[1][1])]
print('new:', headers)
all_tables[1][0] = headers  # set new headers in first row
del all_tables[1][1]  # remove second row

# Equities
all_tables[3][0].insert(4, 'Free-Float %')
all_tables[3][0].insert(6, 'Company-owned shares %')

for number, table in enumerate(all_tables, 1):
    print('---', number, '---')
    for row in table:
        print(row)

for number, table in enumerate(all_tables, 1):
    with open(f'table{number}.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(table)
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
,2018 USD (in Million),2018 %,2019 USD (in Million),2019 %,Delta
More Personal Computing,"42,276",38.4%,"45,698",36.4%,+8.09%
Productivity and Business Processes,"35,865",32.6%,"41,160",32.8%,+14.76%
Intelligent Cloud,"32,219",29.2%,"38,985",31.1%,+21%
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
,2018 USD (in Million),2018 %,2019 USD (in Million),2019 %,Delta
United States,"55,926",50.8%,"64,199",51.2%,+14.79%
Other Countries,"54,434",49.4%,"61,644",49.1%,+13.25%
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
Name,Age,Since,Title
Satya Nadella,52,2014,Chief Executive Officer & Non-Independent Director
Bradford Smith,60,2015,President & Chief Legal Officer
John Thompson,69,2014,Independent Chairman
Kirk Koenigsbauer,51,2020,COO & VP-Experiences & Devices Group
Amy E. Hood,47,2013,Chief Financial Officer & Executive Vice President
James Kevin Scott,54,-,Chief Technology Officer & Executive VP
John W. Stanton,64,2014,Independent Director
Teri L. List-Stoll,57,2014,Independent Director
Charles Scharf,53,2014,Independent Director
Sandra E. Peterson,60,2015,Independent Director
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
,Vote,Quantity,Free-Float,Free-Float %,Company-owned shares,Company-owned shares %,Total Float
Stock A,1,"7,583,440,247","7,475,252,172",98.6%,0,0.0%,98.6%
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
Name,Equities,%
"The Vanguard Group, Inc.","603,109,511",7.95%
Capital Research & Management Co.,"556,573,400",7.34%
"SSgA Funds Management, Inc.","314,771,248",4.15%
Fidelity Management & Research Co.,"221,883,722",2.93%
BlackRock Fund Advisors,"183,455,207",2.42%
"T. Rowe Price Associates, Inc. (Investment Management)","172,056,401",2.27%
Capital Research & Management Co. (World Investors),"139,116,236",1.83%
Putnam LLC,"121,797,960",1.61%
Geode Capital Management LLC,"115,684,966",1.53%
Capital Research & Management Co. (International Investors),"103,523,946",1.37%
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import pandas as pd

df = pd.read_csv('table1.csv', index_col=0)  #, header=[0,1])
print(df)

df = pd.read_csv('table2.csv', index_col=0)  #, header=[0,1])
print(df)

df = pd.read_csv('table3.csv')  #, index_col=0)
print(df)

df = pd.read_csv('table4.csv', index_col=0)
print(df)

df = pd.read_csv('table5.csv')  #, index_col=0)
print(df)
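
The five reads repeat one pattern; a compact sketch of the same checks in a loop, keeping each file's `index_col` choice from above:

import pandas as pd

# per-file index_col settings, taken from the individual calls above
index_cols = {1: 0, 2: 0, 3: None, 4: 0, 5: None}

for number, index_col in index_cols.items():
    print(f'--- table{number}.csv ---')
    print(pd.read_csv(f'table{number}.csv', index_col=index_col))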

__scraping__/myntra.com - scrapy/2019.08.14/output.csv

Lines changed: 8212 additions & 0 deletions
Large diffs are not rendered by default.
