Skip to content

Commit 962e6db

Browse files
committed
__scraping__
1 parent 5fad70b commit 962e6db

File tree

2 files changed

+79
-0
lines changed

2 files changed

+79
-0
lines changed

__scraping__/bloomberg.com/main.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
# Scrape the Bloomberg company-profile page for AAPL and print the
# company name and description.
#
# NOTE: if the request succeeds, this works even with fewer headers.
# But when the page title is `Bloomberg - Are you a robot?` the site has
# served the reCAPTCHA challenge you would also see in a browser.
# Sometimes resending with the full header set (after waiting a few
# seconds) is enough; sometimes the captcha has to be solved manually.

import requests
from bs4 import BeautifulSoup

# Browser-like headers reduce the chance of hitting the bot-detection page.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US;q=0.7,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# A Session keeps cookies between requests; headers set here apply to
# every request made through it, so per-request `headers=` is unnecessary.
s = requests.Session()
s.headers.update(headers)

#print(s.headers)

# --- optionally hit the main page first to collect cookies ---

#url = 'https://www.bloomberg.com'
#print('url:', url)
#source = s.get(url)
#soup = BeautifulSoup(source.content, 'lxml')
#print('title:', soup.find('title').text)

#print(source.content)

# --- get the page with the company data ---

url = 'https://www.bloomberg.com/profile/company/AAPL:US'
print('url:', url)
source = s.get(url)  # fixed: session already carries the headers
soup = BeautifulSoup(source.content, 'lxml')

# fixed: guard against a response without a <title> (previously this
# raised AttributeError on `.text` when find() returned None).
title = soup.find('title')
print('title:', title.text if title else title)

#print(source.content)

# fixed: parse once (the original re-parsed the same content a second
# time) and use the modern `find_all` instead of the deprecated `findAll`.
company_name = soup.find_all('h1', class_='companyName__9bd88132')
company_description = soup.find_all('div', class_='description__ce057c5c')

# Empty result lists (e.g. on the captcha page) print as `[]` instead of crashing.
print('company_name:', company_name[0].text if company_name else company_name)
print('company_description:', company_description[0].text if company_description else company_description)

__scraping__/flashscore.com/main.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
# date: 2020.06.10
3+
# https://stackoverflow.com/questions/62293949/web-scraping-with-bs4-pyhton3-cant-find-elements/62294633#62294633
4+
5+
import requests
6+
import bs4 as bs
7+
8+
#url = 'https://www.flashscore.com/field-hockey/netherlands/hoofdklasse/standings/'
9+
10+
url = 'https://d.flashscore.com/x/feed/ss_1_INmPqO86_GOMWObX1_table_overall'
11+
12+
headers = {
13+
# 'User-Agent': 'Mozilla/5.0'
14+
# 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
15+
# 'X-Referer': 'https://www.flashscore.com/field-hockey/netherlands/hoofdklasse/standings/',
16+
'X-Fsign': 'SW9D1eZo',
17+
# 'X-Requested-With': 'XMLHttpRequest',
18+
# 'Referer': 'https://d.flashscore.com/x/feed/proxy-local',
19+
}
20+
21+
r = requests.get(url, headers=headers)
22+
#print(r.text)
23+
24+
soup = bs.BeautifulSoup(r.text, 'lxml')
25+
26+
for item in soup.find_all('span', class_='team_name_span'):
27+
print(item.text)
28+
29+

0 commit comments

Comments
 (0)