Skip to content

Commit d00d5af

Browse files
committed
new examples
1 parent d53ec56 commit d00d5af

File tree

47 files changed

+8890
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+8890
-0
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env python3

# date: 2019.11.24
# https://stackoverflow.com/questions/59019810/python-web-scraping-ahref-link-and-articles-not-showing-up-in-source-code

# Scrape the Dow corporate news listing with Selenium. The page builds its
# result list with JavaScript, so a plain requests/urllib fetch would not
# see the items — a real browser is required.

import selenium.webdriver
from selenium.webdriver.common.by import By

url = 'https://corporate.dow.com/en-us/news.html'

driver = selenium.webdriver.Firefox()
driver.get(url)

# `find_elements_by_xpath` was removed in Selenium 4 —
# use the `find_elements(By.XPATH, ...)` form instead.
all_items = driver.find_elements(By.XPATH, '//ul[@class="results__list"]/li')

for item in all_items:
    # headline text and the article link inside each <li>
    print(item.find_element(By.XPATH, './/h3').text)
    print(item.find_element(By.XPATH, './/a').get_attribute('href'))
    print('---')
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env python3

# date: 2019.11.23
# https://stackoverflow.com/questions/59008770/want-to-read-a-tag-data-using-selenium

# Read the last data table on the Pakistan Stock Exchange page and print,
# for every row, the ticker symbol and its right-aligned numeric cells.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get('https://dps.psx.com.pk/')

# `find_elements_by_xpath` was removed in Selenium 4 —
# use the `find_elements(By.XPATH, ...)` form instead.
last_table = driver.find_elements(By.XPATH, "//table")[-1]

for row in last_table.find_elements(By.XPATH, ".//tr")[1:]:  # [1:] skips the header row
    print(row.find_element(By.XPATH, ".//td/a[@class='tbl__symbol']").text)
    print([td.text for td in row.find_elements(By.XPATH, ".//td[@class='right']")])
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env python3

# date: 2019.11.23
# https://stackoverflow.com/questions/59008426/python-web-scrapping-if-using-all-scalar-values-you-must-pass-an-index

# Scrape one drug's information page and build a one-row DataFrame, one
# column per section of the article.

import pandas as pd
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

url = 'https://www.medindia.net/doctors/drug_information/abacavir.htm'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# container that holds the whole drug article
drug = soup.find(class_='mi-container__fluid')

# one widget per section of drug information
items = drug.find_all(class_='report-content drug-widget')

# (column name, widget index) pairs — index 6 is deliberately not used
sections = [
    ('trade_name', 0),
    ('function', 1),
    ('contraindications', 2),
    ('dosage', 3),
    ('how_to_use', 4),
    ('warnings', 5),
    ('storage', 7),
]

# each column is a single-element list so pandas gets an index for the row
drug_stuff = pd.DataFrame({
    name: [items[idx].find(class_='drug-content').get_text()]
    for name, idx in sections
})

print(drug_stuff)

__scraping__/money.cnn.com/main.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/usr/bin/env python3

# date: 2019.11.23
# https://stackoverflow.com/questions/59004270/i-want-to-display-first-word-from-1st-list-and-display-10-words-from-2nd-list-an

# Scrape the three "hot stocks" tables from money.cnn.com and print each
# table's header followed by its (symbol, company) rows.

from bs4 import BeautifulSoup
import urllib.request

url = 'https://money.cnn.com/data/hotstocks/'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

allbody = soup.find('div', class_='cnnBody_Left wsodContent')

# one <h3> header per table
names = [h3.text for h3 in allbody.find_all('h3')]

contents = allbody.find_all('table', class_='wsod_dataTable wsod_dataTableBigAlt')

tables = []  # keep three tables

for content in contents:
    rows = []  # rows of a single table

    for tr in content.find_all('tr')[1:]:  # [1:] skips the header row
        # both the <a> and the <span> live in the first column
        link_text = tr.find('a').text.strip()
        span_text = tr.find('span').text.strip()
        rows.append((link_text, span_text))

    tables.append(rows)

for name, table in zip(names, tables):
    print('-', name)
    for link_text, span_text in table:
        print(link_text, span_text)

__scraping__/rtrs.tv/main.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from requests import session

from bs4 import BeautifulSoup

url = r'https://www.rtrs.tv/vijesti/index.php'

# present ourselves as a regular desktop browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

with session() as c:

    r = c.get(url, headers=headers)

    print(r)

    # print the text of every <h2> headline on the page
    soup = BeautifulSoup(r.text, 'html.parser')

    for h2 in soup.find_all('h2'):
        print(h2.text)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env python3

# date: 2019.11.23
# https://stackoverflow.com/questions/59003700/pythonon-ajax-php-prase-result-is-different-from-on-screen-result

# Query the site's AJAX endpoint directly (the same POST its own JavaScript
# sends) instead of scraping the rendered page, then read the JSON response
# both as a plain dict and as a pandas DataFrame.
#
# Fix: all imports hoisted to the top of the file (PEP 8) — `pandas` and
# `io` were previously imported in the middle of the script.

import io

import pandas as pd
import requests

url = 'http://std.stheadline.com/daily/ajax/ajaxFormerly.php'

# form fields the page's JavaScript submits with the search request
params = {
    'startDate': '2019-11-20',
    'endDate': '2019-11-22',
    'type[]': '15',
    'keyword': '',
}

# the endpoint expects POST form data, not a GET query string
r = requests.post(url, data=params)

data = r.json()

print(data['totalCount'])  # 47

# same payload parsed into a DataFrame; StringIO makes the raw text
# look like a file object for read_json
f = io.StringIO(r.text)
df = pd.read_json(f)

print(df)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python3

# date: 2019.11.23
# https://stackoverflow.com/questions/59003872/running-for-loop-and-skipping-stocks-with-keyerror-date

# Scrape S&P 500 ticker symbols from Wikipedia, then download each ticker's
# price history from Yahoo and write it to a per-ticker CSV, skipping any
# ticker whose download raises.

from datetime import datetime, timedelta
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from pandas_datareader import data as web

html = urlopen('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = BeautifulSoup(html, 'lxml')
sp500_raw = soup.find('table', {'class': 'wikitable sortable'})

# ticker is the first <td> of every data row; [1:] skips the header row
spsymbol = [row.find_all('td')[0].text.strip()
            for row in sp500_raw.find_all('tr')[1:]]

start = datetime(2008, 1, 1).date()
end = datetime.today().date()

for ticker in spsymbol:
    print(ticker)
    try:
        df = web.get_data_yahoo(ticker, start, end).reset_index()
        df.to_csv(ticker + '.csv', header=True, index=True,
                  columns=['Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'],
                  sep=' ')
    except Exception as ex:
        # some tickers have no Yahoo data — report and carry on
        print('Ex:', ex)
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from bs4 import BeautifulSoup as BS

# Each <tr> carries a small HTML fragment inside its data-title attribute;
# parse that fragment again to reach the <img> URL inside it.

text = '''
<tr data-title='<img src="url1.jpg" alt="1">' >
<tr data-title='<img src="url2.jpg" alt="2">' >
'''

soup = BS(text, 'html.parser')

# {"data-title": True} matches only tags that actually have the attribute
for row in soup.find_all('tr', {"data-title": True}):
    fragment = row['data-title']
    print('item:', fragment)

    # the attribute value is itself HTML — parse it as a new document
    inner = BS(fragment, 'html.parser')
    print('src:', inner.find('img')['src'])

0 commit comments

Comments
 (0)