Commit 585ac63

__scraping__
1 parent 279700f commit 585ac63

5 files changed: +279 -0 lines changed

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

# date: 2019.12.18
# https://stackoverflow.com/questions/59386434/selenium-webdriver-i-want-to-click-on-the-next-page-till-last-page/59387563#59387563

from selenium import webdriver
#from bs4 import BeautifulSoup as bs
import time

url = 'https://curecity.in/vendor-list.php?category=Doctor&filters_location=Jaipur&filters%5Bsubareas_global%5D=&filters_speciality='

#driver = webdriver.Chrome('C:\chromedriver.exe')
driver = webdriver.Firefox()
driver.maximize_window()

driver.get(url)
next_page_number = 1

while True:

    print('page:', next_page_number)
    time.sleep(10)  # the page loads very slowly, so it needs a longer sleep

    #soup = bs(driver.page_source, 'html.parser')
    #for link in soup.find_all('div', class_='col-md-9 feature-info'):
    #    link1 = link.find('a')
    #    print(link1['href'])

    for link in driver.find_elements_by_xpath('//div[@class="col-md-2 feature-icon"]/a'):
        print(link.get_attribute('href'))

    try:
        # the '>' button jumps 3 pages, so click the button with the next page number instead
        next_page_number += 1
        driver.find_element_by_xpath('//a[@data-page="{}"]'.format(next_page_number)).click()
    except Exception:
        print('No more pages')
        break  # exit loop

#driver.close()
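
Note: the find_element_by_xpath / find_elements_by_xpath helpers used above are gone from current Selenium 4 releases. A minimal sketch of the same pagination loop written against the find_element(By.XPATH, ...) API, assuming the page still uses the same data-page pager links:

# sketch only: the pagination idea above rewritten for the Selenium 4 API
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()
driver.get('https://curecity.in/vendor-list.php?category=Doctor&filters_location=Jaipur&filters%5Bsubareas_global%5D=&filters_speciality=')

next_page_number = 1

while True:
    time.sleep(10)  # the page is slow to load

    for link in driver.find_elements(By.XPATH, '//div[@class="col-md-2 feature-icon"]/a'):
        print(link.get_attribute('href'))

    try:
        next_page_number += 1
        driver.find_element(By.XPATH, f'//a[@data-page="{next_page_number}"]').click()
    except NoSuchElementException:
        print('No more pages')
        break
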
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

# date: 2019.12.20
# https://stackoverflow.com/questions/59419682/how-do-i-extract-this-entire-table-and-store-it-in-csv-file/

import requests

r = requests.get('https://games.crossfit.com/competitions/api/v1/competitions/open/2020/leaderboards?view=0&division=1&scaled=0&sort=0')

data = r.json()

# print each athlete's name, overall score, and per-event (rank, score)
for row in data['leaderboardRows']:
    print(row['entrant']['competitorName'], row['overallScore'], [(x['rank'], x['scoreDisplay']) for x in row['scores']])
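
The linked question asks to store the table in a CSV file; a minimal sketch with the standard csv module, assuming the same JSON structure (the leaderboard.csv file name is only an example):

import csv
import requests

r = requests.get('https://games.crossfit.com/competitions/api/v1/competitions/open/2020/leaderboards?view=0&division=1&scaled=0&sort=0')
data = r.json()

with open('leaderboard.csv', 'w', newline='') as f:  # example file name
    writer = csv.writer(f)
    writer.writerow(['competitorName', 'overallScore', 'scores'])
    for row in data['leaderboardRows']:
        scores = [(x['rank'], x['scoreDisplay']) for x in row['scores']]
        writer.writerow([row['entrant']['competitorName'], row['overallScore'], scores])
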
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
#!/usr/bin/env python3

# date: 2019.12.12
# https://stackoverflow.com/questions/59259699/scrapy-formrequest-parameter-not-working-but-showing-all-result-instead/
# page: https://researchgrant.gov.sg/eservices/advanced-search/

import scrapy
import urllib.parse

class MySpider(scrapy.Spider):

    name = 'myspider'
    #allowed_domains = []

    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',

        #'_pp_hiname': 'tan',
        #'_pp_piname': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',  #'pua',

        '_pp_source': '',
        '_pp_details': '',
    }

    def start_requests(self):

        # create the request for the first page
        args = urllib.parse.urlencode(self.args)

        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args

        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'})

    def parse_item(self, response):
        #print('parse_item] url:', response.url)
        #print('parse_item] text:', response.text)

        #for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
        #    for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
        #        link = row.xpath('td[1]/a/@href').extract_first()
        #        yield scrapy.Request(link, callback=self.parse_product)

        for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
            cols = row.xpath('.//td')
            link = cols[0].xpath('.//a/@href').get().strip()
            title = cols[0].xpath('.//a/text()').get().strip()
            status = cols[1].xpath('.//text()').get().strip()
            pi = cols[2].xpath('.//text()').get().strip()
            hi = cols[3].xpath('.//text()').get().strip()
            date = cols[4].xpath('.//text()').get().strip()

            item = {
                #'id': project_id,
                'status': status,
                'title': title,
                'link': link,
                'pi': pi,
                'hi': hi,
                'date': date,
            }

            # a few links redirect to the main page, so they would be dropped as duplicates; `dont_filter=True` is needed
            yield scrapy.Request(link, meta={'item': item}, callback=self.parse_product, dont_filter=True)

        # create the request for the next page
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()

        if onclick:
            # next page
            self.args['page'] += 1
            args = urllib.parse.urlencode(self.args)
            url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
            yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'})

    def parse_product(self, response):
        #print('parse_product] url:', response.url)
        item = response.meta['item']

        # .extract_first() or .get() instead of .extract()
        project_id = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjIdExt"]/text()').get()
        #title = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle"]/text()').get()
        #pi = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblLeadPIName"]/text()').get()
        #hi = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblHostInstName"]/text()').get()
        #date = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_dtPickerStartDate"]/text()').get()
        # etc.
        item['id'] = project_id

        yield item

# --- run without a Scrapy project and save results in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save to a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',  # csv, json, xml
    'FEED_URI': 'output.csv',
})

c.crawl(MySpider)
c.start()
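
FEED_FORMAT and FEED_URI still work here but are deprecated in newer Scrapy releases (2.1+), which use a single FEEDS setting instead; a sketch of the same standalone run with that setting:

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # newer Scrapy: one FEEDS dict replaces FEED_FORMAT/FEED_URI
    'FEEDS': {'output.csv': {'format': 'csv'}},
})
c.crawl(MySpider)
c.start()
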

__scraping__/shopee.com.my/main.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
#!/usr/bin/env python3

# date: 2019.12.14
#

import selenium.webdriver
from selenium.webdriver.common.action_chains import ActionChains

import time
url = 'https://shopee.com.my/search?keyword=mattress'

driver = selenium.webdriver.Firefox()
driver.get(url)
time.sleep(1)

# select language
driver.find_element_by_xpath('//div[@class="language-selection__list"]/button').click()
time.sleep(3)

# scroll a few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)

# get all links
all_items = driver.find_elements_by_xpath('//a[@data-sqe="link"]')
print('len:', len(all_items))

all_urls = []

for item in all_items:
    url = item.get_attribute('href')
    all_urls.append(url)
    print(url)

# use links

#for item in all_urls:
#    driver.get(item)
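
A minimal sketch of the commented-out "use links" step, continuing from the driver and all_urls defined above; it only loads each collected product page and prints its <title>, and the pause length is an assumption, not a measured load time:

# sketch: visit each collected product URL and print its page title
for product_url in all_urls:
    driver.get(product_url)
    time.sleep(2)        # assumed pause; adjust to the real page load time
    print(driver.title)  # page <title>, just to confirm the page loaded

driver.quit()
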
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
#!/usr/bin/env python3

# date: 2019.12.17
#

# simple crawler: follows internal links level by level, up to `max_level` levels deep

import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_links(driver, url):
    driver.get(url)
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, "lxml")

    links = []

    for new_url in soup.find_all('a', href=True):
        new_url = new_url.get('href')
        new_url = urljoin(url, new_url)
        links.append(new_url)

    return links

# ---

options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
#driver = webdriver.Chrome("./chromedriver", options=options)
driver = webdriver.Firefox()

# ---

domain = 'https://spaceflightnow.com/'  # to filter out external links
start_url = 'https://spaceflightnow.com/'
max_level = 2

links_visited = set([start_url])      # to test for already visited links
links_with_levels = [(start_url, 0)]  # to control levels

# ---

for link, level in links_with_levels:
    if level >= max_level:
        print('skip:', level, link)
        continue

    print('visit:', level, link)

    links = get_links(driver, link)

    print('found:', len(links))
    links = list(set(links) - links_visited)
    print('after filtering:', len(links))

    level += 1

    for new_link in links:
        if new_link.startswith(domain):  # filter out external links
            links_visited.add(new_link)
            links_with_levels.append((new_link, level))

# ---

# list everything that was collected
for link, level in links_with_levels:
    print('collected:', level, link)

