
Commit cd8a0c0
Add files via upload
1 parent 19e76d8 commit cd8a0c0
7 files changed: +455 −0 lines changed
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
import scrapy
import pandas as pd

# Fragment meant to run inside a Scrapy callback, where `response` is available.
# .get() returns the div's outer HTML; pandas.read_html parses any <table>
# elements inside it and returns a list of DataFrames (one per table).
tbl = response.xpath('(//div[contains(@class,"tabbertab tabbertabdefault")])[1]').get()
df = pd.read_html(tbl)
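
A minimal sketch of how this fragment could be wired into a full spider; the spider name, start URL, and yielded fields are placeholders, not part of the commit:

import scrapy
import pandas as pd

class TableSpider(scrapy.Spider):
    # Hypothetical spider illustrating the snippet above inside a callback.
    name = "table_demo"
    start_urls = ["https://example.com/page-with-tabs"]

    def parse(self, response):
        # Same XPath as above: the first tabbed section's raw HTML.
        tbl = response.xpath('(//div[contains(@class,"tabbertab tabbertabdefault")])[1]').get()
        if tbl:
            # read_html returns one DataFrame per <table> found in the HTML.
            for df in pd.read_html(tbl):
                for record in df.to_dict("records"):
                    yield record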
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
import os
import fnmatch
import ast
import shutil

total_cnt = 0

# Walk a Scrapy FilesystemCacheStorage directory and delete cached entries whose
# stored status code marks a blocked or failed response, so they get re-fetched.
PATH = ''
for path, dirs, files in os.walk(PATH):
    for f in fnmatch.filter(files, 'meta'):
        fullname = os.path.abspath(os.path.join(path, f))
        # The 'meta' file is a repr()'d Python dict, not JSON, so parse it
        # safely with ast.literal_eval rather than eval.
        with open(fullname, "r") as fp:
            meta_info = ast.literal_eval(fp.read())
        status = meta_info['status']
        response_url = meta_info['response_url']
        total_cnt += 1

        if status in [307, 403, 407, 504, 520, 522, 524, 525]:
            shutil.rmtree(path)
            print(status, path)
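
Because rmtree is destructive, a dry-run pass over the same cache can be useful first; this sketch is an addition (the DRY_RUN flag and BAD_STATUSES name are not in the commit):

import os
import fnmatch
import ast
import shutil

DRY_RUN = True        # set to False to actually delete matching cache entries
PATH = ''             # root of the Scrapy HTTP cache, as in the script above
BAD_STATUSES = {307, 403, 407, 504, 520, 522, 524, 525}

for path, dirs, files in os.walk(PATH):
    for f in fnmatch.filter(files, 'meta'):
        with open(os.path.join(path, f), "r") as fp:
            meta = ast.literal_eval(fp.read())
        if meta['status'] in BAD_STATUSES:
            print('would remove' if DRY_RUN else 'removing', meta['status'], path)
            if not DRY_RUN:
                shutil.rmtree(path)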

scrapy/how to run scrapy.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
'''
scrapy crawl itunes -s LOG_ENABLED=False
'''

if __name__ == '__main__':
    from scrapy.utils.project import get_project_settings
    # CrawlerRunner is the alternative for embedding in an existing reactor
    # (see the sketch after this file).
    from scrapy.crawler import CrawlerProcess, CrawlerRunner

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # With project settings loaded, the spider can be referenced by name;
    # passing the ItunesSpider class directly also works once it is imported.
    process.crawl('itunes')
    process.start()
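
For reference, a minimal sketch of the CrawlerRunner variant, which leaves the Twisted reactor under the caller's control (it assumes the same 'itunes' spider inside the project):

if __name__ == '__main__':
    from twisted.internet import reactor
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import CrawlerRunner

    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('itunes')            # spider name resolved via the project
    d.addBoth(lambda _: reactor.stop())   # stop the reactor when the crawl ends
    reactor.run()                         # blocks until reactor.stop() is called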

scrapy/scrapy_as_csv.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
import random
import os
import csv
import scrapy
from scrapy.http import FormRequest
from scrapy import signals

def make_headers():
    # Randomise the Chrome version string so repeated runs do not all present
    # exactly the same User-Agent.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{:02d}.0.{:04d}.{} Safari/537.36'.format(
            random.randint(63, 84), random.randint(0, 9999), random.randint(98, 132)),
    }
    return headers

timeout = 100
conn_limit = 200

class MainScraper(scrapy.Spider):
    name = "arrt_scrapy"
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'CONCURRENT_REQUESTS': conn_limit,
        # 'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': conn_limit,
        # 'AUTOTHROTTLE_START_DELAY': 1,
        # 'AUTOTHROTTLE_MAX_DELAY': 360,
        'AUTOTHROTTLE_DEBUG': True,
        # 'DOWNLOAD_DELAY': 1,
        # 'dont_filter': True,
        'RETRY_ENABLED': False,
        # 'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS_PER_DOMAIN': conn_limit,
        'CONCURRENT_REQUESTS_PER_IP': conn_limit,
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_IGNORE_HTTP_CODES': [301, 302, 403, 404, 429, 500, 502, 503],
        'HTTPCACHE_STORAGE': 'scrapy.extensions.httpcache.FilesystemCacheStorage',
        'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.DummyPolicy',
        # 'LOG_ENABLED': False,
        'DOWNLOAD_TIMEOUT': timeout,
        'URLLENGTH_LIMIT': 99999,
    }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Hook spider_closed so the spider can finalise its output file.
        spider = super(MainScraper, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.result_dir = os.path.join(os.getcwd(), "Result")
        if not os.path.exists(self.result_dir):
            os.makedirs(self.result_dir)

        self.result_fname = os.path.join(self.result_dir, "Result.csv")
        self.create_result_file()

        # Column headers for the output file (fill in as needed).
        heading = [
            "", "", "", "", "", "", "", ""
        ]
        if os.path.getsize(self.result_fname) == 0:
            self.insert_row(result_row=heading)

        self.total_cnt = 0
        self.total_result = []
        self.total_links = []

    def start_requests(self):
        url = ""
        param1 = ""
        request = FormRequest(
            url=url,
            method='GET',
            headers=make_headers(),
            callback=self.get_links,
            errback=self.fail_links,
            dont_filter=True,
            meta={
                'url': url,
                'param1': param1,
                # 'proxy': pxy
                # 'handle_httpstatus_all': True,
                # 'dont_redirect': True,
            }
        )
        yield request

    def get_links(self, response):
        url = response.meta['url']
        param1 = response.meta['param1']
        # Project-specific extraction goes here; the XPath is a placeholder.
        XPATH = ''

        rows = response.xpath(XPATH)

    def fail_links(self, failure):
        # On any download error, re-issue the same request with fresh headers.
        request = FormRequest(
            url=failure.request.meta['url'],
            method='GET',
            headers=make_headers(),
            callback=self.get_links,
            errback=self.fail_links,
            dont_filter=True,
            meta={
                'url': failure.request.meta['url'],
                'param1': failure.request.meta['param1'],
                # 'proxy': pxy
                # 'handle_httpstatus_all': True,
                # 'dont_redirect': True,
            }
        )
        yield request

    def create_result_file(self):
        self.result_fp = open(self.result_fname, 'w', encoding='utf-8', newline='')
        self.result_writer = csv.writer(self.result_fp)

    def insert_row(self, result_row):
        result_row = [str(elm) for elm in result_row]
        self.result_writer.writerow(result_row)
        self.result_fp.flush()

    def spider_closed(self, spider):
        # Close the CSV file handle once the crawl finishes.
        self.result_fp.close()

if __name__ == '__main__':
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import CrawlerProcess

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(MainScraper)
    process.start()
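
For the common case where each result is yielded as an item, Scrapy's built-in feed exports could replace the manual csv.writer plumbing; a minimal sketch, with a hypothetical spider name, URL, and XPaths (FEEDS needs Scrapy >= 2.1, the overwrite option >= 2.4):

import scrapy

class FeedExportSpider(scrapy.Spider):
    # Hypothetical spider showing the built-in alternative to manual CSV writing.
    name = "feed_export_demo"
    start_urls = ["https://example.com/"]
    custom_settings = {
        # Scrapy writes every yielded item to this CSV automatically.
        'FEEDS': {'Result/Result.csv': {'format': 'csv', 'overwrite': True}},
    }

    def parse(self, response):
        for row in response.xpath('//tr'):
            yield {
                'first_cell': row.xpath('./td[1]//text()').get(),
                'second_cell': row.xpath('./td[2]//text()').get(),
            }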

scrapy/scrapy_as_excel.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import random
import os
import openpyxl
import scrapy
from scrapy.http import FormRequest
from scrapy import signals

def make_headers():
    # Randomise the Chrome version string so repeated runs do not all present
    # exactly the same User-Agent.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{:02d}.0.{:04d}.{} Safari/537.36'.format(
            random.randint(63, 84), random.randint(0, 9999), random.randint(98, 132)),
    }
    return headers

timeout = 100
conn_limit = 200

class MainScraper(scrapy.Spider):
    name = "arrt_scrapy"
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'CONCURRENT_REQUESTS': conn_limit,
        # 'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': conn_limit,
        # 'AUTOTHROTTLE_START_DELAY': 1,
        # 'AUTOTHROTTLE_MAX_DELAY': 360,
        'AUTOTHROTTLE_DEBUG': True,
        # 'DOWNLOAD_DELAY': 1,
        # 'dont_filter': True,
        'RETRY_ENABLED': False,
        # 'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS_PER_DOMAIN': conn_limit,
        'CONCURRENT_REQUESTS_PER_IP': conn_limit,
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_IGNORE_HTTP_CODES': [301, 302, 403, 404, 429, 500, 502, 503],
        'HTTPCACHE_STORAGE': 'scrapy.extensions.httpcache.FilesystemCacheStorage',
        'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.DummyPolicy',
        # 'LOG_ENABLED': False,
        'DOWNLOAD_TIMEOUT': timeout,
        'URLLENGTH_LIMIT': 99999,
    }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Hook spider_closed so the workbook gets saved at the end of the crawl.
        spider = super(MainScraper, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.result_dir = os.path.join(os.getcwd(), "Result")
        if not os.path.exists(self.result_dir):
            os.makedirs(self.result_dir)

        # Save as .xlsx so openpyxl can load the workbook back on later runs.
        self.result_fname = os.path.join(self.result_dir, "Result.xlsx")
        self.create_result_file()

        # Column headers for the output file (fill in as needed); written only
        # when starting a fresh workbook.
        heading = [
            "", "", "", "", "", "", "", ""
        ]
        if not os.path.exists(self.result_fname):
            self.insert_row(result_row=heading)

        self.total_cnt = 0
        self.total_result = []
        self.total_links = []

    def start_requests(self):
        url = ""
        param1 = ""
        request = FormRequest(
            url=url,
            method='GET',
            headers=make_headers(),
            callback=self.get_links,
            errback=self.fail_links,
            dont_filter=True,
            meta={
                'url': url,
                'param1': param1,
                # 'proxy': pxy
                # 'handle_httpstatus_all': True,
                # 'dont_redirect': True,
            }
        )
        yield request

    def get_links(self, response):
        url = response.meta['url']
        param1 = response.meta['param1']
        # Project-specific extraction goes here; the XPath is a placeholder.
        XPATH = ''

        rows = response.xpath(XPATH)

    def fail_links(self, failure):
        # On any download error, re-issue the same request with fresh headers.
        request = FormRequest(
            url=failure.request.meta['url'],
            method='GET',
            headers=make_headers(),
            callback=self.get_links,
            errback=self.fail_links,
            dont_filter=True,
            meta={
                'url': failure.request.meta['url'],
                'param1': failure.request.meta['param1'],
                # 'proxy': pxy
                # 'handle_httpstatus_all': True,
                # 'dont_redirect': True,
            }
        )
        yield request

    # 2. Write xlsx file
    def create_result_file(self):
        if os.path.exists(self.result_fname):
            # Reopen the existing workbook and continue below its last row.
            self.xfile = openpyxl.load_workbook(self.result_fname)
            self.sheet = self.xfile.active
            self.row_index = self.sheet.max_row
        else:
            self.xfile = openpyxl.Workbook()
            self.sheet = self.xfile.active
            self.row_index = 0

    def insert_row(self, result_row):
        self.row_index += 1
        for i, elm in enumerate(result_row):
            self.sheet.cell(row=self.row_index, column=i + 1).value = elm

    def spider_closed(self, spider):
        # Persist the workbook once the crawl finishes.
        self.xfile.save(self.result_fname)

if __name__ == '__main__':
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import CrawlerProcess

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(MainScraper)
    process.start()
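
openpyxl's Worksheet.append always writes to the next free row, which could replace the manual row_index bookkeeping above; a small standalone sketch (the file name and values are placeholders):

import os
import openpyxl

fname = "Result.xlsx"                    # placeholder path
if os.path.exists(fname):
    wb = openpyxl.load_workbook(fname)   # reopening appends below existing rows
else:
    wb = openpyxl.Workbook()
sheet = wb.active

sheet.append(["col_a", "col_b"])         # each append() fills the next empty row
sheet.append(["value 1", "value 2"])
wb.save(fname)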
