Commit 8058f5f

feat: scrapy basic add
1 parent f1f7143 commit 8058f5f

File tree

14 files changed (+264, -0 lines)


python_scrapy/README.md

+28
@@ -0,0 +1,28 @@
* Scrapy is an application framework written in pure Python for crawling websites and extracting structured data; it is useful for a wide range of purposes.
* A framework takes a simple one-off script and abstracts it into a standardized, well-structured system.
* Scrapy is built on the Twisted asynchronous networking framework, which speeds up data crawling.

![img](./img.png)

![img](./img_1.png)


How the scrapy framework works
> The spider hands the initial request URLs to the engine, which passes them to the scheduler,
> where they are placed in a queue and deduplicated.

![img](./img_2.png)

![img_4.png](img_4.png)

`pip install scrapy`
`pip install pypiwin32`

> Creating a scrapy project
> * scrapy startproject <project name>
> * then generate a spider inside the project: scrapy genspider <spider name> <domain to crawl>
> * the spider name is what you use to start the spider
> * the domain tells the spider which domain it is allowed to crawl
>
> > Once the project has been created, it contains several important files:
>
> > 1. items.py | 2. middlewares.py | 3. pipelines.py | 4. settings.py
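
To make the project-creation steps above concrete, here is a minimal sketch of a spider such as `scrapy genspider` might scaffold; the spider name `example`, the domain `example.com`, and the `parse()` body are illustrative assumptions, not files in this commit.

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider for illustration only.
    name = "example"                        # used by `scrapy crawl example`
    allowed_domains = ["example.com"]       # requests outside this domain are filtered out
    start_urls = ["https://example.com/"]   # initial URLs handed to the engine/scheduler

    def parse(self, response):
        # Extract the page title and yield it as a plain dict item.
        yield {"title": response.css("title::text").get()}
```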

python_scrapy/img.png

41.1 KB

python_scrapy/img_1.png

90.6 KB

python_scrapy/img_2.png

237 KB

python_scrapy/img_3.png

133 KB

python_scrapy/img_4.png

133 KB

python_scrapy/my_scrapy/my_scrapy/__init__.py

Whitespace-only changes.
python_scrapy/my_scrapy/my_scrapy/items.py

+12

@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
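
The generated items.py is only a stub; below is a minimal sketch of how fields might be declared, assuming hypothetical `title` and `url` fields that are not part of this commit.

```python
import scrapy


class MyScrapyItem(scrapy.Item):
    # Illustrative fields: declare one scrapy.Field() per attribute the spider
    # should populate and the pipelines should receive.
    title = scrapy.Field()
    url = scrapy.Field()
```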
python_scrapy/my_scrapy/my_scrapy/middlewares.py

+103

@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class MyScrapySpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class MyScrapyDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
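
As a hedged illustration of what `process_request()` can do, the sketch below is a downloader middleware that overrides the User-Agent header on every outgoing request; the class name and header value are assumptions, and the class would still have to be registered in DOWNLOADER_MIDDLEWARES to take effect.

```python
class CustomUserAgentMiddleware:
    # Illustrative downloader middleware (not part of this commit): it rewrites
    # the User-Agent header before the request reaches the downloader.
    def process_request(self, request, spider):
        request.headers["User-Agent"] = "my_scrapy-bot/0.1 (+https://example.com)"
        return None  # None means: continue processing this request normally
```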
python_scrapy/my_scrapy/my_scrapy/pipelines.py

+13

@@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class MyScrapyPipeline:
    def process_item(self, item, spider):
        return item
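
The generated pipeline just passes items through unchanged. Below is a minimal sketch of a pipeline that writes each item to a JSON-lines file, assuming a hypothetical `items.jl` output path; it would also need to be enabled in ITEM_PIPELINES before it runs.

```python
import json

from itemadapter import ItemAdapter


class JsonLinesPipeline:
    # Illustrative pipeline (not part of this commit): writes one JSON object
    # per scraped item to items.jl.
    def open_spider(self, spider):
        self.file = open("items.jl", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False)
        self.file.write(line + "\n")
        return item
```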
python_scrapy/my_scrapy/my_scrapy/settings.py

+93

@@ -0,0 +1,93 @@
# Scrapy settings for my_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "my_scrapy"

SPIDER_MODULES = ["my_scrapy.spiders"]
NEWSPIDER_MODULE = "my_scrapy.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "my_scrapy (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "my_scrapy.middlewares.MyScrapySpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "my_scrapy.middlewares.MyScrapyDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "my_scrapy.pipelines.MyScrapyPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
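
To actually activate the template classes from the files above, the commented-out settings would be switched on. A minimal sketch, assuming the priority values already suggested in the template comments and an illustrative one-second download delay:

```python
# Illustrative additions to settings.py (left disabled in this commit):
ITEM_PIPELINES = {
    "my_scrapy.pipelines.MyScrapyPipeline": 300,
}
DOWNLOADER_MIDDLEWARES = {
    "my_scrapy.middlewares.MyScrapyDownloaderMiddleware": 543,
}
DOWNLOAD_DELAY = 1  # assumed value: wait ~1s between requests to the same site
```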
python_scrapy/my_scrapy/my_scrapy/spiders/__init__.py

+4

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

python_scrapy/my_scrapy/scrapy.cfg

+11
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = my_scrapy.settings

[deploy]
#url = http://localhost:6800/
project = my_scrapy
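
With scrapy.cfg in place, a spider is normally started with `scrapy crawl <spider name>` from this directory. As a hedged alternative, a hypothetical run.py could drive the crawl programmatically; the spider name "example" is an assumption and not part of this commit.

```python
# Hypothetical run.py placed next to scrapy.cfg: runs a spider programmatically.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("example")  # spider name as registered under my_scrapy/spiders
process.start()           # blocks until the crawl finishes
```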

requirement.txt

5.97 KB
Binary file not shown.
