Skip to content

Commit 0bde3e6

Browse files
authored
Ability to abort requests via setting (#63)
1 parent d08a944 commit 0bde3e6

File tree

7 files changed

+72
-1
lines changed

7 files changed

+72
-1
lines changed

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
9797
the default value will be used (30000 ms at the time of writing this).
9898
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout).
9999

100-
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str`, default `scrapy_playwright.headers.use_scrapy_headers`)
100+
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)
101101

102102
A coroutine function (`async def`), or the path to one, that processes headers for a given request
103103
and returns a dictionary with the headers to be used (note that, depending on the browser,
@@ -124,6 +124,21 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
124124
Maximum amount of allowed concurrent Playwright pages for each context.
125125
See the [notes about leaving unclosed pages](#receiving-the-page-object-in-the-callback).
126126

127+
* `PLAYWRIGHT_ABORT_REQUEST` (type `Optional[Union[Callable, str]]`, default `None`)
128+
129+
A predicate function (or the path to a function) that receives a
130+
`playwright.async_api.Request` object and must return `True` if the
131+
request should be aborted, `False` otherwise.
132+
133+
For instance, the following prevents the download of images:
134+
```python
135+
PLAYWRIGHT_ABORT_REQUEST = lambda req: req.resource_type == "image"
136+
```
137+
138+
Note that all requests will appear in the DEBUG level logs; however, there will
139+
be no corresponding response log lines for aborted requests. Aborted requests
140+
are counted in the `playwright/request_count/aborted` job stats item.
141+
127142

128143
## Basic usage
129144

scrapy_playwright/handler.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ def __init__(self, crawler: Crawler) -> None:
109109
self.contexts: Dict[str, BrowserContext] = {}
110110
self.context_semaphores: Dict[str, asyncio.Semaphore] = {}
111111

112+
self.abort_request: Optional[Callable[[PlaywrightRequest], bool]] = None
113+
if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
114+
self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])
115+
112116
@classmethod
113117
def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
114118
return cls(crawler)
@@ -332,6 +336,11 @@ def _make_request_handler(
332336
) -> Callable:
333337
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
334338
"""Override request headers, method and body."""
339+
if self.abort_request and self.abort_request(playwright_request):
340+
await route.abort()
341+
self.stats.inc_value("playwright/request_count/aborted")
342+
return None
343+
335344
processed_headers = await self.process_request_headers(
336345
self.browser_type, playwright_request, scrapy_headers
337346
)

tests/site/gallery.html

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<title>Gallery</title>
5+
<meta charset="utf-8" />
6+
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
7+
</head>
8+
9+
<body>
10+
<div>
11+
<h1>Gallery</h1>
12+
<ul>
13+
<li>
14+
<img src="static/img/ales-krivec-ZMZHcvIVgbg-unsplash.jpg" />
15+
</li>
16+
<li>
17+
<img src="static/img/elyssa-fahndrich-MF16lGb95WY-unsplash.jpg" />
18+
</li>
19+
<li>
20+
<img src="static/img/nathan-dumlao-RCfalHrnFAs-unsplash.jpg" />
21+
</li>
22+
</ul>
23+
</div>
24+
</body>
25+
</html>
127 KB
Loading
145 KB
Loading
185 KB
Loading

tests/test_playwright_requests.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,28 @@ async def test_response_attributes(self):
393393

394394
assert response.ip_address == ip_address(server.address)
395395

396+
@pytest.mark.asyncio
397+
async def test_abort_requests(self):
398+
settings_dict = {
399+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
400+
"PLAYWRIGHT_ABORT_REQUEST": lambda req: req.resource_type == "image",
401+
}
402+
async with make_handler(settings_dict) as handler:
403+
with StaticMockServer() as server:
404+
req = Request(
405+
url=server.urljoin("/gallery.html"),
406+
meta={"playwright": True},
407+
)
408+
await handler._download_request(req, Spider("foo"))
409+
410+
req_prefix = "playwright/request_count"
411+
resp_prefix = "playwright/response_count"
412+
assert handler.stats.get_value(f"{req_prefix}/resource_type/document") == 1
413+
assert handler.stats.get_value(f"{req_prefix}/resource_type/image") == 3
414+
assert handler.stats.get_value(f"{resp_prefix}/resource_type/document") == 1
415+
assert handler.stats.get_value(f"{resp_prefix}/resource_type/image") is None
416+
assert handler.stats.get_value(f"{req_prefix}/aborted") == 3
417+
396418

397419
class TestCaseChromium(MixinTestCase):
398420
browser_type = "chromium"

0 commit comments

Comments
 (0)