Skip to content

Commit a06b3b1

Browse files
authored
Rename PageCoroutine -> PageMethod (#70)
1 parent 9ee07aa commit a06b3b1

File tree

14 files changed

+314
-198
lines changed

14 files changed

+314
-198
lines changed

README.md

Lines changed: 72 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -326,54 +326,75 @@ async def parse_in_new_context(self, response):
326326
```
327327

328328

329-
## Page coroutines
329+
## Executing actions on pages
330330

331331
An ordered iterable (`list`, `tuple` or `dict`, for instance) can be passed
332-
in the `playwright_page_coroutines`
332+
in the `playwright_page_methods`
333333
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta)
334334
key to request methods to be called (and awaited if necessary) on the `Page` before returning the final
335335
`Response` to the callback.
336336

337337
This is useful when you need to perform certain actions on a page, like scrolling
338-
down or clicking links, and you want everything to count as a single Scrapy
339-
Response, containing the final result.
338+
down or clicking links, and you want to handle only the final result in your callback.
340339

341-
### `PageCoroutine` class
340+
### `PageMethod` class
342341

343-
* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:
342+
#### `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`:
344343

345-
Represents a coroutine to be awaited on a `playwright.page.Page` object,
346-
such as "click", "screenshot", "evaluate", etc.
347-
`method` should be the name of the coroutine, `*args` and `**kwargs`
348-
are passed to the function call. The return value of the coroutine call
349-
will be stored in the `PageCoroutine.result` attribute.
344+
Represents a method to be called (and awaited if necessary) on a
345+
`playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc.
346+
`method` is the name of the method, `*args` and `**kwargs`
347+
are passed when calling that method. The return value
348+
will be stored in the `PageMethod.result` attribute.
350349

351-
For instance,
352-
```python
353-
PageCoroutine("screenshot", path="quotes.png", fullPage=True)
354-
```
350+
For instance,
351+
```python
352+
def start_requests(self):
353+
yield Request(
354+
url="https://example.org",
355+
meta={
356+
"playwright": True,
357+
"playwright_page_methods": [
358+
PageMethod("screenshot", path="example.png", full_page=True),
359+
],
360+
},
361+
)
355362

356-
produces the same effect as:
357-
```python
358-
# 'page' is a playwright.async_api.Page object
359-
await page.screenshot(path="quotes.png", fullPage=True)
360-
```
363+
def parse(self, response):
364+
screenshot = response.meta["playwright_page_methods"][0]
365+
# screenshot.result contains the image's bytes
366+
```
361367

368+
produces the same effect as:
369+
```python
370+
def start_requests(self):
371+
yield Request(
372+
url="https://example.org",
373+
meta={"playwright": True, "playwright_include_page": True},
374+
)
362375

363-
### Supported coroutines
376+
async def parse(self, response):
377+
page = response.meta["playwright_page"]
378+
await page.screenshot(path="example.png", full_page=True)
379+
await page.close()
380+
```
381+
382+
383+
### Supported methods
364384

365385
Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
366-
to see available coroutines
386+
to see available methods.
367387

368388
### Impact on Response objects
369389

370390
Certain `Response` attributes (e.g. `url`, `ip_address`) reflect the state after the last
371-
action performed on a page. If you issue a `PageCoroutine` with an action that results in
391+
action performed on a page. If you issue a `PageMethod` with an action that results in
372392
a navigation (e.g. a `click` on a link), the `Response.url` attribute will point to the
373393
new URL, which might be different from the request's URL.
374394

375395

376396
## Page events
397+
377398
A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
378399
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
379400
Keys are the name of the event to be handled (`dialog`, `download`, etc).
@@ -430,15 +451,15 @@ class ClickAndSavePdfSpider(scrapy.Spider):
430451
url="https://example.org",
431452
meta=dict(
432453
playwright=True,
433-
playwright_page_coroutines={
434-
"click": PageCoroutine("click", selector="a"),
435-
"pdf": PageCoroutine("pdf", path="/tmp/file.pdf"),
454+
playwright_page_methods={
455+
"click": PageMethod("click", selector="a"),
456+
"pdf": PageMethod("pdf", path="/tmp/file.pdf"),
436457
},
437458
),
438459
)
439460

440461
def parse(self, response):
441-
pdf_bytes = response.meta["playwright_page_coroutines"]["pdf"].result
462+
pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result
442463
with open("iana.pdf", "wb") as fp:
443464
fp.write(pdf_bytes)
444465
yield {"url": response.url} # response.url is "https://www.iana.org/domains/reserved"
@@ -456,10 +477,10 @@ class ScrollSpider(scrapy.Spider):
456477
meta=dict(
457478
playwright=True,
458479
playwright_include_page=True,
459-
playwright_page_coroutines=[
460-
PageCoroutine("wait_for_selector", "div.quote"),
461-
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
462-
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
480+
playwright_page_methods=[
481+
PageMethod("wait_for_selector", "div.quote"),
482+
PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
483+
PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
463484
],
464485
),
465486
)
@@ -487,7 +508,14 @@ For more examples, please see the scripts in the [examples](examples) directory.
487508
Refer to the [Proxy support](#proxy-support) section for more information.
488509

489510

490-
##  Deprecations
511+
##  Deprecation policy
512+
513+
Deprecated features will be supported for at least six months
514+
following the release that deprecated them. After that, they
515+
may be removed at any time. See the [changelog](changelog.md)
516+
for more information about deprecations and removals.
517+
518+
### Currently deprecated features
491519

492520
* `PLAYWRIGHT_CONTEXT_ARGS` setting (type `dict`, default `{}`)
493521

@@ -497,3 +525,15 @@ For more examples, please see the scripts in the [examples](examples) directory.
497525
Deprecated since
498526
[`v0.0.4`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.4),
499527
use the `PLAYWRIGHT_CONTEXTS` setting instead
528+
529+
* `scrapy_playwright.page.PageCoroutine` class
530+
531+
Deprecated since
532+
[`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14),
533+
use `scrapy_playwright.page.PageMethod` instead
534+
535+
* `playwright_page_coroutines` Request meta key
536+
537+
Deprecated since
538+
[`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14),
539+
use `playwright_page_methods` instead

changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# scrapy-playwright changelog
22

3+
### [v0.0.14](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14) (2022-03-26)
4+
5+
* Renamed `PageCoroutine` to `PageMethod` (`PageCoroutine` is now deprecated)
6+
7+
38
### [v0.0.13](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13) (2022-03-24)
49

510
* PageCoroutine checks

examples/cookies.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from scrapy import Spider, Request
44
from scrapy.crawler import CrawlerProcess
5-
from scrapy_playwright.page import PageCoroutine
5+
from scrapy_playwright.page import PageMethod
66

77

88
class CookieSpider(Spider):
@@ -18,8 +18,8 @@ def start_requests(self):
1818
cookies={"foo": "bar"},
1919
meta={
2020
"playwright": True,
21-
"playwright_page_coroutines": [
22-
PageCoroutine(
21+
"playwright_page_methods": [
22+
PageMethod(
2323
"screenshot", path=Path(__file__).parent / "cookies.png", full_page=True
2424
),
2525
],

examples/events.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from playwright.async_api import Dialog, Response as PlaywrightResponse
22
from scrapy import Spider, Request
33
from scrapy.crawler import CrawlerProcess
4-
from scrapy_playwright.page import PageCoroutine
4+
from scrapy_playwright.page import PageMethod
55

66

77
class EventsSpider(Spider):
@@ -16,8 +16,8 @@ def start_requests(self):
1616
url="https://example.org",
1717
meta={
1818
"playwright": True,
19-
"playwright_page_coroutines": [
20-
PageCoroutine("evaluate", "alert('foobar');"),
19+
"playwright_page_methods": [
20+
PageMethod("evaluate", "alert('foobar');"),
2121
],
2222
"playwright_page_event_handlers": {
2323
"dialog": self.handle_dialog,

examples/exception.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from scrapy import Spider, Request
55
from scrapy.crawler import CrawlerProcess
6-
from scrapy_playwright.page import PageCoroutine
6+
from scrapy_playwright.page import PageMethod
77

88

99
class HandleTimeoutMiddleware:
@@ -13,8 +13,8 @@ def process_exception(self, request, exception, spider):
1313
url="https://httpbin.org/get",
1414
meta={
1515
"playwright": True,
16-
"playwright_page_coroutines": [
17-
PageCoroutine(
16+
"playwright_page_methods": [
17+
PageMethod(
1818
"screenshot", path=Path(__file__).parent / "recovered.png", full_page=True
1919
),
2020
],

examples/post.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from scrapy import Spider, FormRequest
44
from scrapy.crawler import CrawlerProcess
5-
from scrapy_playwright.page import PageCoroutine
5+
from scrapy_playwright.page import PageMethod
66

77

88
class PostSpider(Spider):
@@ -18,8 +18,8 @@ def start_requests(self):
1818
formdata={"foo": "bar"},
1919
meta={
2020
"playwright": True,
21-
"playwright_page_coroutines": [
22-
PageCoroutine(
21+
"playwright_page_methods": [
22+
PageMethod(
2323
"screenshot", path=Path(__file__).parent / "post.png", full_page=True
2424
),
2525
],

examples/scroll.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from scrapy import Spider, Request
44
from scrapy.crawler import CrawlerProcess
5-
from scrapy_playwright.page import PageCoroutine
5+
from scrapy_playwright.page import PageMethod
66

77

88
class ScrollSpider(Spider):
@@ -18,11 +18,11 @@ def start_requests(self):
1818
cookies={"foo": "bar", "asdf": "qwerty"},
1919
meta={
2020
"playwright": True,
21-
"playwright_page_coroutines": [
22-
PageCoroutine("wait_for_selector", "div.quote"),
23-
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
24-
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
25-
PageCoroutine(
21+
"playwright_page_methods": [
22+
PageMethod("wait_for_selector", "div.quote"),
23+
PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
24+
PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
25+
PageMethod(
2626
"screenshot", path=Path(__file__).parent / "scroll.png", full_page=True
2727
),
2828
],
@@ -41,6 +41,7 @@ def parse(self, response):
4141
# "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
4242
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
4343
},
44+
"LOG_LEVEL": "INFO",
4445
}
4546
)
4647
process.crawl(ScrollSpider)

examples/storage.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from scrapy import Spider, Request
22
from scrapy.crawler import CrawlerProcess
3-
from scrapy_playwright.page import PageCoroutine
3+
from scrapy_playwright.page import PageMethod
44

55

66
class StorageSpider(Spider):
@@ -16,8 +16,8 @@ def start_requests(self):
1616
meta={
1717
"playwright": True,
1818
"playwright_include_page": True,
19-
"playwright_page_coroutines": [
20-
PageCoroutine("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
19+
"playwright_page_methods": [
20+
PageMethod("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
2121
],
2222
},
2323
)

scrapy_playwright/handler.py

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from scrapy import Spider, signals
2020
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
2121
from scrapy.crawler import Crawler
22+
from scrapy.exceptions import ScrapyDeprecationWarning
2223
from scrapy.http import Request, Response
2324
from scrapy.http.headers import Headers
2425
from scrapy.responsetypes import responsetypes
@@ -30,7 +31,7 @@
3031
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
3132

3233
from scrapy_playwright.headers import use_scrapy_headers
33-
from scrapy_playwright.page import PageCoroutine
34+
from scrapy_playwright.page import PageMethod
3435

3536

3637
__all__ = ["ScrapyPlaywrightDownloadHandler"]
@@ -96,7 +97,7 @@ def __init__(self, crawler: Crawler) -> None:
9697
"The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
9798
" PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
9899
" PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context",
99-
category=DeprecationWarning,
100+
category=ScrapyDeprecationWarning,
100101
stacklevel=2,
101102
)
102103
self.context_kwargs: defaultdict = defaultdict(dict)
@@ -250,23 +251,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
250251
start_time = time()
251252
response = await page.goto(request.url)
252253

253-
page_coroutines = request.meta.get("playwright_page_coroutines") or ()
254-
if isinstance(page_coroutines, dict):
255-
page_coroutines = page_coroutines.values()
256-
for pc in page_coroutines:
257-
if isinstance(pc, PageCoroutine):
258-
try:
259-
method = getattr(page, pc.method)
260-
except AttributeError:
261-
logger.warning(f"Ignoring {repr(pc)}: could not find coroutine")
262-
else:
263-
result = method(*pc.args, **pc.kwargs)
264-
pc.result = await result if isawaitable(result) else result
265-
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
266-
else:
267-
logger.warning(
268-
f"Ignoring {repr(pc)}: expected PageCoroutine, got {repr(type(pc))}"
269-
)
254+
await self._apply_page_methods(page, request)
270255

271256
body_str = await page.content()
272257
request.meta["download_latency"] = time() - start_time
@@ -300,6 +285,33 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
300285
ip_address=server_ip_address,
301286
)
302287

288+
async def _apply_page_methods(self, page: Page, request: Request) -> None:
289+
page_methods = request.meta.get("playwright_page_methods") or ()
290+
291+
if not page_methods and "playwright_page_coroutines" in request.meta:
292+
page_methods = request.meta["playwright_page_coroutines"]
293+
warnings.warn(
294+
"The 'playwright_page_coroutines' request meta key is deprecated,"
295+
" please use 'playwright_page_methods' instead.",
296+
category=ScrapyDeprecationWarning,
297+
stacklevel=1,
298+
)
299+
300+
if isinstance(page_methods, dict):
301+
page_methods = page_methods.values()
302+
for pm in page_methods:
303+
if isinstance(pm, PageMethod):
304+
try:
305+
method = getattr(page, pm.method)
306+
except AttributeError:
307+
logger.warning(f"Ignoring {repr(pm)}: could not find method")
308+
else:
309+
result = method(*pm.args, **pm.kwargs)
310+
pm.result = await result if isawaitable(result) else result
311+
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
312+
else:
313+
logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
314+
303315
def _increment_request_stats(self, request: PlaywrightRequest) -> None:
304316
stats_prefix = "playwright/request_count"
305317
self.stats.inc_value(stats_prefix)

0 commit comments

Comments
 (0)