diff --git a/CHANGELOG.md b/CHANGELOG.md index be73496a59..c12038b4c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,8 @@ ### Features -- Integrate proxies into PlaywrightCrawler. +- Support for proxy configuration in `PlaywrightCrawler`. +- Blocking detection in `PlaywrightCrawler`. - Expose `crawler.log` to public. ### Bug fixes diff --git a/src/crawlee/basic_crawler/types.py b/src/crawlee/basic_crawler/types.py index 0423eb96eb..92bd285d03 100644 --- a/src/crawlee/basic_crawler/types.py +++ b/src/crawlee/basic_crawler/types.py @@ -91,7 +91,14 @@ def __call__( # noqa: D102 class EnqueueLinksFunction(Protocol): - """Type of a function for enqueueing links based on a selector.""" + """Type of a function for enqueueing links based on a selector. + + Args: + selector: CSS selector used to find the elements containing the links. + label: Label for the newly created `Request` objects, used for request routing. + user_data: User data to be provided to the newly created `Request` objects. + **kwargs: Additional arguments for the `add_requests` method. 
+ """ def __call__( # noqa: D102 self, diff --git a/src/crawlee/playwright_crawler/playwright_crawler.py b/src/crawlee/playwright_crawler/playwright_crawler.py index 466e77525a..d69a6bff51 100644 --- a/src/crawlee/playwright_crawler/playwright_crawler.py +++ b/src/crawlee/playwright_crawler/playwright_crawler.py @@ -5,7 +5,9 @@ from typing_extensions import Unpack +from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline +from crawlee.basic_crawler.errors import SessionError from crawlee.browsers import BrowserPool from crawlee.enqueue_strategy import EnqueueStrategy from crawlee.models import BaseRequestData @@ -18,7 +20,24 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]): - """A crawler that fetches the request URL using `Playwright`.""" + """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library. + + `PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests, + request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and + properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling + other pages. + + This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers + to download web pages and extract data. For websites that do not require JavaScript, consider using + `BeautifulSoupCrawler`, which uses raw HTTP requests, and it is much faster. + + `PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided + request handler function via the `Router`. Users can interact with the page and extract the data using + the Playwright API. + + Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is internally + managed by the `BrowserPool`. 
+ """ def __init__( self, @@ -50,19 +69,42 @@ def __init__( self._browser_pool = browser_pool - kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto) + # Compose the context pipeline with the Playwright-specific context enhancer. + kwargs['_context_pipeline'] = ( + ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request) + ) kwargs['_additional_context_managers'] = [self._browser_pool] - kwargs.setdefault('_logger', logging.getLogger(__name__)) super().__init__(**kwargs) - async def _page_goto(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + async def _make_http_request( + self, + context: BasicCrawlingContext, + ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + """Enhance the crawling context with making an HTTP request using Playwright. + + Args: + context: The basic crawling context to be enhanced. + + Raises: + ValueError: If the browser pool is not initialized. + SessionError: If the URL cannot be loaded by the browser. + + Yields: + An enhanced crawling context with Playwright-specific features. + """ if self._browser_pool is None: raise ValueError('Browser pool is not initialized.') + # Create a new browser page, navigate to the URL and get response. crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) - await crawlee_page.page.goto(context.request.url) + response = await crawlee_page.page.goto(context.request.url) + + if response is None: + raise SessionError(f'Failed to load the URL: {context.request.url}') + + # Set the loaded URL to the actual URL after redirection. 
context.request.loaded_url = crawlee_page.page.url async def enqueue_links( @@ -72,6 +114,7 @@ async def enqueue_links( user_data: dict | None = None, **kwargs: Unpack[AddRequestsKwargs], ) -> None: + """The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function.""" kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME) requests = list[BaseRequestData]() @@ -102,7 +145,43 @@ async def enqueue_links( proxy_info=context.proxy_info, log=context.log, page=crawlee_page.page, + response=response, enqueue_links=enqueue_links, ) await crawlee_page.page.close() + + async def _handle_blocked_request( + self, + crawling_context: PlaywrightCrawlingContext, + ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + """Enhance the crawling context with handling of blocked requests. + + Args: + crawling_context: The crawling context to be checked for blocking. + + Raises: + SessionError: If the session is blocked based on the HTTP status code or the response content. + + Yields: + The original crawling context if the session is not blocked. + """ + if self._retry_on_blocked: + status_code = crawling_context.response.status + + # Check if the session is blocked based on the HTTP status code. 
+            if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
+                raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')
+
+            matched_selectors = [
+                selector for selector in RETRY_CSS_SELECTORS if (await crawling_context.page.query_selector(selector))
+            ]
+
+            # Check if the session is blocked based on the response content
+            if matched_selectors:
+                raise SessionError(
+                    'Assuming the session is blocked - '
+                    f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
+                )
+
+        yield crawling_context
diff --git a/src/crawlee/playwright_crawler/types.py b/src/crawlee/playwright_crawler/types.py
index 59398790f1..1527e87b15 100644
--- a/src/crawlee/playwright_crawler/types.py
+++ b/src/crawlee/playwright_crawler/types.py
@@ -6,12 +6,19 @@
 from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction
 
 if TYPE_CHECKING:
-    from playwright.async_api import Page
+    from playwright.async_api import Page, Response
 
 
 @dataclass(frozen=True)
 class PlaywrightCrawlingContext(BasicCrawlingContext):
-    """Crawling context used by PlaywrightSoupCrawler."""
+    """Crawling context used by PlaywrightCrawler.
+
+    Args:
+        page: The Playwright `Page` object.
+        response: The Playwright `Response` object.
+        enqueue_links: The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function.
+    """
 
     page: Page
+    response: Response
     enqueue_links: EnqueueLinksFunction