# feat: blocking detection for playwright crawler (#328)
### Description

- Add blocking detection for `PlaywrightCrawler` (see the usage sketch below).
- Improve documentation of the `PlaywrightCrawler` and related
components.
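
For context, a minimal usage sketch of the new behavior. It assumes the public API at this commit (`PlaywrightCrawler`, the `router.default_handler` decorator, and the `retry_on_blocked` option inherited from `BasicCrawler`); treat it as illustrative rather than canonical:

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # retry_on_blocked enables the blocking detection added in this commit;
    # a detected block raises a SessionError, which retires the session
    # and retries the request with a fresh one.
    crawler = PlaywrightCrawler(retry_on_blocked=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```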

### Issues

- Closes: #239

### Testing

- Only manual, until we have #197 ready.

### Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
vdusek authored Jul 19, 2024
1 parent c630818 commit 49ff6e2
Showing 4 changed files with 103 additions and 9 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md

```diff
@@ -4,7 +4,8 @@

 ### Features

-- Integrate proxies into PlaywrightCrawler.
+- Support for proxy configuration in `PlaywrightCrawler`.
+- Blocking detection in `PlaywrightCrawler`.
 - Expose `crawler.log` to public.

 ### Bug fixes
```
9 changes: 8 additions & 1 deletion src/crawlee/basic_crawler/types.py

```diff
@@ -91,7 +91,14 @@ def __call__(  # noqa: D102


 class EnqueueLinksFunction(Protocol):
-    """Type of a function for enqueueing links based on a selector."""
+    """Type of a function for enqueueing links based on a selector.
+
+    Args:
+        selector: CSS selector used to find the elements containing the links.
+        label: Label for the newly created `Request` objects, used for request routing.
+        user_data: User data to be provided to the newly created `Request` objects.
+        **kwargs: Additional arguments for the `add_requests` method.
+    """

     def __call__(  # noqa: D102
         self,
```
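
Since `EnqueueLinksFunction` is a `Protocol`, the docstring above is the primary reference for callers. A hedged sketch of how a request handler typically invokes it (the selector, label, and user data values are illustrative):

```python
from crawlee.playwright_crawler import PlaywrightCrawlingContext


async def handler(context: PlaywrightCrawlingContext) -> None:
    # Enqueue all links matched by the CSS selector, route the resulting
    # requests to a 'DETAIL' handler, and attach arbitrary user data.
    await context.enqueue_links(
        selector='a.product-link',
        label='DETAIL',
        user_data={'discovered_on': context.request.url},
    )
```
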
89 changes: 84 additions & 5 deletions src/crawlee/playwright_crawler/playwright_crawler.py

```diff
@@ -5,7 +5,9 @@

 from typing_extensions import Unpack

+from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
+from crawlee.basic_crawler.errors import SessionError
 from crawlee.browsers import BrowserPool
 from crawlee.enqueue_strategy import EnqueueStrategy
 from crawlee.models import BaseRequestData
@@ -18,7 +20,24 @@


 class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
-    """A crawler that fetches the request URL using `Playwright`."""
+    """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
+
+    `PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of
+    requests, request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific
+    methods and properties, like the `page` property for user data extraction, and the `enqueue_links` method
+    for crawling other pages.
+
+    This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers
+    to download web pages and extract data. For websites that do not require JavaScript, consider using
+    `BeautifulSoupCrawler`, which uses raw HTTP requests and is much faster.
+
+    `PlaywrightCrawler` opens a new browser page (i.e., a tab) for each `Request` object and invokes the
+    user-provided request handler function via the `Router`. Users can interact with the page and extract
+    data using the Playwright API.
+
+    Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is
+    internally managed by the `BrowserPool`.
+    """

     def __init__(
         self,
@@ -50,19 +69,42 @@ def __init__(

         self._browser_pool = browser_pool

-        kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto)
+        # Compose the context pipeline with the Playwright-specific context enhancers.
+        kwargs['_context_pipeline'] = (
+            ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
+        )
         kwargs['_additional_context_managers'] = [self._browser_pool]

         kwargs.setdefault('_logger', logging.getLogger(__name__))

         super().__init__(**kwargs)

-    async def _page_goto(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+    async def _make_http_request(
+        self,
+        context: BasicCrawlingContext,
+    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+        """Enhance the crawling context by making an HTTP request using Playwright.
+
+        Args:
+            context: The basic crawling context to be enhanced.
+
+        Raises:
+            ValueError: If the browser pool is not initialized.
+            SessionError: If the URL cannot be loaded by the browser.
+
+        Yields:
+            An enhanced crawling context with Playwright-specific features.
+        """
         if self._browser_pool is None:
             raise ValueError('Browser pool is not initialized.')

+        # Create a new browser page, navigate to the URL and get the response.
         crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
-        await crawlee_page.page.goto(context.request.url)
+        response = await crawlee_page.page.goto(context.request.url)
+
+        if response is None:
+            raise SessionError(f'Failed to load the URL: {context.request.url}')
+
+        # Set the loaded URL to the actual URL after redirection.
         context.request.loaded_url = crawlee_page.page.url

     async def enqueue_links(
@@ -72,6 +114,7 @@ async def enqueue_links(
         user_data: dict | None = None,
         **kwargs: Unpack[AddRequestsKwargs],
     ) -> None:
+        """The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` protocol."""
         kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

         requests = list[BaseRequestData]()
@@ -102,7 +145,43 @@ async def enqueue_links(
             proxy_info=context.proxy_info,
             log=context.log,
             page=crawlee_page.page,
+            response=response,
             enqueue_links=enqueue_links,
         )

         await crawlee_page.page.close()
+
+    async def _handle_blocked_request(
+        self,
+        crawling_context: PlaywrightCrawlingContext,
+    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+        """Enhance the crawling context with handling of blocked requests.
+
+        Args:
+            crawling_context: The crawling context to be checked for blocking.
+
+        Raises:
+            SessionError: If the session is blocked based on the HTTP status code or the response content.
+
+        Yields:
+            The original crawling context if the session is not blocked.
+        """
+        if self._retry_on_blocked:
+            status_code = crawling_context.response.status
+
+            # Check if the session is blocked based on the HTTP status code.
+            if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
+                raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')
+
+            matched_selectors = [
+                selector for selector in RETRY_CSS_SELECTORS if (await crawling_context.page.query_selector(selector))
+            ]
+
+            # Check if the session is blocked based on the response content.
+            if matched_selectors:
+                raise SessionError(
+                    'Assuming the session is blocked - '
+                    f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
+                )
+
+        yield crawling_context
```
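
An aside on the structure above: each step composed into the `ContextPipeline` is an async generator that yields the (possibly enriched) context exactly once. A hedged sketch of that shape, using a hypothetical `_my_enhancer` step that is not part of this commit:

```python
from collections.abc import AsyncGenerator

from crawlee.basic_crawler.types import BasicCrawlingContext


async def _my_enhancer(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:
    # Code before the yield runs on the way into the pipeline (setup),
    # e.g. opening a page or checking preconditions; raising here aborts
    # the request and can trigger a retry with a fresh session.
    yield context
    # Code after the yield runs on the way out (teardown/cleanup).
```
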
11 changes: 9 additions & 2 deletions src/crawlee/playwright_crawler/types.py

```diff
@@ -6,12 +6,19 @@
 from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction

 if TYPE_CHECKING:
-    from playwright.async_api import Page
+    from playwright.async_api import Page, Response


 @dataclass(frozen=True)
 class PlaywrightCrawlingContext(BasicCrawlingContext):
-    """Crawling context used by PlaywrightSoupCrawler."""
+    """Crawling context used by `PlaywrightCrawler`.
+
+    Args:
+        page: The Playwright `Page` object.
+        response: The Playwright `Response` object.
+        enqueue_links: The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` protocol.
+    """

     page: Page
+    response: Response
     enqueue_links: EnqueueLinksFunction
```
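
A hedged sketch of a request handler consuming these new context fields (the handler body and log messages are illustrative):

```python
from crawlee.playwright_crawler import PlaywrightCrawlingContext


async def handler(context: PlaywrightCrawlingContext) -> None:
    # The Playwright response and page objects are both available on the context.
    context.log.info(f'{context.request.url} returned HTTP {context.response.status}')
    title = await context.page.title()
    context.log.info(f'Page title: {title}')
```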
