Commit
Remove custom functions and use generic functions from Crawlee instead. Use pydantic for Actor input data. Remove inputs: soupFromEncoding, soupExcludeEncodings. Update documentation.
Showing 9 changed files with 1,783 additions and 855 deletions.
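As a quick illustration of what moving Actor input onto pydantic buys (a standalone sketch, not code from this commit; the constraints mirror the ActorInputData model in the diff further below), constrained fields reject malformed input up front instead of letting a bad value surface mid-crawl:

```python
from datetime import timedelta

from pydantic import BaseModel, Field, ValidationError


class ExampleInput(BaseModel):
    # Hypothetical model with the same constraints as the real ActorInputData below.
    max_depth: int = Field(0, ge=0)
    request_timeout: timedelta = Field(timedelta(seconds=10), gt=timedelta(seconds=0))


try:
    ExampleInput(max_depth=-1)  # violates ge=0
except ValidationError as exc:
    print(exc)  # pydantic names the offending field and constraint
```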
@@ -1,5 +1,11 @@
 # Change Log

+## 0.2 (2024-11-12)
+
+- Updated to use Crawlee.
+- Removed inputs: soupFromEncoding, soupExcludeEncodings
+
+
 ## 0.1 (2023-07-19)

 - Initial release of BeautifulSoup Scraper.
@@ -0,0 +1,93 @@
from __future__ import annotations

import re
from datetime import timedelta
from re import Pattern
from typing import Callable, Sequence, cast

from crawlee import Glob
from crawlee.beautifulsoup_crawler import BeautifulSoupParser
from pydantic import BaseModel, ConfigDict, Field

from apify import Actor, ProxyConfiguration
from src.utils import USER_DEFINED_FUNCTION_NAME


class ActorInputData(BaseModel):
    """Processed and cleaned inputs for the actor."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    start_urls: Sequence[str]
    link_selector: str = ''
    link_patterns: list[Pattern | Glob] = []
    max_depth: int = Field(0, ge=0)
    request_timeout: timedelta = Field(timedelta(seconds=10), gt=timedelta(seconds=0))
    proxy_configuration: ProxyConfiguration
    soup_features: BeautifulSoupParser
    user_function: Callable

    @classmethod
    async def from_input(cls) -> ActorInputData:
        """Instantiate the class from Actor input."""
        actor_input = await Actor.get_input() or {}

        if not (start_urls := actor_input.get('startUrls', [])):
            Actor.log.error('No start URLs specified in actor input, exiting...')
            await Actor.exit(exit_code=1)

        if not (page_function := actor_input.get('pageFunction', '')):
            Actor.log.error('No page function specified in actor input, exiting...')
            await Actor.exit(exit_code=1)

        if (
            proxy_configuration := await Actor.create_proxy_configuration(
                actor_proxy_input=actor_input.get('proxyConfiguration')
            )
        ) is not None:
            aid = cls(
                start_urls=[start_url['url'] for start_url in start_urls],
                link_selector=actor_input.get('linkSelector', ''),
                link_patterns=[
                    re.compile(pattern) for pattern in actor_input.get('linkPatterns', ['.*'])
                ],  # default matches everything
                max_depth=actor_input.get('maxCrawlingDepth', 1),
                request_timeout=timedelta(seconds=actor_input.get('requestTimeout', 10)),
                proxy_configuration=proxy_configuration,
                soup_features=actor_input.get('soupFeatures', 'html.parser'),
                user_function=await extract_user_function(page_function),
            )
        else:
            Actor.log.error('Creation of proxy configuration failed, exiting...')
            await Actor.exit(exit_code=1)

        Actor.log.debug(f'actor_input = {aid}')

        return aid


async def extract_user_function(page_function: str) -> Callable:
    """Extract the user-defined function using exec and returns it as a Callable.

    This function uses `exec` internally to execute the `user_function` code in a separate scope. The `user_function`
    should be a valid Python code snippet defining a function named `USER_DEFINED_FUNCTION_NAME`.

    Args:
        page_function: The string representation of the user-defined function.

    Returns:
        The extracted user-defined function.

    Raises:
        KeyError: If the function name `USER_DEFINED_FUNCTION_NAME` cannot be found.
    """
    scope: dict = {}
    exec(page_function, scope)

    try:
        user_defined_function = scope[USER_DEFINED_FUNCTION_NAME]
    except KeyError:
        Actor.log.error(f'Function name "{USER_DEFINED_FUNCTION_NAME}" could not be found, exiting...')
        await Actor.exit(exit_code=1)

    return cast(Callable, user_defined_function)
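For orientation, a hypothetical Actor input payload covering the keys that from_input() reads above. The values are illustrative only (the authoritative schema is the actor's input schema, which is not part of this diff), and a possible pageFunction body is sketched after the last diff below:

```python
# Hypothetical input payload; key names match what from_input() reads, values are made up.
example_actor_input = {
    'startUrls': [{'url': 'https://example.com'}],
    'pageFunction': 'async def page_function(context):\n    ...',
    'linkSelector': 'a[href]',
    'linkPatterns': ['.*'],              # compiled with re.compile above; '.*' matches everything
    'maxCrawlingDepth': 1,
    'requestTimeout': 10,                # seconds; wrapped in a timedelta above
    'proxyConfiguration': {'useApifyProxy': False},
    'soupFeatures': 'html.parser',
}
```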
@@ -1,61 +1,30 @@
-from bs4 import BeautifulSoup
-from httpx import AsyncClient
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

 from apify import Actor

-from .dataclasses import ActorInputData, Context
-from .utils import execute_user_function, extract_user_function, get_proxies_from_conf, update_request_queue
+from .input_handling import ActorInputData
+from .utils import execute_user_function


 async def main() -> None:
     """Actor main function."""
     async with Actor:
         aid = await ActorInputData.from_input()

-        # Enqueue the starting URLs in the default request queue
-        request_queue = await Actor.open_request_queue()
-        for start_url in aid.start_urls:
-            url = start_url.get('url')
-            Actor.log.info(f'Enqueuing {url} ...')
-            await request_queue.add_request(request={'url': url, 'userData': {'depth': 0}})
-
-        user_defined_function = await extract_user_function(aid.page_function)
-        proxies = await get_proxies_from_conf(aid.proxy_configuration)
-
-        # Process the requests in the queue one by one
-        while request := await request_queue.fetch_next_request():
-            url = request['url']
-            Actor.log.info(f'Scraping {url} ...')
-
-            try:
-                # The usage of the same HTTPX client for the whole request queue was discussed here
-                # https://github.com/apify/actor-beautifulsoup-scraper/pull/1#pullrequestreview-1518377074
-                async with AsyncClient(proxies=proxies) as client:
-                    response = await client.get(url, timeout=aid.request_timeout)
-
-                soup = BeautifulSoup(
-                    response.content,
-                    features=aid.soup_features,
-                    from_encoding=aid.soup_from_encoding,
-                    exclude_encodings=aid.soup_exclude_encodings,
-                )
-
-                if aid.link_selector:
-                    await update_request_queue(
-                        soup,
-                        request_queue,
-                        request,
-                        aid.max_depth,
-                        aid.link_selector,
-                        aid.link_patterns,
-                    )
-
-                context = Context(soup, request, request_queue, response)
-                await execute_user_function(context, user_defined_function)
-
-            except BaseException:
-                Actor.log.exception(f'Cannot extract data from {url} .')
-
-            finally:
-                # Mark the request as handled so it's not processed again
-                await request_queue.mark_request_as_handled(request)
+        crawler = BeautifulSoupCrawler(
+            parser=aid.soup_features,
+            max_crawl_depth=aid.max_depth,
+            proxy_configuration=aid.proxy_configuration,
+            request_handler_timeout=aid.request_timeout,
+        )
+
+        @crawler.router.default_handler
+        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+            # Process the request.
+            Actor.log.info(f'Scraping {context.request.url} ...')
+            await execute_user_function(context, aid.user_function)
+
+            if aid.link_selector:
+                await context.enqueue_links(selector=aid.link_selector, include=aid.link_patterns)
+
+        await crawler.run(aid.start_urls)
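To show how the new default handler hands control to user code, here is a sketch of what the pageFunction string might contain. It assumes the required function name is page_function (the actual requirement is whatever USER_DEFINED_FUNCTION_NAME in src.utils resolves to, which is outside this diff) and that execute_user_function awaits the function with the BeautifulSoupCrawlingContext passed to request_handler:

```python
# Hypothetical pageFunction value, supplied as a string in the Actor input and compiled
# by extract_user_function(). Function name and signature are assumptions, not taken
# from this commit.
async def page_function(context):
    soup = context.soup  # page parsed by BeautifulSoupCrawler
    await context.push_data({
        'url': context.request.url,
        'title': soup.title.string if soup.title else None,
    })
```

Whether results are pushed via context.push_data or returned for the actor to store depends on execute_user_function, which is not part of this diff.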