Refactor to use crawlee (#23)
Remove custom functions and use generic functions from Crawlee instead.

Use pydantic for Actor input data.
Remove inputs: soupFromEncoding, soupExcludeEncodings.
Update documentation.
Pijukatel authored Nov 12, 2024
1 parent 5d2efb5 commit 2fc779e
Showing 9 changed files with 1,783 additions and 855 deletions.
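At a high level, the refactor replaces the hand-rolled request-queue loop with Crawlee's `BeautifulSoupCrawler` and its router. A minimal standalone sketch of that pattern (outside the Actor wrapper; the start URL and CSS selector below are placeholders, not taken from the actor) looks roughly like this:

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Crawlee manages the request queue, retries and crawl depth internally.
    crawler = BeautifulSoupCrawler(max_crawl_depth=1)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Scraping {context.request.url} ...')
        await context.push_data({
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        })
        # Follow links matched by the (placeholder) CSS selector.
        await context.enqueue_links(selector='a[href]')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```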
14 changes: 1 addition & 13 deletions .actor/input_schema.json
@@ -47,7 +47,7 @@
"type": "string",
"description": "A Python function, that is executed for every page. Use it to scrape data from the page, perform actions or add new URLs to the request queue. The page function has its own naming scope and you can import any installed modules. Typically you would want to obtain the data from the <code>context.soup</code> object and return them. Identifier <code>page_function</code> can't be changed. For more information about the <code>context</code> object you get into the <code>page_function</code> check the <a href='https://github.com/apify/actor-beautifulsoup-scraper#context' target='_blank' rel='noopener'>github.com/apify/actor-beautifulsoup-scraper#context</a>. Asynchronous functions are supported.",
"editor": "python",
"prefill": "from typing import Any\n\n# See the context section in readme to find out what fields you can access \n# https://apify.com/apify/beautifulsoup-scraper#context \ndef page_function(context: Context) -> Any:\n url = context.request['url']\n title = context.soup.title.string if context.soup.title else None\n return {'url': url, 'title': title}\n"
"prefill": "from typing import Any\nfrom crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext\n\n# See the context section in readme to find out what fields you can access \n# https://apify.com/apify/beautifulsoup-scraper#context \ndef page_function(context: BeautifulSoupCrawlingContext) -> Any:\n url = context.request.url\n title = context.soup.title.string if context.soup.title else None\n return {'url': url, 'title': title}\n"
},
"soupFeatures": {
"sectionCaption": "Advanced BeautifulSoup configuration",
@@ -57,18 +57,6 @@
"editor": "textfield",
"prefill": "html.parser"
},
"soupFromEncoding": {
"title": "BeautifulSoup from_encoding",
"type": "string",
"description": "The value of BeautifulSoup <code>from_encoding</code> argument. From BeautifulSoup docs: A string indicating the encoding of the document to be parsed. Pass this in if Beautiful Soup is guessing wrongly about the document's encoding.",
"editor": "textfield"
},
"soupExcludeEncodings": {
"title": "BeautifulSoup exclude_encodings",
"type": "array",
"description": "The value of BeautifulSoup <code>exclude_encodings</code> argument. From BeautifulSoup docs: A list of strings indicating encodings known to be wrong. Pass this in if you don't know the document's encoding but you know Beautiful Soup's guess is wrong.",
"editor": "stringList"
},
"proxyConfiguration": {
"sectionCaption": "Proxy and HTTP configuration",
"title": "Proxy configuration",
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Change Log

## 0.2 (2024-11-12)

- Updated to use Crawlee.
- Removed inputs: soupFromEncoding, soupExcludeEncodings


## 0.1 (2023-07-19)

- Initial release of BeautifulSoup Scraper.
10 changes: 4 additions & 6 deletions README.md
@@ -46,20 +46,18 @@ Example:

```python
from typing import Any
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext

def page_function(context: Context) -> Any:
def page_function(context: BeautifulSoupCrawlingContext) -> Any:
    url = context.request.url
    title = context.soup.title.string if context.soup.title else None
    return {"url": url, "title": title}
```

### Context

The code runs in Python 3.12 and the `page_function` accepts a single argument `context` of type [Context](https://github.com/apify/actor-beautifulsoup-scraper/blob/master/src/dataclasses.py). It is a dataclass with the following fields:
- `soup` of type `BeautifulSoup` with the parsed HTTP payload,
- `request` of type `dict` with the HTTP request data,
- `request_queue` of type `apify.storages.RequestQueue` ([RequestQueue](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) for the interaction with the HTTP request queue,
- `response` of type `httpx.Response` with the HTTP response data.
The code runs in Python 3.12 and the `page_function` accepts a single argument `context` of type [BeautifulSoupCrawlingContext](https://crawlee.dev/python/api/class/BeautifulSoupCrawlingContext). See the linked documentation for details about its available fields and helpers.
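For example, a slightly richer `page_function` that collects every link on a page needs nothing beyond the `soup` and `request` fields used above (a sketch, not part of the scraper's defaults):

```python
from typing import Any

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext


def page_function(context: BeautifulSoupCrawlingContext) -> Any:
    # Collect the text and target of every link found in the parsed page.
    links = [
        {'text': a.get_text(strip=True), 'href': a.get('href')}
        for a in context.soup.select('a[href]')
    ]
    return {'url': context.request.url, 'link_count': len(links), 'links': links}
```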


## Proxy configuration

2,251 changes: 1,652 additions & 599 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
@@ -9,12 +9,13 @@ package-mode = false

[tool.poetry.dependencies]
python = "^3.12"
apify = "^1.7.0"
apify = "^2.0.0"
beautifulsoup4 = "^4.12.3"
html5lib = "^1.1"
httpx = "^0.27.0"
lxml = "^5.2.1"
types-beautifulsoup4 = "^4.12.0.20240229"
crawlee = {version = "^0.4.1", extras = ["beautifulsoup4"]}

[tool.poetry.group.dev.dependencies]
ipython = "^8.23.0"
@@ -83,6 +84,8 @@ indent-style = "space"
"T20", # flake8-print
"TRY301", # Abstract `raise` to an inner function
]
[tool.ruff.lint.flake8-type-checking]
runtime-evaluated-base-classes = ["pydantic.BaseModel"]

[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
68 changes: 0 additions & 68 deletions src/dataclasses.py

This file was deleted.

93 changes: 93 additions & 0 deletions src/input_handling.py
@@ -0,0 +1,93 @@
from __future__ import annotations

import re
from datetime import timedelta
from re import Pattern
from typing import Callable, Sequence, cast

from crawlee import Glob
from crawlee.beautifulsoup_crawler import BeautifulSoupParser
from pydantic import BaseModel, ConfigDict, Field

from apify import Actor, ProxyConfiguration
from src.utils import USER_DEFINED_FUNCTION_NAME


class ActorInputData(BaseModel):
    """Processed and cleaned inputs for the actor."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    start_urls: Sequence[str]
    link_selector: str = ''
    link_patterns: list[Pattern | Glob] = []
    max_depth: int = Field(0, ge=0)
    request_timeout: timedelta = Field(timedelta(seconds=10), gt=timedelta(seconds=0))
    proxy_configuration: ProxyConfiguration
    soup_features: BeautifulSoupParser
    user_function: Callable

    @classmethod
    async def from_input(cls) -> ActorInputData:
        """Instantiate the class from Actor input."""
        actor_input = await Actor.get_input() or {}

        if not (start_urls := actor_input.get('startUrls', [])):
            Actor.log.error('No start URLs specified in actor input, exiting...')
            await Actor.exit(exit_code=1)

        if not (page_function := actor_input.get('pageFunction', '')):
            Actor.log.error('No page function specified in actor input, exiting...')
            await Actor.exit(exit_code=1)

        if (
            proxy_configuration := await Actor.create_proxy_configuration(
                actor_proxy_input=actor_input.get('proxyConfiguration')
            )
        ) is not None:
            aid = cls(
                start_urls=[start_url['url'] for start_url in start_urls],
                link_selector=actor_input.get('linkSelector', ''),
                link_patterns=[
                    re.compile(pattern) for pattern in actor_input.get('linkPatterns', ['.*'])
                ],  # default matches everything
                max_depth=actor_input.get('maxCrawlingDepth', 1),
                request_timeout=timedelta(seconds=actor_input.get('requestTimeout', 10)),
                proxy_configuration=proxy_configuration,
                soup_features=actor_input.get('soupFeatures', 'html.parser'),
                user_function=await extract_user_function(page_function),
            )
        else:
            Actor.log.error('Creation of proxy configuration failed, exiting...')
            await Actor.exit(exit_code=1)

        Actor.log.debug(f'actor_input = {aid}')

        return aid


async def extract_user_function(page_function: str) -> Callable:
    """Extract the user-defined function using exec and return it as a Callable.

    This function uses `exec` internally to execute the `user_function` code in a separate scope. The `user_function`
    should be a valid Python code snippet defining a function named `USER_DEFINED_FUNCTION_NAME`.

    Args:
        page_function: The string representation of the user-defined function.

    Returns:
        The extracted user-defined function.

    Raises:
        KeyError: If the function name `USER_DEFINED_FUNCTION_NAME` cannot be found.
    """
    scope: dict = {}
    exec(page_function, scope)

    try:
        user_defined_function = scope[USER_DEFINED_FUNCTION_NAME]
    except KeyError:
        Actor.log.error(f'Function name "{USER_DEFINED_FUNCTION_NAME}" could not be found, exiting...')
        await Actor.exit(exit_code=1)

    return cast(Callable, user_defined_function)
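For context, the exec-based extraction above can be illustrated with a tiny standalone snippet (the constant and the page-function string below are illustrative stand-ins, not taken from the actor's sources):

```python
from typing import Callable, cast

USER_DEFINED_FUNCTION_NAME = 'page_function'  # stand-in for the constant imported from src.utils

page_function_code = """
def page_function(context):
    return {'url': context.request.url}
"""

# Execute the user-supplied code in an isolated scope and pull the function out of it.
scope: dict = {}
exec(page_function_code, scope)
user_function = cast(Callable, scope[USER_DEFINED_FUNCTION_NAME])

print(user_function)  # <function page_function at 0x...>
```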
71 changes: 20 additions & 51 deletions src/main.py
@@ -1,61 +1,30 @@
from bs4 import BeautifulSoup
from httpx import AsyncClient
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

from .dataclasses import ActorInputData, Context
from .utils import execute_user_function, extract_user_function, get_proxies_from_conf, update_request_queue
from .input_handling import ActorInputData
from .utils import execute_user_function


async def main() -> None:
"""Actor main function."""
async with Actor:
aid = await ActorInputData.from_input()

# Enqueue the starting URLs in the default request queue
request_queue = await Actor.open_request_queue()
for start_url in aid.start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
await request_queue.add_request(request={'url': url, 'userData': {'depth': 0}})

user_defined_function = await extract_user_function(aid.page_function)
proxies = await get_proxies_from_conf(aid.proxy_configuration)

# Process the requests in the queue one by one
while request := await request_queue.fetch_next_request():
url = request['url']
Actor.log.info(f'Scraping {url} ...')

try:
# The usage of the same HTTPX client for the whole request queue was discussed here
# https://github.com/apify/actor-beautifulsoup-scraper/pull/1#pullrequestreview-1518377074
async with AsyncClient(proxies=proxies) as client:
response = await client.get(url, timeout=aid.request_timeout)

soup = BeautifulSoup(
response.content,
features=aid.soup_features,
from_encoding=aid.soup_from_encoding,
exclude_encodings=aid.soup_exclude_encodings,
)

if aid.link_selector:
await update_request_queue(
soup,
request_queue,
request,
aid.max_depth,
aid.link_selector,
aid.link_patterns,
)

context = Context(soup, request, request_queue, response)
await execute_user_function(context, user_defined_function)

except BaseException:
Actor.log.exception(f'Cannot extract data from {url} .')

finally:
# Mark the request as handled so it's not processed again
await request_queue.mark_request_as_handled(request)
crawler = BeautifulSoupCrawler(
parser=aid.soup_features,
max_crawl_depth=aid.max_depth,
proxy_configuration=aid.proxy_configuration,
request_handler_timeout=aid.request_timeout,
)

@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# Process the request.
Actor.log.info(f'Scraping {context.request.url} ...')
await execute_user_function(context, aid.user_function)

if aid.link_selector:
await context.enqueue_links(selector=aid.link_selector, include=aid.link_patterns)

await crawler.run(aid.start_urls)
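As a side note on the `include` filter passed to `enqueue_links` above: it accepts the same `Pattern | Glob` values that `ActorInputData.link_patterns` holds, so both compiled regular expressions and Crawlee `Glob` patterns work. A sketch with placeholder patterns:

```python
import re

from crawlee import Glob
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

crawler = BeautifulSoupCrawler(max_crawl_depth=2)


@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Only URLs matching one of the patterns are enqueued (placeholder patterns).
    await context.enqueue_links(
        selector='a[href]',
        include=[re.compile(r'.*/docs/.*'), Glob('https://example.com/blog/**')],
    )
    # await crawler.run([...]) would start the crawl from a list of start URLs.
```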