diff --git a/docs/examples/yoyodyne.py b/docs/examples/yoyodyne.py
new file mode 100644
index 0000000..c3674f9
--- /dev/null
+++ b/docs/examples/yoyodyne.py
@@ -0,0 +1,13 @@
+import json
+from scrapeghost.scrapers import PaginatedSchemaScraper
+
+
+schema = {"first_name": "str", "last_name": "str", "position": "str", "url": "url"}
+url = "https://scrapple.fly.dev/staff"
+
+scraper = PaginatedSchemaScraper(schema)
+resp = scraper.scrape(url)
+
+# the response is a ScrapeResponse object just like any other
+# all the results are gathered in resp.data
+json.dump(resp.data, open("yoyodyne.json", "w"), indent=2)
diff --git a/docs/usage.md b/docs/usage.md
index 47d99da..8a345d4 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -138,4 +138,45 @@ If you want to validate that the returned data isn't just JSON, but data in the
 --8<-- "docs/examples/pydantic_example.log"
 ```
 
-This works by converting the `pydantic` model to a schema and registering a `PydanticPostprocessor` to validate the results automatically.
\ No newline at end of file
+This works by converting the `pydantic` model to a schema and registering a `PydanticPostprocessor` to validate the results automatically.
+
+## Pagination
+
+The `PaginatedSchemaScraper` class provides one technique for handling pagination.
+
+This class takes a schema that describes a single result and wraps it in a schema that describes a list of results along with a link to the next page.
+
+For example:
+
+```python
+{"first_name": "str", "last_name": "str"}
+```
+
+Automatically becomes:
+
+```python
+{"next_page": "url", "results": [{"first_name": "str", "last_name": "str"}]}
+```
+
+The `PaginatedSchemaScraper` class then takes care of following the `next_page` link until there are no more pages.
+
+!!! note
+
+    Right now, given the library's stance on customizing requests ("just use your own HTTP library"), the `PaginatedSchemaScraper` class does not provide a means to customize the HTTP request used to retrieve the next page.
+
+    If you need a more complicated approach, it is recommended you implement your own pagination logic for now;
+    the `PaginatedSchemaScraper` source may be a good starting point.
+
+    If you have strong opinions here, please open an issue to discuss.
+
+The scraper then combines the `results` from every page and returns them to the user.
+
+Here's a functional example that scrapes several pages of employees:
+
+```python
+--8<-- "docs/examples/yoyodyne.py"
+```
+
+!!! warning
+
+    One caveat of the current approach: the `url` attribute on a `ScrapeResponse` from a `PaginatedSchemaScraper` is a semicolon-delimited list of all the URLs that were scraped to produce that result.
\ No newline at end of file
diff --git a/examples/yoyodyne.py b/examples/yoyodyne.py
deleted file mode 100644
index 354d508..0000000
--- a/examples/yoyodyne.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
-This is an example of an auto-paginating scraper using PaginatedSchemaScraper.
-
-The technique used is to modify the schema to have a
-"next_page" field, and then scrape in the usual manner.
-
-If "next_page" is populated, the scraper will continue.
-"""
-import json
-from scrapeghost import PaginatedSchemaScraper
-
-
-schema = {"first_name": "str", "last_name": "str", "position": "str", "url": "url"}
-url = "https://scrapple.fly.dev/staff"
-
-scraper = PaginatedSchemaScraper(schema, model="gpt-3.5-turbo")
-data = scraper.scrape_all(url)
-json.dump(data, open("yoyodyne.json", "w"), indent=2)
diff --git a/src/scrapeghost/__init__.py b/src/scrapeghost/__init__.py
index 238fd31..1f9a1b7 100644
--- a/src/scrapeghost/__init__.py
+++ b/src/scrapeghost/__init__.py
@@ -1,6 +1,7 @@
 # ruff: noqa
 from .scrapers import (
     SchemaScraper,
+    PaginatedSchemaScraper,
 )
 from .utils import cost_estimate
 from .preprocessors import CSS, XPath