Commit 0a26905: big doc overhaul, tutorial update for 0.4
jamesturk committed Mar 24, 2023
1 parent 9cf4b83 commit 0a26905
Showing 27 changed files with 208 additions and 87 deletions.
2 changes: 1 addition & 1 deletion Justfile
@@ -9,10 +9,10 @@ docs:
poetry run mkdocs serve

examples:
poetry run sh docs/examples/cli.sh > docs/examples/cli.log
poetry run python docs/examples/tutorial/v2.py > docs/examples/tutorial/v2.log
poetry run python docs/examples/tutorial/v3.py > docs/examples/tutorial/v3.log
poetry run python docs/examples/tutorial/v5.py > docs/examples/tutorial/v5.log
poetry run sh docs/examples/tutorial/cli.sh > docs/examples/tutorial/cli.log

release type: # patch, minor, major
poetry version {{type}}
10 changes: 3 additions & 7 deletions docs/examples/tutorial/cli.log → docs/examples/cli.log
@@ -4,16 +4,12 @@
"photo_url": "https://www.ncleg.gov/Members/MemberImage/S/436/Low",
"offices": [
{
"type": "mailing",
"type": "Mailing",
"address": "16 West Jones Street, Rm. 1104, Raleigh, NC 27601"
},
{
"type": "phone",
"number": "(919) 715-3036"
},
{
"type": "email",
"address": "[email protected]"
"type": "Office Phone",
"phone": "(919) 715-3036"
}
]
}
File renamed without changes.
1 change: 1 addition & 0 deletions docs/examples/pydantic_example
@@ -0,0 +1 @@
docs/examples/pydantic_example.py
7 changes: 6 additions & 1 deletion docs/examples/pydantic_example.log
@@ -1 +1,6 @@
CrewMember(gender='Male', race='Dokarian', alignment='Evil\nProtagonist')
2023-03-24 17:19:35 [debug ] got HTML length=218107 url=https://spaceghost.fandom.com/wiki/Zorak
2023-03-24 17:19:35 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
2023-03-24 17:19:35 [debug ] preprocessor from_nodes=1 name=CSS(.infobox) nodes=1
2023-03-24 17:19:35 [info ] API request html_tokens=828 model=gpt-3.5-turbo
2023-03-24 17:19:38 [info ] API response completion_tokens=32 cost=0.0018960000000000001 duration=2.3367760181427 finish_reason=stop prompt_tokens=916
CrewMember(gender='Male', race='Dokarian', alignment='Evil\\nProtagonist')
2 changes: 2 additions & 0 deletions docs/examples/tutorial/episode_scraper_1.log
@@ -0,0 +1,2 @@
2023-03-24 17:19:55 [debug ] got HTML length=165069 url=https://comedybangbang.fandom.com/wiki/Operation_Golden_Orb
2023-03-24 17:19:55 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
@@ -1,5 +1,5 @@
from scrapeghost import SchemaScraper
from pprint import pprint # pretty print results
from pprint import pprint

url = "https://comedybangbang.fandom.com/wiki/Operation_Golden_Orb"
schema = {
@@ -10,4 +10,6 @@

episode_scraper = SchemaScraper(schema)

pprint(episode_scraper(url))
response = episode_scraper(url)
pprint(response.data)
print(f"Total Cost: ${response.total_cost:.3f}")
9 changes: 9 additions & 0 deletions docs/examples/tutorial/episode_scraper_2.log
@@ -0,0 +1,9 @@
2023-03-24 17:19:56 [debug ] got HTML length=165069 url=https://comedybangbang.fandom.com/wiki/Operation_Golden_Orb
2023-03-24 17:19:56 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
2023-03-24 17:19:56 [debug ] preprocessor from_nodes=1 name=CSS(div.page-content) nodes=1
2023-03-24 17:19:56 [info ] API request html_tokens=1332 model=gpt-3.5-turbo
2023-03-24 17:19:58 [info ] API response completion_tokens=33 cost=0.00291 duration=2.4050040245056152 finish_reason=stop prompt_tokens=1422
{'episode_number': 800,
'release_date': 'March 12, 2023',
'title': 'Operation Golden Orb'}
Total Cost: 0.00291
@@ -14,4 +14,6 @@
extra_preprocessors=[CSS("div.page-content")],
)

pprint(episode_scraper(url))
response = episode_scraper(url)
pprint(response.data)
print(f"Total Cost: ${response.total_cost:.3f}")
12 changes: 12 additions & 0 deletions docs/examples/tutorial/episode_scraper_3.log
@@ -0,0 +1,12 @@
2023-03-24 17:19:59 [debug ] got HTML length=165069 url=https://comedybangbang.fandom.com/wiki/Operation_Golden_Orb
2023-03-24 17:19:59 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
2023-03-24 17:19:59 [debug ] preprocessor from_nodes=1 name=CSS(div.page-content) nodes=1
2023-03-24 17:19:59 [info ] API request html_tokens=1332 model=gpt-3.5-turbo
2023-03-24 17:20:04 [info ] API response completion_tokens=83 cost=0.003036 duration=4.5867390632629395 finish_reason=stop prompt_tokens=1435
{'episode_number': 800,
'guests': [{'name': 'Jason Mantzoukas'},
{'name': 'Andy Daly'},
{'name': 'Paul F. Tompkins'}],
'release_date': '2023-03-12',
'title': 'Operation Golden Orb'}
Total Cost: 0.003036
@@ -11,7 +11,10 @@

episode_scraper = SchemaScraper(
schema,
# can pass preprocessor to constructor or at scrape time
extra_preprocessors=[CSS("div.page-content")],
)

pprint(episode_scraper(url))
response = episode_scraper(url)
pprint(response.data)
print(f"Total Cost: ${response.total_cost:.3f}")
12 changes: 12 additions & 0 deletions docs/examples/tutorial/episode_scraper_4.log
@@ -0,0 +1,12 @@
2023-03-24 17:34:33 [debug ] got HTML length=226164 url=https://www.earwolf.com/episode/operation-golden-orb/
2023-03-24 17:34:33 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
2023-03-24 17:34:33 [debug ] preprocessor from_nodes=1 name=CSS(.hero-episode) nodes=1
2023-03-24 17:34:33 [info ] API request html_tokens=2988 model=gpt-3.5-turbo
2023-03-24 17:34:38 [info ] API response completion_tokens=88 cost=0.006358000000000001 duration=5.304486036300659 finish_reason=stop prompt_tokens=3091
{'episode_number': 800,
'guests': [{'name': 'Jason Mantzoukas'},
{'name': 'Andy Daly'},
{'name': 'Paul F. Tompkins'}],
'release_date': '2023-03-12',
'title': 'EP. 800 — Operation Golden Orb'}
Total Cost: $0.006
19 changes: 19 additions & 0 deletions docs/examples/tutorial/episode_scraper_4.py
@@ -0,0 +1,19 @@
from scrapeghost import SchemaScraper, CSS
from pprint import pprint

url = "https://www.earwolf.com/episode/operation-golden-orb/"
schema = {
"title": "str",
"episode_number": "int",
"release_date": "YYYY-MM-DD",
"guests": [{"name": "str"}],
}

episode_scraper = SchemaScraper(
schema,
extra_preprocessors=[CSS(".hero-episode")],
)

response = episode_scraper(url)
pprint(response.data)
print(f"Total Cost: ${response.total_cost:.3f}")
12 changes: 12 additions & 0 deletions docs/examples/tutorial/episode_scraper_5.log
@@ -0,0 +1,12 @@
2023-03-24 17:43:10 [debug ] got HTML length=226164 url=https://www.earwolf.com/episode/operation-golden-orb/
2023-03-24 17:43:10 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
2023-03-24 17:43:10 [debug ] preprocessor from_nodes=1 name=CSS(.hero-episode) nodes=1
2023-03-24 17:43:11 [info ] API request html_tokens=2988 model=gpt-3.5-turbo
2023-03-24 17:43:15 [info ] API response completion_tokens=83 cost=0.006378 duration=4.382771968841553 finish_reason=stop prompt_tokens=3106
{'episode_number': 800,
'guests': [{'name': 'Jason Mantzoukas'},
{'name': 'Andy Daly'},
{'name': 'Paul F. Tompkins'}],
'release_date': '2023-03-12',
'title': 'Operation Golden Orb'}
Total Cost: $0.006
22 changes: 22 additions & 0 deletions docs/examples/tutorial/episode_scraper_5.py
@@ -0,0 +1,22 @@
from scrapeghost import SchemaScraper, CSS
from pprint import pprint

url = "https://www.earwolf.com/episode/operation-golden-orb/"
schema = {
"title": "str",
"episode_number": "int",
"release_date": "YYYY-MM-DD",
"guests": [{"name": "str"}],
}

episode_scraper = SchemaScraper(
schema,
extra_preprocessors=[CSS(".hero-episode")],
extra_instructions=[
"Do not include the episode number in the title.",
],
)

response = episode_scraper(url)
pprint(response.data)
print(f"Total Cost: ${response.total_cost:.3f}")
File renamed without changes.
12 changes: 12 additions & 0 deletions docs/examples/tutorial/list_scraper_v2.log
@@ -0,0 +1,12 @@
2023-03-24 17:55:33 [debug ] got HTML length=1424892 url=https://comedybangbang.fandom.com/wiki/Category:Episodes
2023-03-24 17:55:33 [debug ] preprocessor from_nodes=1 name=CleanHTML nodes=1
2023-03-24 17:55:33 [debug ] preprocessor from_nodes=1 name=CSS(.mw-parser-output a[class!='image link-internal']) nodes=857
2023-03-24 17:55:33 [debug ] chunked tags num=20 sizes=[1971, 1994, 1986, 1976, 1978, 1990, 1993, 1974, 1995, 1983, 1975, 1979, 1967, 1953, 1971, 1973, 1987, 1960, 1966, 682]
2023-03-24 17:55:33 [info ] API request html_tokens=1971 model=gpt-3.5-turbo
2023-03-24 17:57:12 [info ] API response completion_tokens=2053 cost=0.008194 duration=98.94199872016907 finish_reason=length prompt_tokens=2044
2023-03-24 17:57:12 [warning ] API request failed attempts=1 model=gpt-3.5-turbo
OpenAI did not stop: length (prompt_tokens=2044, completion_tokens=2053)
2023-03-24 17:57:17 [info ] API request html_tokens=1971 model=gpt-3.5-turbo
2023-03-24 17:58:59 [info ] API response completion_tokens=2053 cost=0.008194 duration=101.6875491142273 finish_reason=length prompt_tokens=2044
2023-03-24 17:58:59 [warning ] API request failed attempts=2 model=gpt-3.5-turbo
OpenAI did not stop: length (prompt_tokens=2044, completion_tokens=2053)
@@ -2,14 +2,15 @@

episode_list_scraper = SchemaScraper(
"url",
auto_split_length=2048,
auto_split_length=2000,
extra_preprocessors=[CSS(".mw-parser-output a[class!='image link-internal']")],
)
episode_urls = episode_list_scraper(
response = episode_list_scraper(
"https://comedybangbang.fandom.com/wiki/Category:Episodes"
)

episode_urls = response.data
print(episode_urls[:3])
print(episode_urls[-3:])
print("total:", len(episode_urls))
print("cost:", episode_list_scraper.total_cost)
print(f"Total Cost: ${response.total_cost:.3f}")
@@ -3,7 +3,7 @@

episode_list_scraper = SchemaScraper(
'{"url": "url"}',
auto_split_length=2048,
auto_split_length=2000,
# restrict this to GPT-3.5-Turbo to keep the cost down
models=["gpt-3.5-turbo"],
extra_preprocessors=CSS(".mw-parser-output a[class!='image link-internal']"),
@@ -20,23 +20,23 @@
extra_preprocessors=CSS("div.page-content"),
)

episode_urls = episode_list_scraper(
resp = episode_list_scraper(
"https://comedybangbang.fandom.com/wiki/Category:Episodes",
)
print(
f"Scraped {len(episode_urls)} episode URLs, cost {episode_list_scraper.total_cost}"
)
episode_urls = resp.data
print(f"Scraped {len(episode_urls)} episode URLs, cost {resp.total_cost}")

episode_data = []
for episode_url in episode_urls:
print(episode_url)
episode_data.append(
episode_scraper(
episode_url["url"],
)
).data
)

print(f"Scraped {len(episode_data)} episodes, cost {episode_scraper.total_cost}")
# scrapers have a stats() method that returns a dict of statistics across all calls
print(f"Scraped {len(episode_data)} episodes, ${episode_scraper.stats()['total_cost']}")

with open("episode_data.json", "w") as f:
json.dump(episode_data, f, indent=2)
4 changes: 0 additions & 4 deletions docs/examples/tutorial/v2.log

This file was deleted.

11 changes: 0 additions & 11 deletions docs/examples/tutorial/v3.log

This file was deleted.

8 changes: 0 additions & 8 deletions docs/examples/tutorial/v5.log

This file was deleted.

16 changes: 7 additions & 9 deletions docs/faq.md
@@ -4,17 +4,15 @@

## Is this practical? Or just a toy?

When I started the project, I think I assumed it was a toy. But I've been surprised by the results.
When I started the project I mostly assumed it was a toy. But I've been surprised by the results.

After my initial GPT-4 experiments, [Simon Willison asked](https://mastodon.social/@[email protected]/110042216119791967) how well it'd work on GPT-3.5-turbo. I hadn't realized the enormous price difference, and without switching to 3.5-turbo, I'd probably have decided it was too expensive to be practical.
After my initial GPT-4 experiments, [Simon Willison asked](https://mastodon.social/@[email protected]/110042216119791967) how well it'd work on GPT-3.5-turbo. I hadn't realized the significant price difference, and without switching to 3.5-turbo, I'd probably have decided it was too expensive to be practical.

Once I realized 3.5-turbo was an option, I was able to spend a lot more time tinkering with the prompt and token reduction. It also got me thinking more about what kind of tooling you'd want around something like this if you were going to actually use it.

It's certainly started to feel more practical now; I'm not sure I'd call it a toy anymore.

## Why would I use this instead of a traditional scraper?

It is definitely great for quick prototypes. With the CLI tool, you can try a scrape in a *single command*.
It is definitely great for quick prototypes. With the CLI tool, you can try a scrape in a *single command* without writing a line of code.
This means you don't need to sink a bunch of time into deciding if it's worth it or not.

Or, imagine a scraper that needs to run infrequently on a page that is likely to break in subtle ways between scrapes.
@@ -27,7 +25,7 @@ It is also quite good at dealing with unstructured text. A list of items in a se
* It is terrible at pages that are large lists (like a directory), they need to be broken into multiple chunks and the API calls can be quite expensive in terms of time and money.
* It is quite opaque. When it fails, it can be hard to tell why.
* If the page is dynamic, this approach won't work at all. It requires all of the content to be available in the HTML.
* It is *slow*. A single request can take up to a minute.
* It is *slow*. A single request can take over a minute if OpenAI is slow to respond.
* Right now, it only works with OpenAI, which means you'll be dependent on their pricing and availability. It also means you need to be comfortable sending your data to a third party.

@@ -42,7 +40,7 @@ Try the following:

1. Provide a CSS or XPath selector to limit the scope of the page.

2. Pre-process the HTML. Trim tags or entire sections you don't need.
2. Pre-process the HTML. Trim tags or entire sections you don't need. (You can use the preprocessing pipeline to help with this.)

3. Finally, you can use the `auto_split_length` parameter to split the page into smaller chunks. This only works for list-type pages, and requires a good choice of selector to split the page up.
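To make the chunking idea concrete, here is a minimal, self-contained sketch of greedy splitting under a token budget. The names `estimate_tokens` and `chunk_tags` are illustrative, not scrapeghost's internals; the ~3-characters-per-token estimate is the rough rule of thumb from the docs.

```python
def estimate_tokens(html: str) -> int:
    # Rough rule of thumb: ~3 characters per token of HTML.
    return max(1, len(html) // 3)


def chunk_tags(tags: list[str], auto_split_length: int = 2000) -> list[list[str]]:
    """Greedily pack per-tag HTML snippets into chunks under the token budget."""
    chunks: list[list[str]] = [[]]
    size = 0
    for tag in tags:
        cost = estimate_tokens(tag)
        # Start a new chunk when the next tag would push us over budget.
        if size + cost > auto_split_length and chunks[-1]:
            chunks.append([])
            size = 0
        chunks[-1].append(tag)
        size += cost
    return chunks


tags = ['<a href="/wiki/Ep{}">Episode {}</a>'.format(i, i) for i in range(500)]
chunks = chunk_tags(tags, auto_split_length=2000)
print(len(chunks), [sum(estimate_tokens(t) for t in c) for c in chunks])
```

Each chunk then becomes a separate API call, which is why a list page split into 20 chunks (as in the log above) can get expensive.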

@@ -62,11 +60,11 @@ It is possible, but in practice hasn't been observed as a major problem yet.

Because the [*temperature*](https://platform.openai.com/docs/api-reference/completions) is zero, the output is largely deterministic and seems less likely to hallucinate data.

It is definitely possible however, and future versions of this tool will allow for automated error checking (and possibly correction).
The `HallucinationChecker` class can be used to detect data that appears in the response that doesn't appear on the page. This approach could be improved, but I haven't seen hallucination as a major problem yet. (If you have examples, please open an issue!)
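For intuition, the core of such a check can be sketched in a few lines of plain Python. This is a toy illustration of the idea only, not `HallucinationChecker`'s actual implementation: flag any extracted string value that never occurs in the page text.

```python
def find_hallucinations(data: dict, page_text: str) -> list[str]:
    """Return extracted string values that don't appear anywhere on the page."""
    suspicious = []
    for key, value in data.items():
        if isinstance(value, str) and value not in page_text:
            suspicious.append(f"{key}={value!r}")
    return suspicious


page = "Operation Golden Orb, released March 12, 2023, with Paul F. Tompkins."
data = {"title": "Operation Golden Orb", "host": "Scott Aukerman"}
print(find_hallucinations(data, page))  # only the host value isn't on the page
```

A real checker would need to handle reformatted values (dates, numbers, case), which is why this simple containment test is only a starting point.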

## How much did you spend developing this?

So far, about $25 on API calls, switching to GPT-3.5 as the default made a big difference.
So far, about $40 on API calls; switching to GPT-3.5 as the default made a big difference.

My most expensive call was a paginated GPT-4 call that cost $2.20. I decided to add the cost-limiting features after that.

8 changes: 4 additions & 4 deletions docs/openai.md
@@ -1,6 +1,6 @@
# About the OpenAI API
# OpenAI / GPT

This section assumes you are mostly unfamiliar with the OpenAI API and aims to provide a high-level overview of how they work in relation to this library.
This section assumes you are mostly unfamiliar with the OpenAI API and aims to provide a high-level overview of how it works in relation to this library.


## API Keys
@@ -19,15 +19,15 @@ The cost estimates provided by this library are based on the [OpenAI pricing pag

## Tokens

OpenAI encodes text using a tokenizer, which converts words to integers.
OpenAI encodes text using a [tokenizer](https://github.com/openai/tiktoken), which converts words to integers.

You'll see that billing is based on the number of tokens used. A token is approximately 3 characters, so 3000 characters of HTML will roughly correspond to 1000 tokens.
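As a quick sanity check, the 3-characters-per-token rule of thumb can be applied directly; for exact counts you would run the tokenizer itself rather than this estimate.

```python
def estimate_tokens(text: str) -> int:
    # ~3 characters per token, per the rule of thumb above.
    return len(text) // 3


html = "<div>" + "x" * 2990 + "</div>"  # 3,001 characters of HTML
print(estimate_tokens(html))            # 1000 -- roughly what 3,000 characters costs
```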

Additionally, the GPT-3.5-Turbo model is limited to 4096 tokens. GPT-4 is limited to 8192 tokens. (A 32k model has been announced, but is not yet widely available.)

Various features in the library will help you avoid running into token limits, but it is still very common to exceed them in practice.

If your pages exceed them, you'll need to focus on improving your [selectors](/api.md#selectors) so that only the required data is sent to the underlying models.
If your pages exceed these limits, you'll need to focus on improving your [selectors](/api.md#selectors) so that only the required data is sent to the underlying models.

## Prompts

4 changes: 3 additions & 1 deletion docs/snippets/_cost.md
@@ -6,4 +6,6 @@
| GPT-4 (8k) | 0.03 | 0.06 |
| GPT-4 (32k) | 0.06 | 0.12 |

(See [OpenAI pricing page](https://platform.openai.com/pricing) for latest info.)
Example: A 3,000 token page that returns 1,000 tokens of JSON will cost $0.008 with GPT-3.5-Turbo, but $0.15 with GPT-4.

(See [OpenAI pricing page](https://platform.openai.com/pricing) for latest info.)
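The arithmetic behind that example, as a small helper. The GPT-4 rates come from the table above; the $0.002/1K GPT-3.5-Turbo rate is the price current when this was written, so check the pricing page before relying on these numbers.

```python
def call_cost(prompt_tokens: int, completion_tokens: int,
              prompt_rate: float, completion_rate: float) -> float:
    """Cost in dollars, given per-1K-token rates."""
    return prompt_tokens / 1000 * prompt_rate + completion_tokens / 1000 * completion_rate


# 3,000 token page returning 1,000 tokens of JSON:
print(call_cost(3000, 1000, 0.002, 0.002))  # GPT-3.5-Turbo: ~$0.008
print(call_cost(3000, 1000, 0.03, 0.06))    # GPT-4 (8k):    ~$0.15
```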