From 5c81ea1803e08185cae2673ea1d7864e79271bbf Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:34:50 -0300 Subject: [PATCH 1/5] fixed optional+default bug on llm schema --- .../scrapeURL/transformers/llmExtract.ts | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 1c6adcd12..71a464060 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -99,6 +99,10 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract } let schema = options.schema; + if (schema) { + schema = removeDefaultProperty(schema); + } + if (schema && schema.type === "array") { schema = { type: "object", @@ -112,7 +116,9 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract schema = { type: "object", properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]) + Object.entries(schema).map(([key, value]) => { + return [key, removeDefaultProperty(value)]; + }) ), required: Object.keys(schema), additionalProperties: false @@ -192,3 +198,19 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise return document; } + +function removeDefaultProperty(schema: any): any { + if (typeof schema !== 'object' || schema === null) return schema; + + const { default: _, ...rest } = schema; + + for (const key in rest) { + if (Array.isArray(rest[key])) { + rest[key] = rest[key].map((item: any) => removeDefaultProperty(item)); + } else if (typeof rest[key] === 'object' && rest[key] !== null) { + rest[key] = removeDefaultProperty(rest[key]); + } + } + + return rest; +} \ No newline at end of file From eab30c474b19b8ffb89f4d64e83919fec99b0f5c Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 09:30:40 -0300 Subject: [PATCH 2/5] added unit tests --- .../scrapeURL/transformers/llmExtract.test.ts | 33 +++++++++++++++++++ .../scrapeURL/transformers/llmExtract.ts | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts new file mode 100644 index 000000000..f23f506f0 --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts @@ -0,0 +1,33 @@ +import { removeDefaultProperty } from "./llmExtract"; + +describe("removeDefaultProperty", () => { + it("should remove the default property from a simple object", () => { + const input = { default: "test", test: "test" }; + const expectedOutput = { test: "test" }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should remove the default property from a nested object", () => { + const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } }; + const expectedOutput = { nested: { test: "nestedTest" } }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should remove the default property from an array of objects", () => { + const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] }; + const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should handle objects without a default property", () => { + const input = { test: "test" }; + const expectedOutput = { test: "test" }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should handle null and non-object inputs", () => { + expect(removeDefaultProperty(null)).toBeNull(); + expect(removeDefaultProperty("string")).toBe("string"); + expect(removeDefaultProperty(123)).toBe(123); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 71a464060..c35e20c13 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -199,7 +199,7 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise return document; } -function removeDefaultProperty(schema: any): any { +export function removeDefaultProperty(schema: any): any { if (typeof schema !== 'object' || schema === null) return schema; const { default: _, ...rest } = schema; From b6802bc443a5d68679af8dfe58737e0d99be26c4 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:41:59 -0300 Subject: [PATCH 3/5] merged with main --- .../scrapeURL/transformers/llmExtract.ts | 148 ++++-------------- 1 file changed, 28 insertions(+), 120 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 7b518300f..c189c8f71 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -121,6 +121,10 @@ export async function generateOpenAICompletions( } let schema = options.schema; + if (schema) { + schema = removeDefaultProperty(schema); +} + if (schema && schema.type === "array") { schema = { type: "object", @@ -134,10 +138,12 @@ export async function generateOpenAICompletions( schema = { type: "object", properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]), + Object.entries(schema).map(([key, value]) => { + return [key, removeDefaultProperty(value)]; + }) ), required: Object.keys(schema), - additionalProperties: false, + additionalProperties: false }; } @@ -183,124 +189,6 @@ export async function generateOpenAICompletions( if (extract === null && jsonCompletion.choices[0].message.content !== null) { try { - // Encode the message into tokens - const tokens = encoder.encode(markdown); - - // Return the number of tokens - numTokens = tokens.length; - } catch (error) { - logger.warn("Calculating num tokens of string failed", { error, markdown }); - - markdown = markdown.slice(0, maxTokens * modifier); - - let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } finally { - // Free the encoder resources after use - encoder.free(); - } - - if (numTokens > maxTokens) { - // trim the document to the maximum number of tokens, tokens != characters - markdown = markdown.slice(0, maxTokens * modifier); - - const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } - - let schema = options.schema; - if (schema) { - schema = removeDefaultProperty(schema); - } - - if (schema && schema.type === "array") { - schema = { - type: "object", - properties: { - items: options.schema, - }, - required: ["items"], - additionalProperties: false, - }; - } else if (schema && typeof schema === 'object' && !schema.type) { - schema = { - type: "object", - properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => { - return [key, removeDefaultProperty(value)]; - }) - ), - required: Object.keys(schema), - additionalProperties: false - }; - } - - schema = normalizeSchema(schema); - - const jsonCompletion = await openai.beta.chat.completions.parse({ - model, - temperature: 0, - messages: [ - { - role: "system", - content: options.systemPrompt, - }, - { - role: "user", - content: [{ type: "text", text: markdown }], - }, - { - role: "user", - content: options.prompt !== undefined - ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` - : "Transform the above content into structured JSON output.", - }, - ], - response_format: options.schema ? { - type: "json_schema", - json_schema: { - name: "websiteContent", - schema: schema, - strict: true, - } - } : { type: "json_object" }, - }); - - if (jsonCompletion.choices[0].message.refusal !== null) { - throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal); - } - - extract = jsonCompletion.choices[0].message.parsed; - - if (extract === null && jsonCompletion.choices[0].message.content !== null) { - try { - if (!isExtractEndpoint) { - extract = JSON.parse(jsonCompletion.choices[0].message.content); - } else { - const extractData = JSON.parse(jsonCompletion.choices[0].message.content); - extract = options.schema ? extractData.data.extract : extractData; - } - } catch (e) { - logger.error("Failed to parse returned JSON, no schema specified.", { error: e }); - throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object."); - } - } - - // If the users actually wants the items object, they can specify it as 'required' in the schema - // otherwise, we just return the items array - if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) { - extract = extract?.items; - } - return { extract, warning, numTokens }; -} - -export async function performLLMExtract(meta: Meta, document: Document): Promise { - if (meta.options.formats.includes("extract")) { - const { extract, warning } = await generateOpenAICompletions( - meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), - meta.options.extract!, - document.markdown, - document.warning, if (!isExtractEndpoint) { extract = JSON.parse(jsonCompletion.choices[0].message.content); } else { @@ -331,6 +219,26 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise return { extract, warning, numTokens }; } +export async function performLLMExtract( + meta: Meta, + document: Document, +): Promise { + if (meta.options.formats.includes("extract")) { + const { extract, warning } = await generateOpenAICompletions( + meta.logger.child({ + method: "performLLMExtract/generateOpenAICompletions", + }), + meta.options.extract!, + document.markdown, + document.warning, + ); + document.extract = extract; + document.warning = warning; + } + + return document; +} + export function removeDefaultProperty(schema: any): any { if (typeof schema !== 'object' || schema === null) return schema; From d8150c61714cfd320190c08263ae4722e38b95ad Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:46:56 -0300 Subject: [PATCH 4/5] added type to reqs example --- apps/api/requests.http | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 0e3b92066..5d99bce93 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url": "firecrawl.dev" + "url": "v" } ### Check Crawl Status @@ -70,8 +70,8 @@ content-type: application/json "urls": ["firecrawl.dev"], "prompt": "What is the title, description and main product of the page?", "schema": { - "title": "string", - "description": "string", - "mainProduct": "string" + "title": { "type": "string" }, + "description": { "type": "string" }, + "mainProduct": { "type": "string" } } } \ No newline at end of file From 2c233bd3213cab52bdb493313d86a45b86dd90b4 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:48:48 -0300 Subject: [PATCH 5/5] Update requests.http --- apps/api/requests.http | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 5d99bce93..8aa3788db 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url": "v" + "url": "firecrawl.dev" } ### Check Crawl Status @@ -74,4 +74,4 @@ content-type: application/json "description": { "type": "string" }, "mainProduct": { "type": "string" } } -} \ No newline at end of file +}