From 5c81ea1803e08185cae2673ea1d7864e79271bbf Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 9 Dec 2024 15:34:50 -0300
Subject: [PATCH 1/5] fixed optional+default bug on llm schema

---
 .../scrapeURL/transformers/llmExtract.ts      | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 1c6adcd12..71a464060 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -99,6 +99,10 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
     }
 
     let schema = options.schema;
+    if (schema) {
+        schema = removeDefaultProperty(schema);
+    }
+
     if (schema && schema.type === "array") {
         schema = {
             type: "object",
@@ -112,7 +116,9 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
       schema = {
           type: "object",
           properties: Object.fromEntries(
-              Object.entries(schema).map(([key, value]) => [key, { type: value }])
+              Object.entries(schema).map(([key, value]) => {
+                  return [key, removeDefaultProperty(value)];
+              })
           ),
           required: Object.keys(schema),
           additionalProperties: false
@@ -192,3 +198,19 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise
 
     return document;
 }
+
+function removeDefaultProperty(schema: any): any {
+  if (typeof schema !== 'object' || schema === null) return schema;
+
+  const { default: _, ...rest } = schema;
+
+  for (const key in rest) {
+      if (Array.isArray(rest[key])) {
+          rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
+      } else if (typeof rest[key] === 'object' && rest[key] !== null) {
+          rest[key] = removeDefaultProperty(rest[key]);
+      }
+  }
+
+  return rest;
+}
\ No newline at end of file

From eab30c474b19b8ffb89f4d64e83919fec99b0f5c Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 16 Dec 2024 09:30:40 -0300
Subject: [PATCH 2/5] added unit tests

---
 .../scrapeURL/transformers/llmExtract.test.ts | 33 +++++++++++++++++++
 .../scrapeURL/transformers/llmExtract.ts      |  2 +-
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts

diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
new file mode 100644
index 000000000..f23f506f0
--- /dev/null
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
@@ -0,0 +1,33 @@
+import { removeDefaultProperty } from "./llmExtract";
+
+describe("removeDefaultProperty", () => {
+    it("should remove the default property from a simple object", () => {
+        const input = { default: "test", test: "test" };
+        const expectedOutput = { test: "test" };
+        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+    });
+
+    it("should remove the default property from a nested object", () => {
+        const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
+        const expectedOutput = { nested: { test: "nestedTest" } };
+        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+    });
+
+    it("should remove the default property from an array of objects", () => {
+        const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
+        const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
+        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+    });
+
+    it("should handle objects without a default property", () => {
+        const input = { test: "test" };
+        const expectedOutput = { test: "test" };
+        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+    });
+
+    it("should handle null and non-object inputs", () => {
+        expect(removeDefaultProperty(null)).toBeNull();
+        expect(removeDefaultProperty("string")).toBe("string");
+        expect(removeDefaultProperty(123)).toBe(123);
+    });
+});
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 71a464060..c35e20c13 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -199,7 +199,7 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise
     return document;
 }
 
-function removeDefaultProperty(schema: any): any {
+export function removeDefaultProperty(schema: any): any {
   if (typeof schema !== 'object' || schema === null) return schema;
 
   const { default: _, ...rest } = schema;

From b6802bc443a5d68679af8dfe58737e0d99be26c4 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:41:59 -0300
Subject: [PATCH 3/5] merged with main

---
 .../scrapeURL/transformers/llmExtract.ts      | 148 ++++--------------
 1 file changed, 28 insertions(+), 120 deletions(-)

diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 7b518300f..c189c8f71 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
   }
 
   let schema = options.schema;
+  if (schema) {
+    schema = removeDefaultProperty(schema);
+}
+
   if (schema && schema.type === "array") {
     schema = {
       type: "object",
@@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
     schema = {
       type: "object",
       properties: Object.fromEntries(
-        Object.entries(schema).map(([key, value]) => [key, { type: value }]),
+        Object.entries(schema).map(([key, value]) => {
+          return [key, removeDefaultProperty(value)];
+        })
       ),
       required: Object.keys(schema),
-      additionalProperties: false,
+      additionalProperties: false
     };
   }
 
@@ -183,124 +189,6 @@ export async function generateOpenAICompletions(
 
   if (extract === null && jsonCompletion.choices[0].message.content !== null) {
     try {
-        // Encode the message into tokens
-        const tokens = encoder.encode(markdown);
-    
-        // Return the number of tokens
-        numTokens = tokens.length;
-    } catch (error) {
-        logger.warn("Calculating num tokens of string failed", { error, markdown });
-
-        markdown = markdown.slice(0, maxTokens * modifier);
-
-        let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
-        warning = previousWarning === undefined ? w : w + " " + previousWarning;
-    } finally {
-        // Free the encoder resources after use
-        encoder.free();
-    }
-
-    if (numTokens > maxTokens) {
-        // trim the document to the maximum number of tokens, tokens != characters
-        markdown = markdown.slice(0, maxTokens * modifier);
-
-        const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
-        warning = previousWarning === undefined ? w : w + " " + previousWarning;
-    }
-
-    let schema = options.schema;
-    if (schema) {
-        schema = removeDefaultProperty(schema);
-    }
-
-    if (schema && schema.type === "array") {
-        schema = {
-            type: "object",
-            properties: {
-                items: options.schema,
-            },
-            required: ["items"],
-            additionalProperties: false,
-        };
-    } else if (schema && typeof schema === 'object' && !schema.type) {
-      schema = {
-          type: "object",
-          properties: Object.fromEntries(
-              Object.entries(schema).map(([key, value]) => {
-                  return [key, removeDefaultProperty(value)];
-              })
-          ),
-          required: Object.keys(schema),
-          additionalProperties: false
-      };
-    }
-
-    schema = normalizeSchema(schema);
-
-    const jsonCompletion = await openai.beta.chat.completions.parse({
-        model,
-        temperature: 0,
-        messages: [
-            {
-                role: "system",
-                content: options.systemPrompt,
-            },
-            {
-                role: "user",
-                content: [{ type: "text", text: markdown }],
-            },
-            {
-                role: "user",
-                content: options.prompt !== undefined
-                    ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
-                    : "Transform the above content into structured JSON output.",
-            },
-        ],
-        response_format: options.schema ? {
-            type: "json_schema",
-            json_schema: {
-                name: "websiteContent",
-                schema: schema,
-                strict: true,
-            }
-        } : { type: "json_object" },
-    });
-
-    if (jsonCompletion.choices[0].message.refusal !== null) {
-        throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
-    }
-
-    extract = jsonCompletion.choices[0].message.parsed;
-
-    if (extract === null && jsonCompletion.choices[0].message.content !== null) {
-        try {
-            if (!isExtractEndpoint) {
-                extract = JSON.parse(jsonCompletion.choices[0].message.content);
-            } else {
-                const extractData = JSON.parse(jsonCompletion.choices[0].message.content);
-                extract = options.schema ? extractData.data.extract : extractData;
-            }
-        } catch (e) {
-            logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
-            throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
-        }
-    }
-
-    // If the users actually wants the items object, they can specify it as 'required' in the schema
-    // otherwise, we just return the items array
-    if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
-        extract = extract?.items;
-    }
-    return { extract, warning, numTokens };
-}
-
-export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
-    if (meta.options.formats.includes("extract")) {
-        const { extract, warning } = await generateOpenAICompletions(
-          meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
-          meta.options.extract!,
-          document.markdown,
-          document.warning,
       if (!isExtractEndpoint) {
         extract = JSON.parse(jsonCompletion.choices[0].message.content);
       } else {
@@ -331,6 +219,26 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise
   return { extract, warning, numTokens };
 }
 
+export async function performLLMExtract(
+  meta: Meta,
+  document: Document,
+): Promise<Document> {
+  if (meta.options.formats.includes("extract")) {
+    const { extract, warning } = await generateOpenAICompletions(
+      meta.logger.child({
+        method: "performLLMExtract/generateOpenAICompletions",
+      }),
+      meta.options.extract!,
+      document.markdown,
+      document.warning,
+    );
+    document.extract = extract;
+    document.warning = warning;
+  }
+
+  return document;
+}
+
 export function removeDefaultProperty(schema: any): any {
   if (typeof schema !== 'object' || schema === null) return schema;
 

From d8150c61714cfd320190c08263ae4722e38b95ad Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:46:56 -0300
Subject: [PATCH 4/5] added type to reqs example

---
 apps/api/requests.http | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/api/requests.http b/apps/api/requests.http
index 0e3b92066..5d99bce93 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 
 {
-  "url": "firecrawl.dev"
+  "url": "v"
 }
 
 ### Check Crawl Status
@@ -70,8 +70,8 @@ content-type: application/json
   "urls": ["firecrawl.dev"],
   "prompt": "What is the title, description and main product of the page?",
   "schema": {
-    "title": "string",
-    "description": "string",
-    "mainProduct": "string"
+    "title": { "type": "string" },
+    "description": { "type": "string" },
+    "mainProduct": { "type": "string" }
   }
 }
\ No newline at end of file

From 2c233bd3213cab52bdb493313d86a45b86dd90b4 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:48:48 -0300
Subject: [PATCH 5/5] Update requests.http

---
 apps/api/requests.http | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/api/requests.http b/apps/api/requests.http
index 5d99bce93..8aa3788db 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 
 {
-  "url": "v"
+  "url": "firecrawl.dev"
 }
 
 ### Check Crawl Status
@@ -74,4 +74,4 @@ content-type: application/json
     "description": { "type": "string" },
     "mainProduct": { "type": "string" }
   }
-}
\ No newline at end of file
+}