From 7419030469752dbea5e5ff936370accc087b603f Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Wed, 22 Nov 2023 23:15:07 -0400
Subject: [PATCH 1/9] Add gpt-tokenizer package

---
 package-lock.json | 14 ++++++++++++++
 package.json      |  1 +
 2 files changed, 15 insertions(+)

diff --git a/package-lock.json b/package-lock.json
index 1da9e88b..73287af5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -14,6 +14,7 @@
         "crawlee": "^3.0.0",
         "cross-env": "^7.0.3",
         "glob": "^10.3.10",
+        "gpt-tokenizer": "^2.1.2",
         "inquirer": "^9.2.12",
         "playwright": "*",
         "prettier": "^3.1.0",
@@ -1982,6 +1983,14 @@
         "url": "https://github.com/sindresorhus/is?sponsor=1"
       }
     },
+    "node_modules/gpt-tokenizer": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.1.2.tgz",
+      "integrity": "sha512-HSuI5d6uey+c7x/VzQlPfCoGrfLyAc28vxWofKbjR9PJHm0AjQGSWkKw/OJnb+8S1g7nzgRsf0WH3dK+NNWYbg==",
+      "dependencies": {
+        "rfc4648": "^1.5.2"
+      }
+    },
     "node_modules/graceful-fs": {
       "version": "4.2.11",
       "license": "ISC"
     },
@@ -2922,6 +2931,11 @@
         "node": ">= 4"
       }
     },
+    "node_modules/rfc4648": {
+      "version": "1.5.3",
+      "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.3.tgz",
+      "integrity": "sha512-MjOWxM065+WswwnmNONOT+bD1nXzY9Km6u3kzvnx8F8/HXGZdz3T6e6vZJ8Q/RIMUSp/nxqjH3GwvJDy8ijeQQ=="
+    },
     "node_modules/rrweb-cssom": {
       "version": "0.6.0",
       "license": "MIT"
     },
diff --git a/package.json b/package.json
index 2226ba0f..be5626bc 100644
--- a/package.json
+++ b/package.json
@@ -11,6 +11,7 @@
     "crawlee": "^3.0.0",
     "cross-env": "^7.0.3",
     "glob": "^10.3.10",
+    "gpt-tokenizer": "^2.1.2",
     "inquirer": "^9.2.12",
     "playwright": "*",
     "prettier": "^3.1.0",

From c6b770aba26bf102e417f277f104261630893d4e Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Wed, 22 Nov 2023 23:28:17 -0400
Subject: [PATCH 2/9] Add optional maxFileSize and maxTokens to Config

---
 README.md     |  4 ++++
 config.ts     |  2 ++
 src/config.ts | 10 ++++++++++
 3 files changed, 16 insertions(+)

diff --git a/README.md b/README.md
index 8aa1b984..6de7c7f0 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,10 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
+  /** Optional maximum file size in bytes to include in the output file */
+  maxFileSize?: number().,
+  /** Optional maximum number tokens to include in the output file */
+  maxTokens?: number().,
 };
 ```
diff --git a/config.ts b/config.ts
index bc2d22e0..0c02d29d 100644
--- a/config.ts
+++ b/config.ts
@@ -5,4 +5,6 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
+  maxFileSize: 1000,
+  maxTokens: 5000
 };
diff --git a/src/config.ts b/src/config.ts
index 901b7a09..11d68c76 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -51,6 +51,16 @@ export const configSchema = z.object({
     .optional(),
   /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout: z.number().int().nonnegative().optional(),
+
+
+  /** Optional maximum file size in bytes to include in the output file
+   * @example 1000
+   */
+  maxFileSize: z.number().int().positive().optional(),
+  /** Optional maximum number tokens to include in the output file
+   * @example 5000
+   */
+  maxTokens: z.number().int().positive().optional(),
 });

 export type Config = z.infer<typeof configSchema>;

From 569005b4f45106f2e4a0b6aef5e804322fbb89de Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Wed, 22 Nov 2023 23:40:25 -0400
Subject: [PATCH 3/9] Refactor write function to handle large datasets and
 implement size and token limits
---
 src/core.ts | 62 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 7 deletions(-)

diff --git a/src/core.ts b/src/core.ts
index 48966d99..5ee6ca65 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -4,6 +4,9 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import {Config, configSchema} from "./config.js";
 import { Page } from "playwright";
+import {
+  isWithinTokenLimit,
+} from 'gpt-tokenizer'

 let pageCounter = 0;

@@ -113,17 +116,62 @@ export async function crawl(config: Config) {
   }
 }

 export async function write(config: Config) {
-  configSchema.parse(config);
-
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });

-  const results = [];
+  console.log(`Found ${jsonFiles.length} files to combine...`);
+
+  let currentResults: any[] = [];
+  let currentSize = 0;
+  let fileCounter = 1;
+  const maxBytes = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : null; // Convert maxFileSize from MB to bytes
+
+  // Helper function to get byte size of string
+  const getStringByteSize = (str: string) => Buffer.byteLength(str, 'utf-8');
+
+  // Write the accumulated data to a file and reset the current batch
+  const writeToFile = async () => {
+    const fileName = `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
+    await writeFile(fileName, JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${fileName}`);
+    fileCounter++;
+    currentResults = []; // Start a new batch
+    currentSize = 0; // Reset the size counter
+  };
+
   for (const file of jsonFiles) {
-    const data = JSON.parse(await readFile(file, "utf-8"));
-    results.push(data);
-  }
+    const fileContent = await readFile(file, 'utf-8');
+    const data = JSON.parse(fileContent);
+    const dataSize = getStringByteSize(fileContent);
+    let resultWritten = false;
+
+    // Check if data exceeds file size limit (if present)
+    if (maxBytes && currentSize + dataSize > maxBytes) {
+      await writeToFile();
+      resultWritten = true;
+    }
+
+    // Check if data exceeds token limit (if present)
+    if (config.maxTokens && !isWithinTokenLimit(JSON.stringify(data), config.maxTokens)) {
+      if (!resultWritten) { // Write only if not already written
+        await writeToFile();
+      }
+      continue; // Skip adding this object to the batch
+    }
+
+    // Add data to current batch
+    currentResults.push(data);
+    currentSize += dataSize;
-  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+    // Write to file if batch is over size limit (File size check to delegate larger final batch size check)
+    if (maxBytes && currentSize > maxBytes) {
+      await writeToFile();
+    }
+  }
+
+  // Write any remaining data in the current batch to the final file
+  if (currentResults.length > 0) {
+    await writeToFile();
+  }
 }

From bd86e59ac0b6296d51bbc0f7c6dd3cf4776c0cf4 Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Thu, 23 Nov 2023 20:35:45 -0400
Subject: [PATCH 4/9] Update maxFileSize comment to use megabytes

---
 README.md     | 2 +-
 src/config.ts | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6de7c7f0..e3e83940 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
-  /** Optional maximum file size in bytes to include in the output file */
+  /** Optional maximum file size in megabytes to include in the output file */
   maxFileSize?: number().,
   /** Optional maximum number tokens to include in the output file */
   maxTokens?: number().,
diff --git a/src/config.ts b/src/config.ts
index 11d68c76..90ec2184 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -53,8 +53,8 @@ export const configSchema = z.object({
   waitForSelectorTimeout: z.number().int().nonnegative().optional(),


-  /** Optional maximum file size in bytes to include in the output file
-   * @example 1000
+  /** Optional maximum file size in megabytes to include in the output file
+   * @example 1
    */
   maxFileSize: z.number().int().positive().optional(),
   /** Optional maximum number tokens to include in the output file

From 05d497f98621a6b50b63f81a07a8fde16067c668 Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Thu, 23 Nov 2023 21:33:21 -0400
Subject: [PATCH 5/9] Refactor write function to improve performance and handle large datasets

---
 src/core.ts | 93 +++++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 46 deletions(-)

diff --git a/src/core.ts b/src/core.ts
index 5ee6ca65..b340f7c6 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -115,63 +115,64 @@ export async function crawl(config: Config) {
   }
 }

-export async function write(config: Config) {
-  const jsonFiles = await glob("storage/datasets/default/*.json", {
-    absolute: true,
-  });
+export async function write(config: Config) {
+  const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });
   console.log(`Found ${jsonFiles.length} files to combine...`);

-  let currentResults: any[] = [];
-  let currentSize = 0;
-  let fileCounter = 1;
-  const maxBytes = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : null; // Convert maxFileSize from MB to bytes
-
-  // Helper function to get byte size of string
-  const getStringByteSize = (str: string) => Buffer.byteLength(str, 'utf-8');
-
-  // Write the accumulated data to a file and reset the current batch
-  const writeToFile = async () => {
-    const fileName = `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
-    await writeFile(fileName, JSON.stringify(currentResults, null, 2));
-    console.log(`Wrote ${currentResults.length} items to ${fileName}`);
+  let currentResults: Record<string, any>[] = [];
+  let currentSize: number = 0;
+  let fileCounter: number = 1;
+  const maxBytes: number = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : Infinity;
+
+  const getStringByteSize = (str: string): number => Buffer.byteLength(str, 'utf-8');
+
+  const nextFileName = (): string => `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
+
+  const writeBatchToFile = async (): Promise<void> => {
+    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
+    currentResults = [];
+    currentSize = 0;
     fileCounter++;
-    currentResults = []; // Start a new batch
-    currentSize = 0; // Reset the size counter
   };
+
+  let estimatedTokens: number = 0;

-  for (const file of jsonFiles) {
-    const fileContent = await readFile(file, 'utf-8');
-    const data = JSON.parse(fileContent);
-    const dataSize = getStringByteSize(fileContent);
-    let resultWritten = false;
-
-    // Check if data exceeds file size limit (if present)
-    if (maxBytes && currentSize + dataSize > maxBytes) {
-      await writeToFile();
-      resultWritten = true;
-    }
+  const addContentOrSplit = async (data: Record<string, any>): Promise<void> => {
+    const contentString: string = JSON.stringify(data);
+    const tokenCount: number | false = isWithinTokenLimit(contentString, config.maxTokens || Infinity);

-    // Check if data exceeds token limit (if present)
-    if (config.maxTokens && !isWithinTokenLimit(JSON.stringify(data), config.maxTokens)) {
-      if (!resultWritten) { // Write only if not already written
-        await writeToFile();
+    if (typeof tokenCount === 'number') {
+      if (estimatedTokens + tokenCount > config.maxTokens!) {
+        // Only write the batch if it's not empty (something to write)
+        if (currentResults.length > 0) {
+          await writeBatchToFile();
+        }
+        // Since the addition of a single item exceeded the token limit, halve it.
+        estimatedTokens = Math.floor(tokenCount / 2);
+        currentResults.push(data);
+      } else {
+        currentResults.push(data);
+        estimatedTokens += tokenCount;
       }
-      continue; // Skip adding this object to the batch
     }

-    // Add data to current batch
-    currentResults.push(data);
-    currentSize += dataSize;
-
-    // Write to file if batch is over size limit (File size check to delegate larger final batch size check)
-    if (maxBytes && currentSize > maxBytes) {
-      await writeToFile();
+    currentSize += getStringByteSize(contentString);
+    if (currentSize > maxBytes) {
+      await writeBatchToFile();
     }
+  };
+
+  // Iterate over each JSON file and process its contents.
+  for (const file of jsonFiles) {
+    const fileContent = await readFile(file, 'utf-8');
+    const data: Record<string, any> = JSON.parse(fileContent);
+    await addContentOrSplit(data);
   }
-
-  // Write any remaining data in the current batch to the final file
+
+  // Check if any remaining data needs to be written to a file.
   if (currentResults.length > 0) {
-    await writeToFile();
+    await writeBatchToFile();
   }
-}
+};

From ca4a2a54a9192139ea07757899e0d0bd4d61ec40 Mon Sep 17 00:00:00 2001
From: Guillermo Marin <52298929+guillermoscript@users.noreply.github.com>
Date: Tue, 28 Nov 2023 09:48:03 -0400
Subject: [PATCH 6/9] Update README.md

Co-authored-by: Steve Sewell
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e3e83940..e0e12074 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ type Config = {
   /** File name for the finished data */
   outputFileName: string;
   /** Optional maximum file size in megabytes to include in the output file */
-  maxFileSize?: number().,
+  maxFileSize?: number,
   /** Optional maximum number tokens to include in the output file */
   maxTokens?: number().,
 };

From 632a82df428ca87c9b96aef80b23f12fbe9673cc Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Tue, 28 Nov 2023 10:05:10 -0400
Subject: [PATCH 7/9] Remove maxFileSize and maxTokens from defaultConfig, this are optional

---
 config.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/config.ts b/config.ts
index 0c02d29d..bc2d22e0 100644
--- a/config.ts
+++ b/config.ts
@@ -5,6 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
-  maxFileSize: 1000,
-  maxTokens: 5000
 };

From 69d895e7d183b1ff587b05c8e11c2c219043a7b5 Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Tue, 28 Nov 2023 10:07:07 -0400
Subject: [PATCH 8/9] Add instructions for splitting and uploading large files

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 58ae1c27..30712e3d 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,7 @@ Use this option for UI access to your generated knowledge that you can easily sh
 4. Choose "Create a GPT"
 5. Choose "Configure"
 6. Under "Knowledge" choose "Upload a file" and upload the file you generated
+7. if you get an error about the file being too large, you can try to split it into multiple files and upload them separately using the option maxFileSize in the config.ts file or also use tokenization to reduce the size of the file with the option maxTokens in the config.ts file

 ![Gif of how to upload a custom GPT](https://github.com/BuilderIO/gpt-crawler/assets/844291/22f27fb5-6ca5-4748-9edd-6bcf00b408cf)

From ed47ed48f4c7abc9771e0970a54900f6296a197b Mon Sep 17 00:00:00 2001
From: guillermoscript
Date: Tue, 28 Nov 2023 22:10:04 -0400
Subject: [PATCH 9/9] Fix formatting in config.ts and core.ts

---
 README.md     |  4 ++--
 src/config.ts |  6 +++---
 src/core.ts   | 43 ++++++++++++++++++++++++++-----------------
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index dae9f461..43bfe4c7 100644
--- a/README.md
+++ b/README.md
@@ -85,9 +85,9 @@ type Config = {
    */
   resourceExclusions?: string[];
   /** Optional maximum file size in megabytes to include in the output file */
-  maxFileSize?: number,
+  maxFileSize?: number;
   /** Optional maximum number tokens to include in the output file */
-  maxTokens?: number,
+  maxTokens?: number;
 };
 ```
diff --git a/src/config.ts b/src/config.ts
index 7e687f8b..7e5f5fbf 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -64,11 +64,11 @@ export const configSchema = z.object({

   /** Optional maximum file size in megabytes to include in the output file
    * @example 1
-   */
+   */
   maxFileSize: z.number().int().positive().optional(),
-  /** Optional maximum number tokens to include in the output file
+  /** Optional maximum number tokens to include in the output file
    * @example 5000
-   */
+   */
   maxTokens: z.number().int().positive().optional(),
 });
diff --git a/src/core.ts b/src/core.ts
index ca12efe4..8e03bbe5 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -4,9 +4,7 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
-import {
-  isWithinTokenLimit,
-} from 'gpt-tokenizer'
+import { isWithinTokenLimit } from "gpt-tokenizer";

 let pageCounter = 0;

@@ -144,20 +142,26 @@ export async function crawl(config: Config) {
   }
 }

-export async function write(config: Config) {
-  const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });
+export async function write(config: Config) {
+  const jsonFiles = await glob("storage/datasets/default/*.json", {
+    absolute: true,
+  });
   console.log(`Found ${jsonFiles.length} files to combine...`);

   let currentResults: Record<string, any>[] = [];
   let currentSize: number = 0;
   let fileCounter: number = 1;
-  const maxBytes: number = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : Infinity;
-
-  const getStringByteSize = (str: string): number => Buffer.byteLength(str, 'utf-8');
-
-  const nextFileName = (): string => `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
-
+  const maxBytes: number = config.maxFileSize
+    ? config.maxFileSize * 1024 * 1024
+    : Infinity;
+
+  const getStringByteSize = (str: string): number =>
+    Buffer.byteLength(str, "utf-8");
+
+  const nextFileName = (): string =>
+    `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
+
   const writeBatchToFile = async (): Promise<void> => {
     await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
     console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
@@ -165,14 +169,19 @@ export async function write(config: Config) {
     currentSize = 0;
     fileCounter++;
   };
-
+
   let estimatedTokens: number = 0;

-  const addContentOrSplit = async (data: Record<string, any>): Promise<void> => {
+  const addContentOrSplit = async (
+    data: Record<string, any>,
+  ): Promise<void> => {
     const contentString: string = JSON.stringify(data);
-    const tokenCount: number | false = isWithinTokenLimit(contentString, config.maxTokens || Infinity);
+    const tokenCount: number | false = isWithinTokenLimit(
+      contentString,
+      config.maxTokens || Infinity,
+    );

-    if (typeof tokenCount === 'number') {
+    if (typeof tokenCount === "number") {
       if (estimatedTokens + tokenCount > config.maxTokens!) {
         // Only write the batch if it's not empty (something to write)
         if (currentResults.length > 0) {
@@ -195,7 +204,7 @@ export async function write(config: Config) {

   // Iterate over each JSON file and process its contents.
   for (const file of jsonFiles) {
-    const fileContent = await readFile(file, 'utf-8');
+    const fileContent = await readFile(file, "utf-8");
     const data: Record<string, any> = JSON.parse(fileContent);
     await addContentOrSplit(data);
   }
@@ -204,4 +213,4 @@ export async function write(config: Config) {
   if (currentResults.length > 0) {
     await writeBatchToFile();
   }
-};
+}
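
After this series, `maxFileSize` (in megabytes) and `maxTokens` are optional fields of the config schema, and `write()` starts a new numbered output file whenever the current batch crosses either limit. As a usage sketch only — it is not part of the patches, and the `url` value and the import path are assumptions — a `config.ts` opting into both limits could look like this:

```ts
import { Config } from "./src/config";

export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers", // assumed starting URL, not from the patches
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Optional limits introduced by this series:
  maxFileSize: 1, // start a new output file once a batch exceeds ~1 MB
  maxTokens: 5000, // keep each output file within roughly 5000 tokens
};
```

With such a config, `write()` emits `output-1.json`, `output-2.json`, … (the base name is `outputFileName` with its `.json` suffix stripped), which is the splitting behavior that step 7 of the README instructions relies on.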