Skip to content

Commit

Permalink
Fix formatting in config.ts and core.ts
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermoscript committed Nov 29, 2023
1 parent 98a645a commit ed47ed4
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 22 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ type Config = {
*/
resourceExclusions?: string[];
/** Optional maximum file size in megabytes to include in the output file */
maxFileSize?: number,
maxFileSize?: number;
/** Optional maximum number of tokens to include in the output file */
maxTokens?: number,
maxTokens?: number;
};
```

Expand Down
6 changes: 3 additions & 3 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,11 @@ export const configSchema = z.object({

/** Optional maximum file size in megabytes to include in the output file
* @example 1
*/
*/
maxFileSize: z.number().int().positive().optional(),
/** Optional maximum number of tokens to include in the output file
/** Optional maximum number of tokens to include in the output file
* @example 5000
*/
*/
maxTokens: z.number().int().positive().optional(),
});

Expand Down
43 changes: 26 additions & 17 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import {
isWithinTokenLimit,
} from 'gpt-tokenizer'
import { isWithinTokenLimit } from "gpt-tokenizer";

let pageCounter = 0;

Expand Down Expand Up @@ -144,35 +142,46 @@ export async function crawl(config: Config) {
}
}

export async function write(config: Config) {
const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });
export async function write(config: Config) {
const jsonFiles = await glob("storage/datasets/default/*.json", {
absolute: true,
});

console.log(`Found ${jsonFiles.length} files to combine...`);

let currentResults: Record<string, any>[] = [];
let currentSize: number = 0;
let fileCounter: number = 1;
const maxBytes: number = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : Infinity;

const getStringByteSize = (str: string): number => Buffer.byteLength(str, 'utf-8');

const nextFileName = (): string => `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;

const maxBytes: number = config.maxFileSize
? config.maxFileSize * 1024 * 1024
: Infinity;

const getStringByteSize = (str: string): number =>
Buffer.byteLength(str, "utf-8");

const nextFileName = (): string =>
`${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

const writeBatchToFile = async (): Promise<void> => {
await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
currentResults = [];
currentSize = 0;
fileCounter++;
};

let estimatedTokens: number = 0;

const addContentOrSplit = async (data: Record<string, any>): Promise<void> => {
const addContentOrSplit = async (
data: Record<string, any>,
): Promise<void> => {
const contentString: string = JSON.stringify(data);
const tokenCount: number | false = isWithinTokenLimit(contentString, config.maxTokens || Infinity);
const tokenCount: number | false = isWithinTokenLimit(
contentString,
config.maxTokens || Infinity,
);

if (typeof tokenCount === 'number') {
if (typeof tokenCount === "number") {
if (estimatedTokens + tokenCount > config.maxTokens!) {
// Only write the batch if it's not empty (something to write)
if (currentResults.length > 0) {
Expand All @@ -195,7 +204,7 @@ export async function write(config: Config) {

// Iterate over each JSON file and process its contents.
for (const file of jsonFiles) {
const fileContent = await readFile(file, 'utf-8');
const fileContent = await readFile(file, "utf-8");
const data: Record<string, any> = JSON.parse(fileContent);
await addContentOrSplit(data);
}
Expand All @@ -204,4 +213,4 @@ export async function write(config: Config) {
if (currentResults.length > 0) {
await writeBatchToFile();
}
};
}

0 comments on commit ed47ed4

Please sign in to comment.