diff --git a/src/core.ts b/src/core.ts index 8e03bbe5..aaa43efc 100644 --- a/src/core.ts +++ b/src/core.ts @@ -5,8 +5,10 @@ import { glob } from "glob"; import { Config, configSchema } from "./config.js"; import { Page } from "playwright"; import { isWithinTokenLimit } from "gpt-tokenizer"; +import { PathLike } from "fs"; let pageCounter = 0; +let crawler: PlaywrightCrawler; export function getPageHtml(page: Page, selector = "body") { return page.evaluate((selector) => { @@ -52,7 +54,7 @@ export async function crawl(config: Config) { if (process.env.NO_CRAWL !== "true") { // PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. - const crawler = new PlaywrightCrawler({ + crawler = new PlaywrightCrawler({ // Use the requestHandler to process each of the crawled pages. async requestHandler({ request, page, enqueueLinks, log, pushData }) { if (config.cookie) { @@ -143,6 +145,7 @@ export async function crawl(config: Config) { } export async function write(config: Config) { + let nextFileNameString: PathLike = ""; const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true, }); @@ -163,8 +166,14 @@ export async function write(config: Config) { `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`; const writeBatchToFile = async (): Promise => { - await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2)); - console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`); + nextFileNameString = nextFileName(); + await writeFile( + nextFileNameString, + JSON.stringify(currentResults, null, 2), + ); + console.log( + `Wrote ${currentResults.length} items to ${nextFileNameString}`, + ); currentResults = []; currentSize = 0; fileCounter++; @@ -213,4 +222,31 @@ export async function write(config: Config) { if (currentResults.length > 0) { await writeBatchToFile(); } + + return nextFileNameString; } + +class GPTCrawlerCore { + config: Config; + + constructor(config: Config) { + this.config = config; + } + + async crawl() { + await crawl(this.config); + } + + async write(): Promise { + // we need to wait for the file path as the path can change + return new Promise((resolve, reject) => { + write(this.config) + .then((outputFilePath) => { + resolve(outputFilePath); + }) + .catch(reject); + }); + } +} + +export default GPTCrawlerCore; diff --git a/src/server.ts b/src/server.ts index bf3ff510..50497a99 100644 --- a/src/server.ts +++ b/src/server.ts @@ -1,12 +1,13 @@ import express from "express"; import cors from "cors"; import { readFile } from "fs/promises"; -import { crawl, write } from "./core.js"; import { Config, configSchema } from "./config.js"; import { configDotenv } from "dotenv"; import swaggerUi from "swagger-ui-express"; // @ts-ignore import swaggerDocument from "../swagger-output.json" assert { type: "json" }; +import GPTCrawlerCore from "./core.js"; +import { PathLike } from "fs"; configDotenv(); @@ -23,12 +24,10 @@ app.post("/crawl", async (req, res) => { const config: Config = req.body; try { const validatedConfig = configSchema.parse(config); - await crawl(validatedConfig); - await write(validatedConfig); - const outputFileContent = await readFile( - validatedConfig.outputFileName, - "utf-8", - ); + const crawler = new GPTCrawlerCore(validatedConfig); + await crawler.crawl(); + const outputFileName: PathLike = await crawler.write(); + const outputFileContent = await readFile(outputFileName, "utf-8"); res.contentType("application/json"); return res.send(outputFileContent); } catch (error) { diff --git a/tsconfig.json b/tsconfig.json index ddf6ba3b..a4193efa 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -4,6 +4,7 @@ "module": "ES2022", "target": "ES2022", "outDir": "dist", + "moduleResolution": "node", "resolveJsonModule": true, "noUnusedLocals": false, "skipLibCheck": true,