Skip to content

Commit

Permalink
fix: refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
adityak74 committed Dec 25, 2023
1 parent 8e15bb3 commit 7707146
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 10 deletions.
42 changes: 39 additions & 3 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import { isWithinTokenLimit } from "gpt-tokenizer";
import { PathLike } from "fs";

let pageCounter = 0;
let crawler: PlaywrightCrawler;

export function getPageHtml(page: Page, selector = "body") {
return page.evaluate((selector) => {
Expand Down Expand Up @@ -52,7 +54,7 @@ export async function crawl(config: Config) {
if (process.env.NO_CRAWL !== "true") {
// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const crawler = new PlaywrightCrawler({
crawler = new PlaywrightCrawler({
// Use the requestHandler to process each of the crawled pages.
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
if (config.cookie) {
Expand Down Expand Up @@ -143,6 +145,7 @@ export async function crawl(config: Config) {
}

export async function write(config: Config) {
let nextFileNameString: PathLike = "";
const jsonFiles = await glob("storage/datasets/default/*.json", {
absolute: true,
});
Expand All @@ -163,8 +166,14 @@ export async function write(config: Config) {
`${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

const writeBatchToFile = async (): Promise<void> => {
await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
nextFileNameString = nextFileName();
await writeFile(
nextFileNameString,
JSON.stringify(currentResults, null, 2),
);
console.log(
`Wrote ${currentResults.length} items to ${nextFileNameString}`,
);
currentResults = [];
currentSize = 0;
fileCounter++;
Expand Down Expand Up @@ -213,4 +222,31 @@ export async function write(config: Config) {
if (currentResults.length > 0) {
await writeBatchToFile();
}

return nextFileNameString;
}

/**
 * Object-oriented facade over the module-level `crawl` and `write`
 * functions, bound to a single crawler configuration.
 */
class GPTCrawlerCore {
  /** Configuration forwarded to every `crawl` and `write` call. */
  config: Config;

  /**
   * @param config - Crawler configuration stored on the instance.
   */
  constructor(config: Config) {
    this.config = config;
  }

  /**
   * Runs the module-level `crawl` with the stored configuration.
   *
   * @returns A promise that settles when `crawl` settles.
   */
  async crawl(): Promise<void> {
    await crawl(this.config);
  }

  /**
   * Runs the module-level `write` with the stored configuration.
   *
   * Callers should use the returned path rather than assuming a file name
   * up front, because the name of the file actually written can change.
   *
   * @returns The output file path reported by `write`.
   */
  async write(): Promise<PathLike> {
    // `write` already returns a promise; returning it directly propagates
    // both the resolved path and any rejection without a manual
    // `new Promise` wrapper.
    return write(this.config);
  }
}

export default GPTCrawlerCore;
13 changes: 6 additions & 7 deletions src/server.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import express from "express";
import cors from "cors";
import { readFile } from "fs/promises";
import { crawl, write } from "./core.js";
import { Config, configSchema } from "./config.js";
import { configDotenv } from "dotenv";
import swaggerUi from "swagger-ui-express";
// @ts-ignore
import swaggerDocument from "../swagger-output.json" assert { type: "json" };
import GPTCrawlerCore from "./core.js";
import { PathLike } from "fs";

configDotenv();

Expand All @@ -23,12 +24,10 @@ app.post("/crawl", async (req, res) => {
const config: Config = req.body;
try {
const validatedConfig = configSchema.parse(config);
await crawl(validatedConfig);
await write(validatedConfig);
const outputFileContent = await readFile(
validatedConfig.outputFileName,
"utf-8",
);
const crawler = new GPTCrawlerCore(validatedConfig);
await crawler.crawl();
const outputFileName: PathLike = await crawler.write();
const outputFileContent = await readFile(outputFileName, "utf-8");
res.contentType("application/json");
return res.send(outputFileContent);
} catch (error) {
Expand Down
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"module": "ES2022",
"target": "ES2022",
"outDir": "dist",
"moduleResolution": "node",
"resolveJsonModule": true,
"noUnusedLocals": false,
"skipLibCheck": true,
Expand Down

0 comments on commit 7707146

Please sign in to comment.