feat: create crawler api server #52

Merged (9 commits) on Dec 25, 2023
4 changes: 4 additions & 0 deletions .env.example
@@ -0,0 +1,4 @@
API_PORT=5000
API_HOST=localhost
MAX_PAGES_TO_CRAWL=45
NODE_ENV=development
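
These values are consumed elsewhere in this PR: configDotenv() in src/config.ts and src/server.ts loads them, and src/server.ts reads API_PORT and API_HOST. MAX_PAGES_TO_CRAWL is presumably meant to seed the crawl config, though that usage is not shown in this diff.
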
1 change: 1 addition & 0 deletions .gitignore
@@ -14,4 +14,5 @@ storage

# any output from the crawler
*.json
.env
pnpm-lock.yaml
9,306 changes: 8,810 additions & 496 deletions package-lock.json

Large diffs are not rendered by default.

17 changes: 15 additions & 2 deletions package.json
@@ -8,29 +8,42 @@
"description": "Crawl a site to generate knowledge files to create your own custom GPT",
"dependencies": {
"commander": "^11.1.0",
"cors": "^2.8.5",
"crawlee": "^3.0.0",
"dotenv": "^16.3.1",
"express": "^4.18.2",
"express-fileupload": "^1.4.3",
"cross-env": "^7.0.3",
"glob": "^10.3.10",
"gpt-tokenizer": "^2.1.2",
"inquirer": "^9.2.12",
"playwright": "*",
"zod": "^3.22.4"
"prettier": "^3.1.0",
"swagger-ui-express": "^5.0.0"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/cors": "^2.8.17",
"@types/express": "^4.17.21",
"@types/express-fileupload": "^1.4.4",
"@semantic-release/changelog": "^6.0.3",
"@semantic-release/git": "^10.0.1",
"@types/inquirer": "^9.0.7",
"@types/node": "^20.0.0",
"prettier": "^3.1.0",
"semantic-release": "^22.0.8",
"ts-node": "^10.8.0",
"typescript": "^5.0.0"
"typescript": "^5.0.0",
"@types/swagger-ui-express": "^4.1.6",
"swagger-autogen": "^2.23.7",
"zod": "^3.22.4"
},
"scripts": {
"semantic-release": "semantic-release",
"preinstall": "npx playwright install",
"start": "npm run start:dev",
"start:server": "NODE_ENV=development npm run build && node dist/src/server.js",
"start:server:prod": "npm run build && node dist/src/server.js",
"start:cli": "cross-env NODE_ENV=development npm run build && node dist/src/cli.js",
"start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
"start:prod": "node dist/src/main.js",
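
With these script changes, npm run start:server builds the project and starts the API server from dist/src/server.js in development mode, while start:server:prod does the same without forcing NODE_ENV. Once running, the Swagger UI wired up in src/server.ts below is served at /api-docs, on the host and port taken from API_HOST and API_PORT (falling back to localhost:3000).
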
4 changes: 3 additions & 1 deletion src/config.ts
@@ -1,6 +1,8 @@
import { z } from "zod";

import type { Page } from "playwright";
import { configDotenv } from "dotenv";

configDotenv();

const Page: z.ZodType<Page> = z.any();

42 changes: 39 additions & 3 deletions src/core.ts
@@ -5,8 +5,10 @@ import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import { isWithinTokenLimit } from "gpt-tokenizer";
import { PathLike } from "fs";

let pageCounter = 0;
let crawler: PlaywrightCrawler;

export function getPageHtml(page: Page, selector = "body") {
return page.evaluate((selector) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
if (process.env.NO_CRAWL !== "true") {
// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const crawler = new PlaywrightCrawler({
crawler = new PlaywrightCrawler({
// Use the requestHandler to process each of the crawled pages.
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
if (config.cookie) {
@@ -143,6 +145,7 @@ export async function crawl(config: Config) {
}

export async function write(config: Config) {
let nextFileNameString: PathLike = "";
const jsonFiles = await glob("storage/datasets/default/*.json", {
absolute: true,
});
@@ -163,8 +166,14 @@ export async function write(config: Config) {
`${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

const writeBatchToFile = async (): Promise<void> => {
await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
nextFileNameString = nextFileName();
await writeFile(
nextFileNameString,
JSON.stringify(currentResults, null, 2),
);
console.log(
`Wrote ${currentResults.length} items to ${nextFileNameString}`,
);
currentResults = [];
currentSize = 0;
fileCounter++;
@@ -213,4 +222,31 @@ export async function write(config: Config) {
if (currentResults.length > 0) {
await writeBatchToFile();
}

return nextFileNameString;
}

class GPTCrawlerCore {
config: Config;

constructor(config: Config) {
this.config = config;
}

async crawl() {
await crawl(this.config);
}

async write(): Promise<PathLike> {
// write() already resolves with the path of the last batch file it wrote
// (the name can change as batches roll over), so return its promise directly
return write(this.config);
}
}

export default GPTCrawlerCore;
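
A minimal usage sketch (not part of this diff) of driving GPTCrawlerCore directly rather than through the API server. Only outputFileName and cookie are visible in the config handling above, so the url, match, and maxPagesToCrawl field names below are assumptions about what configSchema accepts:

import { readFile } from "fs/promises";
import { configSchema } from "./config.js";
import GPTCrawlerCore from "./core.js";

// Validate a config, run the crawl, then read back the combined output file.
const config = configSchema.parse({
  url: "https://example.com/docs", // assumed field name
  match: "https://example.com/docs/**", // assumed field name
  maxPagesToCrawl: 45, // assumed field name, mirrors MAX_PAGES_TO_CRAWL
  outputFileName: "output.json",
});

const core = new GPTCrawlerCore(config);
await core.crawl();
const outputPath = await core.write(); // path of the last batch file written
console.log(await readFile(outputPath, "utf-8"));
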
44 changes: 44 additions & 0 deletions src/server.ts
@@ -0,0 +1,44 @@
import express from "express";
import cors from "cors";
import { readFile } from "fs/promises";
import { Config, configSchema } from "./config.js";
import { configDotenv } from "dotenv";
import swaggerUi from "swagger-ui-express";
// @ts-ignore
import swaggerDocument from "../swagger-output.json" assert { type: "json" };
import GPTCrawlerCore from "./core.js";
import { PathLike } from "fs";

configDotenv();

const app = express();
const port = Number(process.env.API_PORT) || 3000;
const hostname = process.env.API_HOST || "localhost";

app.use(cors());
app.use(express.json());
app.use("/api-docs", swaggerUi.serve, swaggerUi.setup(swaggerDocument));

// Define a POST route to accept config and run the crawler
app.post("/crawl", async (req, res) => {
const config: Config = req.body;
try {
const validatedConfig = configSchema.parse(config);
const crawler = new GPTCrawlerCore(validatedConfig);
await crawler.crawl();
const outputFileName: PathLike = await crawler.write();
const outputFileContent = await readFile(outputFileName, "utf-8");
res.contentType("application/json");
return res.send(outputFileContent);
} catch (error) {
return res
.status(500)
.json({ message: "Error occurred during crawling", error });
}
});

app.listen(port, hostname, () => {
console.log(`API server listening at http://${hostname}:${port}`);
});

export default app;
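
And a hedged client-side sketch of exercising the new endpoint; the request body must satisfy configSchema, with the same caveat about assumed field names. Port 5000 matches API_PORT in .env.example (the server itself falls back to 3000 when the variable is unset):

// POST a crawl config and collect the JSON the server sends back,
// which is the content of the output file written by core.write().
const response = await fetch("http://localhost:5000/crawl", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://example.com/docs", // assumed field name
    match: "https://example.com/docs/**", // assumed field name
    maxPagesToCrawl: 45, // assumed field name
    outputFileName: "output.json",
  }),
});

if (!response.ok) {
  // The route replies with a 500 and an error payload on any failure.
  throw new Error(`Crawl failed with status ${response.status}`);
}

const results = await response.json(); // array of crawled page records
console.log(`Received ${results.length} crawled pages`);
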
14 changes: 14 additions & 0 deletions swagger.js
@@ -0,0 +1,14 @@
import swaggerAutogen from "swagger-autogen";

const doc = {
info: {
title: "GPT Crawler API",
description: "GPT Crawler",
},
host: "localhost:5000",
};

const outputFile = "swagger-output.json";
const routes = ["./src/server.ts"];

swaggerAutogen()(outputFile, routes, doc);
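
Running this file through swagger-autogen generates swagger-output.json at the project root, which src/server.ts above imports and serves at /api-docs. How and when it is executed (for instance a one-off node invocation before the build) is not part of this diff; the localhost:5000 host value simply mirrors the defaults in .env.example.
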
1 change: 1 addition & 0 deletions tsconfig.json
@@ -4,6 +4,7 @@
"module": "ES2022",
"target": "ES2022",
"outDir": "dist",
"moduleResolution": "node",
"resolveJsonModule": true,
"noUnusedLocals": false,
"skipLibCheck": true,