From 33b93304c0ff9c83e2fb36276714d4687a7dbd1c Mon Sep 17 00:00:00 2001 From: hirsaeki <5356955+hirsaeki@users.noreply.github.com> Date: Thu, 28 Mar 2024 00:52:58 +0900 Subject: [PATCH] config.ts.example added --- .gitignore | 4 ++++ .tool-versions | 1 + config.ts | 5 ++--- config.ts.example | 17 +++++++++++++++++ 4 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 .tool-versions create mode 100644 config.ts.example diff --git a/.gitignore b/.gitignore index 3da7e097..00b8be7e 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,7 @@ storage *.json .env pnpm-lock.yaml + +# exclude working files +/gpt-crawler-y-upstream_project_summary.* +/config.ts diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 00000000..7bd5f0bf --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +nodejs 20.10.0 diff --git a/config.ts b/config.ts index d0ff0c6e..561906f3 100644 --- a/config.ts +++ b/config.ts @@ -4,8 +4,8 @@ export const defaultConfig: Config = { // url: "https://www.builder.io/c/docs/developers", // match: "https://www.builder.io/c/docs/**", url: "https://help.mypurecloud.com/articles/about-cx-cloud-from-genesys-and-salesforce/", - match: "https://help.mypurecloud.com/?p**", - maxPagesToCrawl: 100, + match: ["https://help.mypurecloud.com/?p**","https://help.salesforce.com/**"], + maxPagesToCrawl: 300, // outputFileName: "output.json", outputFileName: "cxcloud.json", waitTime: 1000, @@ -17,5 +17,4 @@ export const defaultConfig: Config = { // userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", maxTokens: 2000000, selector: "#main-content > div", - crawlInsideSelector: true, }; diff --git a/config.ts.example b/config.ts.example new file mode 100644 index 00000000..0aebc29c --- /dev/null +++ b/config.ts.example @@ -0,0 +1,17 @@ +import { Config } from "./src/config"; + +export const defaultConfig: Config = { + url: "https://www.builder.io/c/docs/developers", + match: "https://www.builder.io/c/docs/**", + maxPagesToCrawl: 300, + outputFileName: "output.json", + waitTime: 1000, + onVisitPage: async ({ visitPageWaitTime }) => { + await new Promise((resolve) => + setTimeout(resolve, visitPageWaitTime ?? 1000), + ); + }, + // userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + maxTokens: 2000000, + selector: "#main-content > div", +};