diff --git a/.prettierignore b/.prettierignore
new file mode 100644
index 00000000..fe827dbd
--- /dev/null
+++ b/.prettierignore
@@ -0,0 +1,30 @@
+# Ignore artifacts
+
+node_modules
+.github
+storage
+outputs
+*.code-workspace
+
+## This file tells which files shouldn't be added to source control
+
+.idea
+dist
+node_modules
+apify_storage
+crawlee_storage
+storage
+.DS_Store
+
+## any output from the crawler
+
+*.json
+pnpm-lock.yaml
+
+## Final outputs folder
+
+outputs
+
+## VS Code workspace files
+
+*.code-workspace
diff --git a/README.md b/README.md
index a05d751d..33c67488 100644
--- a/README.md
+++ b/README.md
@@ -66,52 +66,110 @@ export const defaultConfig: Config = {

 See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:

-```ts
+````ts
 type Config = {
-
-  /** Required - URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
+  /**
+   * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap
+   * @example "https://www.builder.io/c/docs/developers"
+   * @example "https://www.builder.io/sitemap.xml"
+   * @default ""
+   * @required
+   */
   url: string;
-
-  /** Required - Pattern to match against for links on a page to subsequently crawl */
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
   match: string;
-
-  /** Optional - Selector to grab the inner text from */
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
   selector: string;
-
-  /** Optional - Don't crawl more than this many pages (0 = Crawl all, Default = 50)*/
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
   maxPagesToCrawl: number;
-
-  /** Optional - File name for the finished data */
+  /**
+   * File name for the finished data
+   * @example "output.json"
+   */
   outputFileName: string;
-
-  /** Optional - Timeout for waiting for a selector to appear */
-  waitForSelectorTimeout: number;
-
-  /** Optional - Resource file extensions to exclude from crawl
-   *
+  /**
+   * Cookie to be set. E.g. for Cookie Consent
+   */
+  cookie?: {
+    name: string,
+    value: string,
+    url: string,
+  };
+  /**
+   * Function to run for each page found
+   */
+  onVisitPage?: (page: object, data: string) => Promise<void>;
+  /**
+   * Timeout to wait for a selector to appear
+   */
+  waitForSelectorTimeout: number;
+  /**
+   * Resource file extensions to exclude from crawl
    * @example
    * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
    */
   resourceExclusions?: string[];

-  /** Optional maximum file size in megabytes to include in the output file */
+  /**
+   * Maximum file size in megabytes to include in the output file
+   * @example 1
+   */
   maxFileSize?: number;

-  /** Optional maximum number tokens to include in the output file */
+  /**
+   * The maximum number of tokens to include in the output file
+   * @example 5000
+   */
   maxTokens?: number;

-  /** Optional - Maximum concurent parellel requets at a time */
+  /**
+   * Maximum concurrent parallel requests at a time
+   * @example
+   * Specific number of parallel requests
+   * ```ts
+   * maxConcurrency: 2;
+   * ```
+   * @example
+   * 0 = Unlimited, Doesn't stop until cancelled
+   * ```ts
+   * maxConcurrency: 0;
+   * ```
+   * @example
+   * undefined = max parallel requests possible
+   * ```ts
+   * maxConcurrency: undefined;
+   * ```
+   * @default 1
+   */
   maxConcurrency?: number;
-
-  /** Optional - waitPerPageCrawlTimeoutRange is a object containing a min and max each for the number of milliseconds to wait after each page crawl.
-   * Use waitPerPageCrawlTimeoutRange to handle rate limiting.
-   */
+  /**
+   * Range for random number of milliseconds between **min** and **max** to wait after each page crawl
+   * @default {min:1000,max:1000}
+   * @example {min:1000, max:2000}
+   */
   waitPerPageCrawlTimeoutRange?: {
-    min: number,
+    min: number,
     max: number,
   };
   /** Optional - Boolean parameter to use PlayWright with displayed browser or headless ( default headless=True ). */
+  /**
+   * Headless mode
+   * @default true
+   */
   headless?: boolean;
 };
-```
+````

 #### Run your crawler

@@ -125,6 +183,22 @@ npm start
 To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container.
+#### [Running as a CLI](#running-as-a-cli)
+
+To run the `./dist/cli.ts` command line interface, follow these instructions:
+
+1. Open a terminal.
+2. Navigate to the root directory of the project.
+3. Run the following command: `./dist/cli.ts [arguments]`
+   Replace `[arguments]` with the appropriate command line arguments for your use case.
+4. The CLI will execute the specified command and display the output in the terminal.
+
+> Note: Make sure you have the necessary dependencies installed and the project has been built before running the CLI.
+
+#### [Development](#development)
+
+> Instructions for Development will go here...
+
 ### Upload your data to OpenAI

 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
diff --git a/config.ts b/config.ts
index e6df4001..e289244f 100644
--- a/config.ts
+++ b/config.ts
@@ -1,32 +1,33 @@
 import { Config } from "./src/config";
-import { fileURLToPath } from 'url';
-import { dirname } from 'path';
+import { fileURLToPath } from "url";
+import { dirname } from "path";

 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);

-const starting_url = "https://www.builder.io/c/docs/developers";
-const url_prefix = "https://"
+const startingUrl = "https://www.builder.io/c/docs/developers";
+const urlPrefix = "https://";
 const domain = "www.builder.io";
-const url_suffix = "/c/docs";
-const base_url = url_prefix + domain;
-const match_url_prefix = base_url + url_suffix;
-const match_url = match_url_prefix + "/**";
+const urlSuffix = "/c/docs";
+const baseUrl = urlPrefix + domain;
+const matchUrl_prefix = baseUrl + urlSuffix;
+const matchUrl = matchUrl_prefix + "/**";

 // Now date stamp for output file name
 const now = new Date();
-const date = now.toISOString().split('T')[0];
-const time = now.toTimeString().split(' ')[0];
-const outputs_dir = __dirname.split('/').slice(0, -1).join('/') + '/outputs';
+const date = now.toISOString().split("T")[0];
+const time = now.toTimeString().split(" ")[0];
+const outputs_dir = __dirname.split("/").slice(0, -1).join("/") + "/outputs";

-const outputFileName = outputs_dir + "/" + domain + "-" + date + "-" + time + ".json";
+const outputFileName =
+  outputs_dir + "/" + domain + "-" + date + "-" + time + ".json";

 export const defaultConfig: Config = {
-  url: starting_url,
-  match: match_url,
+  url: startingUrl,
+  match: matchUrl,
   maxPagesToCrawl: 50,
-  outputFileName: outputFileName,
-  waitPerPageCrawlTimeoutRange: {min:1000, max:1000},
+  outputFileName: outputFileName,
+  waitPerPageCrawlTimeoutRange: { min: 1000, max: 1000 },
   headless: true,
   maxConcurrency: 1,
 };
diff --git a/src/config.ts b/src/config.ts
index 2d6bd45b..d5417738 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -6,54 +6,48 @@ const Page: z.ZodType<Page> = z.any();
 export const configSchema = z.object({
   /**
-   * **Required:**
    * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap
    * @example "https://www.builder.io/c/docs/developers"
    * @example "https://www.builder.io/sitemap.xml"
    * @default ""
+   * @required
    */
   url: z.string(),
   /**
-   * **Required:**
    * Pattern to match against for links on a page to subsequently crawl
    * @example "https://www.builder.io/c/docs/**"
    * @default ""
+   * @required
    */
   match: z.string().or(z.array(z.string())),
-  /**
-   * **Optional:**
+  /**
    * Selector to grab the inner text from
    * @example ".docs-builder-container"
    * @default ""
    */
   selector: z.string().optional(),
   /**
-   * **Optional:**
    * Don't crawl more than this many pages
    * @default 50
    */
   maxPagesToCrawl: z.number().int().nonnegative().or(z.undefined()).optional(),
   /**
-   * **Optional:**
    * File name for the finished data
-   * @default "output.json"
+   * @example "output.json"
    */
   outputFileName: z.string(),
-  /**
-   * **Optional:**
-   * Cookie to be set. E.g. for Cookie Consent
-   * */
+  /**
+   * Cookie to be set. E.g. for Cookie Consent
+   */
   cookie: z
     .object({
       name: z.string(),
       value: z.string(),
     })
     .optional(),
-  /**
-   * **Optional:**
-   * Function to run for each page found
-   * */
+  /**
+   * Function to run for each page found
+   */
   onVisitPage: z
     .function()
     .args(
@@ -64,52 +58,60 @@
     )
     .returns(z.promise(z.void()))
     .optional(),
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout: z.number().int().nonnegative().optional(),
-  /**
-   * **Optional:**
-   * Resources to exclude
-   *
+  /**
+   * Resources to exclude
    * @example
    * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
    */
   resourceExclusions: z.array(z.string()).optional(),
-
-  /**
-   * **Optional:**
+  /**
    * Maximum file size in megabytes to include in the output file
    * @example 1
    */
   maxFileSize: z.number().int().positive().optional(),
-
-  /**
-   * **Optional:**
+  /**
    * The maximum number tokens to include in the output file
    * @example 5000
    */
   maxTokens: z.number().int().positive().optional(),
-  /**
-   * **Optional:**
-   * Range for random number of milliseconds between **min** and **max** to wait after each page crawl
+  /**
+   * Maximum concurrent parallel requests at a time
+   * @example
+   * Specific number of parallel requests
+   * ```ts
+   * maxConcurrency: 2;
+   * ```
+   * @example
+   * 0 = Unlimited, Doesn't stop until cancelled
+   * ```ts
+   * maxConcurrency: 0;
+   * ```
+   * @example
+   * undefined = max parallel requests possible
+   * ```ts
+   * maxConcurrency: undefined;
+   * ```
+   * @default 1
+   */
+  maxConcurrency: z.number().int().nonnegative().optional(),
+  /**
+   * Range for random number of milliseconds between **min** and **max** to wait after each page crawl
    * @default {min:1000,max:1000}
-   * */
-  waitPerPageCrawlTimeoutRange: z.object({
+   * @example {min:1000,max:2000}
+   */
+  waitForSelectorTimeout: z.number().int().nonnegative().optional(),
+  waitPerPageCrawlTimeoutRange: z
+    .object({
       min: z.number().int().nonnegative(),
       max: z.number().int().nonnegative(),
-  }).optional(),
-  /**
-   * **Optional:**
-   * Headless mode
-   * @default true
-   */
-  headless: z.boolean().optional(),
-  /**
-   * **Optional:**
-   * maxConcurrency
-   * description: ( 0 = Unlimited, Doesn't stop until cancelled, undefined = max parellel requests possible )
-   * @default 1
-   * */
-  maxConcurrency: z.number().int().nonnegative().optional(),
+    })
+    .optional(),
+  /**
+   * Headless mode
+   * @default true
+   */
+  headless: z.boolean().optional(),
 });

 export type Config = z.infer<typeof configSchema>;
diff --git a/src/core.ts b/src/core.ts
index 78b95bd0..7741c108 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -47,11 +47,10 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
 }

 export async function crawl(config: Config) {
-
   // Function to delay the next crawl
   function delay(time: number) {
-    return new Promise(function(resolve) {
-      setTimeout(resolve, time)
+    return new Promise(function (resolve) {
+      setTimeout(resolve, time);
     });
   }

@@ -66,9 +65,11 @@ export async function crawl(config: Config) {
   // Warn if unlimited crawling is enabled
   if (config.maxPagesToCrawl == 0) {
     config.maxPagesToCrawl = undefined;
-    log.warningOnce(`maxPagesToCrawl is set to ${config.maxPagesToCrawl} which means it will contine until it cannot find anymore links defined by match: ${config.match}`);
+    log.warningOnce(
+      `maxPagesToCrawl is set to ${config.maxPagesToCrawl} which means it will continue until it cannot find any more links defined by match: ${config.match}`,
+    );
   }
-
+
   if (config.cookie) {
     // Set the cookie for the specific URL
     const cookie = {
@@ -81,10 +82,11 @@
       const title = await page.title();

       // Display the pageCounter/maxPagesToCrawl number or pageCounter/∞ if maxPagesToCrawl=0
-      const maxPagesToCrawlDisplay = config.maxPagesToCrawl == undefined ? "∞" : config.maxPagesToCrawl;
+      const maxPagesToCrawlDisplay =
+        config.maxPagesToCrawl == undefined ? "∞" : config.maxPagesToCrawl;
       pageCounter++;
       log.info(
-        `Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...`
+        `Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...`,
       );

       // Use custom handling for XPath selector
@@ -120,19 +122,25 @@
       // Use waitPerPageCrawlTimeoutRange to handle rate limiting
       if (config.waitPerPageCrawlTimeoutRange) {
         // Create a random number between min and max
-        const randomTimeout = Math.floor(Math.random() * (config.waitPerPageCrawlTimeoutRange.max - config.waitPerPageCrawlTimeoutRange.min + 1) + config.waitPerPageCrawlTimeoutRange.min);
+        const randomTimeout = Math.floor(
+          Math.random() *
+            (config.waitPerPageCrawlTimeoutRange.max -
+              config.waitPerPageCrawlTimeoutRange.min +
+              1) +
+            config.waitPerPageCrawlTimeoutRange.min,
+        );
         log.info(
-          `Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...`
+          `Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...`,
         );
         // Wait for the random amount of time before crawling the next page
         await delay(randomTimeout);
-      }else{
+      } else {
         // Wait for 1 second before crawling the next page
         await delay(1000);
       }
     },
-    maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency
-    maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website.
+    maxConcurrency: config.maxConcurrency || 1, // Set the max concurrency
+    maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website.
     headless: config.headless ?? true, // Set to false to see the browser in action
     preNavigationHooks: [
      // Abort requests for certain resource types