Skip to content

Commit

Permalink
Merge branch 'main' into multiple-files
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermoscript committed Nov 29, 2023
2 parents 69d895e + 0c53280 commit 98a645a
Show file tree
Hide file tree
Showing 9 changed files with 74 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ jobs:
- run: npm run build
- uses: preactjs/compressed-size-action@v2
with:
pattern: ".dist/**/*.{js,ts,json}"
pattern: ".dist/**/*.{js,ts,json}"
6 changes: 3 additions & 3 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-node@v2
with:
cache: npm
node-version: 18
cache: npm
node-version: 18
- run: npm i
- run: npm run build
- run: npm run semantic-release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
18 changes: 18 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# CI workflow that runs the repository's Prettier formatting check.
name: Test workflow

# Triggered by both push and pull_request events.
on: [push, pull_request]

jobs:
# Single job: install dependencies, then run the formatting check.
prettier_check:
runs-on: ubuntu-latest

steps:
# Fetch the repository contents so later steps can operate on them.
- uses: actions/checkout@v3
- name: Set up Node.js
uses: actions/setup-node@v2
with:
# Quoted so the version is passed as a string.
node-version: "20"
- name: Install Dependencies
run: npm ci
- name: Run prettier
# Invokes the `prettier:check` script defined in package.json.
run: npm run prettier:check
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,16 @@ type Config = {
maxPagesToCrawl: number;
/** File name for the finished data */
outputFileName: string;
/** Optional resources to exclude
*
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
resourceExclusions?: string[];
*/
resourceExclusions?: string[];
/** Optional maximum file size in megabytes to include in the output file */
maxFileSize?: number,
/** Optional maximum number of tokens to include in the output file */
maxTokens?: number().,
maxTokens?: number,
};
```

Expand Down
2 changes: 1 addition & 1 deletion containerapp/data/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ export const defaultConfig: Config = {
match: "https://www.builder.io/c/docs/**",
maxPagesToCrawl: 50,
outputFileName: "../data/output.json",
};
};
7 changes: 4 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@builder.io/gpt-crawler",
"version": "1.0.0",
"version": "1.1.0",
"type": "module",
"bin": {
"gpt-crawler": "./dist/src/cli.js"
Expand All @@ -14,7 +14,6 @@
"gpt-tokenizer": "^2.1.2",
"inquirer": "^9.2.12",
"playwright": "*",
"prettier": "^3.1.0",
"zod": "^3.22.4"
},
"devDependencies": {
Expand All @@ -23,6 +22,7 @@
"@semantic-release/git": "^10.0.1",
"@types/inquirer": "^9.0.7",
"@types/node": "^20.0.0",
"prettier": "^3.1.0",
"semantic-release": "^22.0.8",
"ts-node": "^10.8.0",
"typescript": "^5.0.0"
Expand All @@ -35,7 +35,8 @@
"start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
"start:prod": "node dist/src/main.js",
"build": "tsc",
"fmt": "prettier --write ."
"fmt": "prettier --write .",
"prettier:check": "prettier --check ."
},
"author": "It's not you it's me",
"license": "ISC"
Expand Down
40 changes: 21 additions & 19 deletions src/config.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { z } from 'zod';
import { z } from "zod";

import type { Page } from "playwright";

Expand Down Expand Up @@ -36,27 +36,30 @@ export const configSchema = z.object({
*/
outputFileName: z.string(),
/** Optional cookie to be set. E.g. for Cookie Consent */
cookie: z.object({
name: z.string(),
value: z.string(),
}).optional(),
cookie: z
.object({
name: z.string(),
value: z.string(),
})
.optional(),
/** Optional function to run for each page found */
onVisitPage: z.function()
.args(z.object({
onVisitPage: z
.function()
.args(
z.object({
page: Page,
pushData: z.function()
.args(z.any())
.returns(z.promise(z.void()))
}))
.returns(z.promise(z.void()))
.optional(),
pushData: z.function().args(z.any()).returns(z.promise(z.void())),
}),
)
.returns(z.promise(z.void()))
.optional(),
/** Optional timeout for waiting for a selector to appear */
waitForSelectorTimeout: z.number().int().nonnegative().optional(),
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
resourceExclusions: z.array(z.string()).optional(),

/** Optional maximum file size in megabytes to include in the output file
Expand All @@ -70,4 +73,3 @@ export const configSchema = z.object({
});

export type Config = z.infer<typeof configSchema>;

30 changes: 17 additions & 13 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import {Config, configSchema} from "./config.js";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import {
isWithinTokenLimit,
Expand All @@ -19,7 +19,7 @@ export function getPageHtml(page: Page, selector = "body") {
document,
null,
XPathResult.ANY_TYPE,
null
null,
);
let result = elements.iterateNext();
return result ? result.textContent || "" : "";
Expand All @@ -39,16 +39,16 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
document,
null,
XPathResult.ANY_TYPE,
null
null,
);
return elements.iterateNext() !== null;
},
xpath,
{ timeout }
{ timeout },
);
}

export async function crawl(config: Config) {
export async function crawl(config: Config) {
configSchema.parse(config);

if (process.env.NO_CRAWL !== "true") {
Expand All @@ -70,7 +70,7 @@ export async function crawl(config: Config) {
const title = await page.title();
pageCounter++;
log.info(
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
);

// Use custom handling for XPath selector
Expand All @@ -79,7 +79,7 @@ export async function crawl(config: Config) {
await waitForXPath(
page,
config.selector,
config.waitForSelectorTimeout ?? 1000
config.waitForSelectorTimeout ?? 1000,
);
} else {
await page.waitForSelector(config.selector, {
Expand Down Expand Up @@ -116,21 +116,25 @@ export async function crawl(config: Config) {
if (RESOURCE_EXCLUSTIONS.length === 0) {
return;
}
await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
log.info(`Aborting requests for as this is a resource excluded route`);
}
await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
route.abort("aborted"),
);
log.info(
`Aborting requests for as this is a resource excluded route`,
);
},
],
});

const SITEMAP_SUFFIX = "sitemap.xml";
const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);

if (isUrlASitemap) {
const listOfUrls = await downloadListOfUrls({ url: config.url });

// Add the initial URL to the crawling queue.
await crawler.addRequests(listOfUrls);

// Run the crawler
await crawler.run();
} else {
Expand Down

0 comments on commit 98a645a

Please sign in to comment.