Feat: Multiple Match Pattern Config; Pattern Avoid; Grab Content with innerHTML Compatible #97

Open · wants to merge 6 commits into main
9 changes: 8 additions & 1 deletion README.md
@@ -71,7 +71,14 @@ type Config = {
   /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
-  match: string;
+  match:
+    | string
+    | string[]
+    | {
+        pattern: string; // URL glob expression, see https://github.com/isaacs/minimatch
+        selector?: string | undefined; // Selector to grab the inner text from
+        skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
+      }[];
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
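For illustration, the new pattern-object form of `match` might be configured as follows. This snippet is hypothetical and not part of the PR: the URLs and selectors are examples, the import path is assumed, and unrelated required config fields are omitted.

```ts
import { Config } from "./src/config.js";

// Abbreviated config: only the fields relevant to this PR are shown.
// Plain `match: "glob"` and `match: ["glob", ...]` still work as before.
const config: Partial<Config> = {
  url: "https://www.builder.io/c/docs/developers",
  match: [
    {
      pattern: "https://www.builder.io/c/docs/**", // minimatch glob
      selector: ".docs-builder-container", // selector used only for this pattern
    },
    {
      pattern: "https://www.builder.io/c/blog/**",
      skip: true, // crawl through these pages but save no content from them
    },
  ],
  selector: "body", // fallback selector for non-pattern matches
  maxPagesToCrawl: 50,
};
```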
1 change: 1 addition & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -13,6 +13,7 @@
"glob": "^10.3.10",
"gpt-tokenizer": "^2.1.2",
"inquirer": "^9.2.12",
"minimatch": "^9.0.3",
"playwright": "*",
"zod": "^3.22.4"
},
35 changes: 33 additions & 2 deletions src/config.ts
@@ -4,6 +4,36 @@ import type { Page } from "playwright";

const Page: z.ZodType<Page> = z.any();

+/**
+ * Pattern to match against for links on a page to subsequently crawl
+ * @example "https://www.builder.io/c/docs/**"
+ * @default ""
+ */
+export const OriginMatch = z.string().or(z.array(z.string()));
+
+export const PatternMatch = z.array(
+  z.object({
+    /**
+     * Pattern to match against for links on a page to subsequently crawl
+     * @example "https://www.builder.io/c/docs/**"
+     * @see https://github.com/isaacs/minimatch
+     * @default ""
+     */
+    pattern: z.string(),
+    /**
+     * Selector to grab the inner text from, limited to this pattern
+     * @example ".docs-builder-container"
+     * @default "body"
+     */
+    selector: z.string().optional(),
+    /**
+     * Skip grabbing the inner text for this pattern
+     * @default false
+     */
+    skip: z.boolean().optional(),
+  }),
+);

export const configSchema = z.object({
/**
* URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap
@@ -17,8 +47,7 @@ export const configSchema = z.object({
* @example "https://www.builder.io/c/docs/**"
* @default ""
*/
-  match: z.string().or(z.array(z.string())),
-
+  match: OriginMatch.or(PatternMatch),
/**
* Selector to grab the inner text from
* @example ".docs-builder-container"
@@ -73,3 +102,5 @@ export const configSchema = z.object({
});

export type Config = z.infer<typeof configSchema>;
+export type PatternMatchType = z.infer<typeof PatternMatch>;
+export type OriginMatchType = z.infer<typeof OriginMatch>;
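Since zod schemas double as runtime guards, core.ts (below) distinguishes the two `match` shapes with `safeParse`. A quick illustration using only the schemas defined above; the import path and sample values are assumptions:

```ts
import { OriginMatch, PatternMatch } from "./src/config.js";

// A plain glob (or an array of globs) parses as OriginMatch...
console.log(OriginMatch.safeParse("https://www.builder.io/c/docs/**").success); // true
console.log(PatternMatch.safeParse("https://www.builder.io/c/docs/**").success); // false

// ...while an array of pattern objects parses as PatternMatch.
const patterns = [
  { pattern: "https://www.builder.io/c/docs/**", selector: ".docs-builder-container" },
  { pattern: "https://www.builder.io/c/blog/**", skip: true },
];
console.log(PatternMatch.safeParse(patterns).success); // true
```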
60 changes: 49 additions & 11 deletions src/core.ts
@@ -2,7 +2,15 @@
import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
-import { Config, configSchema } from "./config.js";
+import { minimatch } from "minimatch";
+import {
+  Config,
+  configSchema,
+  PatternMatch,
+  PatternMatchType,
+  OriginMatch,
+  OriginMatchType,
+} from "./config.js";
import { Page } from "playwright";
import { isWithinTokenLimit } from "gpt-tokenizer";

@@ -24,7 +32,7 @@ export function getPageHtml(page: Page, selector = "body") {
} else {
// Handle as a CSS selector
const el = document.querySelector(selector) as HTMLElement | null;
-      return el?.innerText || "";
+      return el?.innerText || el?.innerHTML || "";
**marcelovicentegc** (Contributor) commented on Dec 4, 2023:

> It occurs to me that this could introduce some undesired content to the crawler output, since it would also grab script contents and whitespace. For example, with
>
>     <div>
>       <script>alert("Hello!");</script>
>     </div>
>
> the crawler would grab `<script>alert("Hello!");</script>` as output (depending, of course, on the selectors config).
>
> I'm likely not seeing the whole picture, though. Can you share some practical examples you intended to cover with these changes, @FTAndy?

**FTAndy** (Author) replied:

> @marcelovicentegc Yes. When I grab code-block content from a `textarea` tag in a GitHub project, I can't get the code through the `innerText` API, because a textarea is a form-control element. Falling back to `innerHTML` is a compatible way to grab content when the element is a form control.
>
> I also think the `value` API would be more precise in this context, because it does not transform symbols like `>` into `&gt;`. What do you think?

**FTAndy** (Author) followed up:

> I do think the `value` API is a good idea; it is limited to input-type elements.
}
}, selector);
}
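To make the trade-off discussed in that thread concrete, here is a small sketch of how the three APIs differ for form-control elements, including the `value` fallback the author mentions. This is hypothetical and not part of the PR; the `grabText` helper is invented for illustration:

```ts
// For <textarea>code &gt; sample</textarea>:
//   el.innerText -> reported in the thread to miss form-control contents
//   el.innerHTML -> "code &gt; sample" (entities stay HTML-escaped)
//   el.value     -> "code > sample"    (the control's actual, unescaped text)
function grabText(el: HTMLElement | null): string {
  if (!el) return "";
  if (el instanceof HTMLTextAreaElement || el instanceof HTMLInputElement) {
    return el.value; // precise for form controls, per the discussion above
  }
  return el.innerText || el.innerHTML || ""; // the fallback chain this PR ships
}
```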
@@ -71,8 +79,40 @@ export async function crawl(config: Config) {
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
);

-      // Use custom handling for XPath selector
-      if (config.selector) {
+      let globs: string | string[] = [];
+
+      if (PatternMatch.safeParse(config.match).success) {
+        const matchPattern = config.match as PatternMatchType;
+        globs = matchPattern.map((s) => s.pattern);
+        const matchedPattern = matchPattern.find((match) => {
+          return minimatch(request.url, match.pattern);
+        });
+        if (matchedPattern && !matchedPattern.skip) {
+          const selector = matchedPattern?.selector || "body";
+          // Use custom handling for XPath selector
+          if (selector.startsWith("/")) {
+            await waitForXPath(
+              page,
+              selector,
+              config.waitForSelectorTimeout ?? 1000,
+            );
+          } else {
+            await page.waitForSelector(selector, {
+              timeout: config.waitForSelectorTimeout ?? 1000,
+            });
+          }
+          const html = await getPageHtml(page, selector);
+
+          // Save results as JSON to ./storage/datasets/default
+          await pushData({ title, url: request.loadedUrl, html });
+        }
+      } else if (
+        OriginMatch.safeParse(config.match).success &&
+        config.selector
+      ) {
+        const match = config.match as OriginMatchType;
+        globs = typeof match === "string" ? [match] : match;
+        // Use custom handling for XPath selector
if (config.selector.startsWith("/")) {
await waitForXPath(
page,
@@ -84,12 +124,11 @@
timeout: config.waitForSelectorTimeout ?? 1000,
});
}
-      }
-      const html = await getPageHtml(page, config.selector);
-
-      // Save results as JSON to ./storage/datasets/default
-      await pushData({ title, url: request.loadedUrl, html });
+        const html = await getPageHtml(page, config.selector);
+
+        // Save results as JSON to ./storage/datasets/default
+        await pushData({ title, url: request.loadedUrl, html });
+      }

if (config.onVisitPage) {
await config.onVisitPage({ page, pushData });
@@ -98,8 +137,7 @@
// Extract links from the current page
// and add them to the crawling queue.
await enqueueLinks({
-        globs:
-          typeof config.match === "string" ? [config.match] : config.match,
+        globs,
});
},
// Comment this option to scrape the full website.
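For reference, the per-URL routing the new branch performs can be reproduced in isolation. A minimal sketch using minimatch and the pattern shape from this PR; the URLs, selectors, and the `selectorFor` helper are hypothetical:

```ts
import { minimatch } from "minimatch";

type MatchPattern = { pattern: string; selector?: string; skip?: boolean };

const patterns: MatchPattern[] = [
  { pattern: "https://example.com/docs/**", selector: ".docs" },
  { pattern: "https://example.com/changelog/**", skip: true },
];

// Every pattern feeds enqueueLinks as a glob, so links under all patterns
// keep being discovered and crawled...
const globs = patterns.map((p) => p.pattern);

// ...but content is only extracted for the first pattern matching the current
// URL, and only when that pattern is not marked `skip`.
function selectorFor(url: string): string | null {
  const matched = patterns.find((p) => minimatch(url, p.pattern));
  if (!matched || matched.skip) return null; // skip: crawl through, save nothing
  return matched.selector || "body";
}

console.log(globs.length); // 2
console.log(selectorFor("https://example.com/docs/getting-started")); // ".docs"
console.log(selectorFor("https://example.com/changelog/v2")); // null
```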