Skip to content

Commit

Permalink
Optional Selector to limit link extraction to be within it
Browse files Browse the repository at this point in the history
  • Loading branch information
monagjr authored Dec 16, 2023
1 parent 9d536ec commit 6f27bac
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 1 deletion.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ type Config = {
url: string;
/** Pattern to match against for links on a page to subsequently crawl */
match: string;
/** Optional selector that restricts which links on a page are picked up for crawling */
matchSelector?: string;
/** Selector to grab the inner text from */
selector: string;
/** Don't crawl more than this many pages */
Expand Down
7 changes: 6 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ export const configSchema = z.object({
* @default ""
*/
match: z.string().or(z.array(z.string())),

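/**
* Optional selector used to restrict which links are enqueued for crawling.
* Passed through as the `selector` option of `enqueueLinks`; when omitted,
* the value is `undefined` and no selector restriction is supplied here.
* @example "li > a.block"
*/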
matchSelector: z.string().optional(),
/**
* Selector to grab the inner text from
* @example ".docs-builder-container"
Expand Down
1 change: 1 addition & 0 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ export async function crawl(config: Config) {
await enqueueLinks({
globs:
typeof config.match === "string" ? [config.match] : config.match,
selector: config.matchSelector,
});
},
// Comment this option to scrape the full website.
Expand Down

0 comments on commit 6f27bac

Please sign in to comment.