diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts b/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts
index f324876790..b406d5bfc4 100644
--- a/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts
+++ b/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts
@@ -9,6 +9,7 @@ export interface Input {
     globs: GlobInput[];
     regexps: RegExpInput[];
     excludes: GlobInput[];
+    transformRequestFunction?: string;
     pseudoUrls: PseudoUrlInput[];
     keepUrlFragments: boolean;
     linkSelector?: string;
diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts
index 8061759e48..9c56c1420b 100644
--- a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts
+++ b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts
@@ -19,6 +19,8 @@ import {
     log,
     Dictionary,
     Awaitable,
+    RequestOptions,
+    RequestTransform,
 } from '@crawlee/cheerio';
 import { Actor, ApifyEnv } from 'apify';
 import { load } from 'cheerio';
@@ -54,6 +56,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
     evaledPageFunction: (...args: unknown[]) => unknown;
     evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable)[];
     evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable)[];
+    evaledTransformRequestFunction?: RequestTransform;
     datasetName?: string;
     keyValueStoreName?: string;
     requestQueueName?: string;
@@ -97,6 +100,10 @@ export class CrawlerSetup implements CrawlerSetupOptions {
         // Functions need to be evaluated.
         this.evaledPageFunction = tools.evalFunctionOrThrow(this.input.pageFunction);
 
+        if (this.input.transformRequestFunction) {
+            this.evaledTransformRequestFunction = tools.evalFunctionOrThrow(this.input.transformRequestFunction) as RequestTransform;
+        }
+
         if (this.input.preNavigationHooks) {
             this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(this.input.preNavigationHooks, 'preNavigationHooks');
         } else {
@@ -335,22 +342,43 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             return;
         }
 
+        // Applied to every enqueued request: records the parent request and the crawl depth
+        // under META_KEY, and sets the unique-key and URL-fragment options.
+        const baseTransformRequestFunction = (requestOptions: RequestOptions) => {
+            requestOptions.userData ??= {};
+            requestOptions.userData[META_KEY] = {
+                parentRequestId: request.id || request.uniqueKey,
+                depth: currentDepth + 1,
+            };
+
+            requestOptions.useExtendedUniqueKey = true;
+            requestOptions.keepUrlFragment = this.input.keepUrlFragments;
+            return requestOptions;
+        };
+
+        let transformRequestFunction: RequestTransform;
+
+        if (this.evaledTransformRequestFunction) {
+            // Run the user-supplied transform first, then apply the base transform to the options
+            // it returned, so that a replacement object returned by the user is not discarded.
+            transformRequestFunction = (requestOptions: RequestOptions) => {
+                const updatedOptions = this.evaledTransformRequestFunction!(requestOptions);
+                if (updatedOptions) {
+                    return baseTransformRequestFunction(updatedOptions);
+                }
+                // A falsy result from the user transform is handed back unchanged.
+                return updatedOptions;
+            };
+        } else {
+            transformRequestFunction = baseTransformRequestFunction;
+        }
+
         await enqueueLinks({
             selector: this.input.linkSelector,
             pseudoUrls: this.input.pseudoUrls,
             globs: this.input.globs,
             exclude: this.input.excludes,
-            transformRequestFunction: (requestOptions) => {
-                requestOptions.userData ??= {};
-                requestOptions.userData[META_KEY] = {
-                    parentRequestId: request.id || request.uniqueKey,
-                    depth: currentDepth + 1,
-                };
-
-                requestOptions.useExtendedUniqueKey = true;
-                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
-                return requestOptions;
-            },
+            transformRequestFunction,
         });
     }
 