import type { Meta } from "..";
import type { Document } from "../../../controllers/v1/types";

/**
 * Matches a markdown image whose target is an inline base64 `data:image/...` URI.
 *
 * - Group 1 captures the `![alt]` part so it can be kept in the replacement.
 * - The media-type segment (`[^,)...]*`) cannot contain `,` or `)`, so the
 *   `;base64,` marker must belong to the same URI that opened with
 *   `data:image/`. A lazy `.*?` here would let one match start at a non-base64
 *   data URI and end at a later base64 image, deleting the text in between.
 * - The payload segment (`[^)...]*`) stops at the first `)` and, like `.`,
 *   never crosses a line terminator.
 */
const BASE64_IMAGE_PATTERN =
  /(!\[.*?\])\(data:image\/[^,)\r\n\u2028\u2029]*;base64,[^)\r\n\u2028\u2029]*\)/g;

/**
 * Transformer that strips inline base64 image payloads from a document's markdown.
 *
 * Each `![alt](data:image/...;base64,...)` becomes `![alt]()`: the alt text is
 * kept and the data URI is dropped. Images with any other target are left
 * untouched.
 *
 * The transformer is a no-op unless `meta.options.removeBase64Images` is truthy
 * and `document.markdown` is defined.
 *
 * @param meta - Scrape context; only `meta.options.removeBase64Images` is read.
 * @param document - Document to process. Its `markdown` field is updated in place.
 * @returns The same `document` instance that was passed in.
 */
export function removeBase64Images(meta: Meta, document: Document): Document {
  if (meta.options.removeBase64Images && document.markdown !== undefined) {
    // String.prototype.replace with a global regex starts from index 0 on every
    // call, so sharing the module-level pattern across calls is safe.
    document.markdown = document.markdown.replace(BASE64_IMAGE_PATTERN, "$1()");
  }
  return document;
}