From 4d1f92f4c8c36403022428285a03621fd90d62ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 29 Dec 2024 17:34:36 +0100 Subject: [PATCH] fix(scrapeURL/fetch): block loopback and link-local IPs --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 9 +++ .../scraper/scrapeURL/engines/fetch/index.ts | 38 ++++++++---- .../scrapeURL/engines/utils/downloadFile.ts | 12 +--- .../scrapeURL/engines/utils/safeFetch.ts | 60 +++++++++++++++++++ 5 files changed, 98 insertions(+), 22 deletions(-) create mode 100644 apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts diff --git a/apps/api/package.json b/apps/api/package.json index c4e70901d..00c1bc0e1 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -86,6 +86,7 @@ "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.4.1", + "ip-address": "^10.0.1", "joplin-turndown-plugin-gfm": "^1.0.12", "json-schema-to-zod": "^2.3.0", "keyword-extractor": "^0.0.28", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 569eafd9c..17532d257 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -113,6 +113,9 @@ importers: ioredis: specifier: ^5.4.1 version: 5.4.1 + ip-address: + specifier: ^10.0.1 + version: 10.0.1 joplin-turndown-plugin-gfm: specifier: ^1.0.12 version: 1.0.12 @@ -2690,6 +2693,10 @@ packages: resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==} engines: {node: '>=12.22.0'} + ip-address@10.0.1: + resolution: {integrity: sha512-NWv9YLW4PoW2B7xtzaS3NCot75m6nK7Icdv0o3lfMceJVRfSoQwqD4wEH5rLwoKJwUiZ/rfpiVBhnaF0FK4HoA==} + engines: {node: '>= 12'} + ip-address@9.0.5: resolution: {integrity: sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==} engines: {node: '>= 12'} @@ -7845,6 +7852,8 @@ snapshots: transitivePeerDependencies: - supports-color + ip-address@10.0.1: {} + ip-address@9.0.5: dependencies: jsbn: 1.1.0 diff --git 
a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index a0c8eaba2..eefd4c0fb 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -1,7 +1,9 @@ +import * as undici from "undici"; import { EngineScrapeResult } from ".."; import { Meta } from "../.."; import { TimeoutError } from "../../error"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; +import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch"; export async function scrapeURLWithFetch( meta: Meta, @@ -9,19 +11,29 @@ export async function scrapeURLWithFetch( ): Promise { const timeout = timeToRun ?? 300000; - const response = await Promise.race([ - fetch(meta.url, { - redirect: "follow", - headers: meta.options.headers, - }), - (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); - throw new TimeoutError( - "Fetch was unable to scrape the page before timing out", - { cause: { timeout } }, - ); - })(), - ]); + let response: undici.Response; + try { + response = await Promise.race([ + undici.fetch(meta.url, { + dispatcher: await makeSecureDispatcher(meta.url), + redirect: "follow", + headers: meta.options.headers, + }), + (async () => { + await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); + throw new TimeoutError( + "Fetch was unable to scrape the page before timing out", + { cause: { timeout } }, + ); + })(), + ]); + } catch (error) { + if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) { + throw error.cause; + } else { + throw error; + } + } specialtyScrapeCheck( meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index e2e3ee6fd..d4932f34b 100644 --- 
a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -5,6 +5,7 @@ import { EngineError } from "../../error"; import { Writable } from "stream"; import { v4 as uuid } from "uuid"; import * as undici from "undici"; +import { makeSecureDispatcher } from "./safeFetch"; export async function fetchFileToBuffer(url: string): Promise<{ response: Response; @@ -28,16 +29,9 @@ export async function downloadFile( const tempFileWrite = createWriteStream(tempFilePath); // TODO: maybe we could use tlsclient for this? for proxying - // use undici to ignore SSL for now - const response = await undici.fetch(url, { - dispatcher: new undici.Agent({ - connect: { - rejectUnauthorized: false, - }, - }), - }); + const response = await undici.fetch(url, { dispatcher: await makeSecureDispatcher(url) }); - // This should never happen in the current state of JS (2024), but let's check anyways. + // This should never happen in the current state of JS/Undici (2024), but let's check anyways. 
if (response.body === null) { throw new EngineError("Response body was null", { cause: { response } }); }
diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts
new file mode 100644
index 000000000..eb6f24598
--- /dev/null
+++ b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts
@@ -0,0 +1,137 @@
+import type { Socket } from "net";
+import type { TLSSocket } from "tls";
+import * as undici from "undici";
+import { Address6 } from "ip-address";
+
+/**
+ * Raised when a connection reaches an address that scraping must never touch
+ * (loopback, link-local, private or otherwise reserved ranges).
+ */
+export class InsecureConnectionError extends Error {
+  constructor() {
+    super("Connection violated security rules.");
+    this.name = "InsecureConnectionError";
+  }
+}
+
+/**
+ * Reports whether a dotted-quad IPv4 address lies in a range that is not
+ * publicly routable (private, loopback, link-local, documentation, multicast,
+ * reserved).
+ *
+ * Input that is not exactly four decimal octets in 0-255 is reported as
+ * private, so an address that cannot be classified is never treated as safe.
+ */
+function isIPv4Private(address: string): boolean {
+  const octets = address
+    .split(".")
+    .map((part) => (/^\d{1,3}$/.test(part) ? parseInt(part, 10) : NaN));
+  if (octets.length !== 4 || octets.some((octet) => Number.isNaN(octet) || octet > 255)) {
+    return true;
+  }
+
+  const [a, b, c] = octets;
+  return (
+    // 0.0.0.0/8: current (local, "this") network
+    a === 0 ||
+    // 10.0.0.0/8: private network
+    a === 10 ||
+    // 100.64.0.0/10: shared address space for carrier-grade NAT
+    (a === 100 && b >= 64 && b < 128) ||
+    // 127.0.0.0/8: loopback
+    a === 127 ||
+    // 169.254.0.0/16: link-local
+    (a === 169 && b === 254) ||
+    // 172.16.0.0/12: private network
+    (a === 172 && b >= 16 && b < 32) ||
+    // 192.0.0.0/24: IETF Protocol Assignments (includes DS-Lite)
+    (a === 192 && b === 0 && c === 0) ||
+    // 192.0.2.0/24: TEST-NET-1, documentation and examples
+    (a === 192 && b === 0 && c === 2) ||
+    // 192.88.99.0/24: reserved, formerly IPv6-to-IPv4 relay
+    (a === 192 && b === 88 && c === 99) ||
+    // 192.168.0.0/16: private network
+    (a === 192 && b === 168) ||
+    // 198.18.0.0/15: benchmark testing between separate subnets
+    (a === 198 && b >= 18 && b < 20) ||
+    // 198.51.100.0/24: TEST-NET-2, documentation and examples
+    (a === 198 && b === 51 && c === 100) ||
+    // 203.0.113.0/24: TEST-NET-3, documentation and examples
+    (a === 203 && b === 0 && c === 113) ||
+    // 224.0.0.0/4 multicast, 240.0.0.0/4 reserved, 255.255.255.255 broadcast
+    a >= 224
+  );
+}
+
+/** Matches an IPv4-mapped IPv6 address with a dotted-quad tail, e.g. "::ffff:127.0.0.1". */
+const IPV4_MAPPED_PATTERN = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/i;
+
+/**
+ * Reports whether an IPv6 address must not be connected to.
+ *
+ * IPv4-mapped addresses are judged by the IPv4 rules, since they lead to an
+ * IPv4 host. Everything else is private unless `ip-address` gives it the
+ * "Global" scope. An address that fails to parse is reported as private.
+ */
+function isIPv6Private(ipv6: string): boolean {
+  const mapped = IPV4_MAPPED_PATTERN.exec(ipv6);
+  if (mapped !== null) {
+    return isIPv4Private(mapped[1]);
+  }
+
+  try {
+    return new Address6(ipv6).getScope() !== "Global";
+  } catch {
+    return true;
+  }
+}
+
+/**
+ * Builds an undici agent that destroys any connection whose remote address is
+ * private, loopback, link-local or otherwise reserved.
+ *
+ * The check runs on the agent's "connect" event, against the address the
+ * socket reports as its remote end.
+ *
+ * @param url Target URL. Not read here; kept so existing call sites compile.
+ * @param options Extra agent options; they override the defaults set here.
+ * @returns The agent, for use as the `dispatcher` of an undici request.
+ */
+export function makeSecureDispatcher(url: string, options?: undici.Agent.Options): undici.Agent {
+  const agent = new undici.Agent({
+    connect: {
+      rejectUnauthorized: false, // bypass SSL failures -- this is fine
+      // lookup: secureLookup,
+    },
+    maxRedirections: 5000,
+    ...options,
+  });
+
+  agent.on("connect", (_, targets) => {
+    const client = targets[targets.length - 1] as undici.Client;
+    // The socket is not part of the public client API; it is looked up through
+    // the symbol-keyed property whose description is "socket".
+    const socketSymbol = Object.getOwnPropertySymbols(client).find((x) => x.description === "socket");
+    const socket: Socket | TLSSocket | null | undefined =
+      socketSymbol === undefined ? undefined : (client as any)[socketSymbol];
+
+    if (socket == null) {
+      // Without the socket the remote address cannot be checked: refuse the connection.
+      client.destroy(new InsecureConnectionError()).catch(() => {});
+      return;
+    }
+
+    const remoteAddress = socket.remoteAddress;
+    if (!remoteAddress) {
+      return;
+    }
+
+    const blocked =
+      socket.remoteFamily === "IPv4" ? isIPv4Private(remoteAddress) : isIPv6Private(remoteAddress);
+    if (blocked) {
+      socket.destroy(new InsecureConnectionError());
+    }
+  });
+
+  return agent;
+}
\ No newline at end of file