Skip to content

Commit

Permalink
fix(scrapeURL/fetch): block loopback and link-local IPs
Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Dec 29, 2024
1 parent e255301 commit 4d1f92f
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 22 deletions.
1 change: 1 addition & 0 deletions apps/api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
"glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.4.1",
"ip-address": "^10.0.1",
"joplin-turndown-plugin-gfm": "^1.0.12",
"json-schema-to-zod": "^2.3.0",
"keyword-extractor": "^0.0.28",
Expand Down
9 changes: 9 additions & 0 deletions apps/api/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 25 additions & 13 deletions apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,39 @@
import * as undici from "undici";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch";

export async function scrapeURLWithFetch(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000;

const response = await Promise.race([
fetch(meta.url, {
redirect: "follow",
headers: meta.options.headers,
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
let response: undici.Response;
try {
response = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
redirect: "follow",
headers: meta.options.headers,
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
} catch (error) {
if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) {
throw error.cause;
} else {
throw error;
}
}

specialtyScrapeCheck(
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
Expand Down
12 changes: 3 additions & 9 deletions apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { EngineError } from "../../error";
import { Writable } from "stream";
import { v4 as uuid } from "uuid";
import * as undici from "undici";
import { makeSecureDispatcher } from "./safeFetch";

export async function fetchFileToBuffer(url: string): Promise<{
response: Response;
Expand All @@ -28,16 +29,9 @@ export async function downloadFile(
const tempFileWrite = createWriteStream(tempFilePath);

// TODO: maybe we could use tlsclient for this? for proxying
// use undici to ignore SSL for now
const response = await undici.fetch(url, {
dispatcher: new undici.Agent({
connect: {
rejectUnauthorized: false,
},
}),
});
const response = await undici.fetch(url, { dispatcher: await makeSecureDispatcher(url) });

// This should never happen in the current state of JS (2024), but let's check anyways.
// This should never happen in the current state of JS/Undici (2024), but let's check anyways.
if (response.body === null) {
throw new EngineError("Response body was null", { cause: { response } });
}
Expand Down
60 changes: 60 additions & 0 deletions apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import type { Socket } from "net";
import type { TLSSocket } from "tls";
import * as undici from "undici";
import { Address6 } from "ip-address";

/**
 * Error used to signal that a connection was rejected by the address
 * security rules in this module (it is passed to `socket.destroy` when the
 * remote address is classified as non-public).
 */
export class InsecureConnectionError extends Error {
  constructor() {
    super("Connection violated security rules.");
    // Without an explicit name, logs and serialized errors show a generic "Error".
    this.name = "InsecureConnectionError";
  }
}

/**
 * Reports whether a dotted-quad IPv4 address falls inside a special-purpose
 * (non-public) range: "this" network, private, carrier-grade NAT, loopback,
 * link-local, protocol-assignment, documentation, benchmark, multicast or
 * reserved space.
 *
 * Input that is not a well-formed dotted quad (wrong number of parts,
 * non-digit characters, or an octet above 255) is treated as private so that
 * the check fails closed.
 *
 * @param address IPv4 address in dotted-decimal notation, e.g. "10.0.0.1".
 * @returns `true` when the address must not be treated as a public host.
 */
function isIPv4Private(address: string): boolean {
  const match = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(address);
  if (match === null) {
    return true; // unparseable: refuse rather than guess
  }

  const a = Number(match[1]);
  const b = Number(match[2]);
  const c = Number(match[3]);
  const d = Number(match[4]);
  if (a > 255 || b > 255 || c > 255 || d > 255) {
    return true; // out-of-range octet: not a valid IPv4 address
  }

  return (
    a === 0 || // 0.0.0.0/8: current (local, "this") network
    a === 10 || // 10.0.0.0/8: private network
    (a === 100 && b >= 64 && b < 128) || // 100.64.0.0/10: shared address space for carrier-grade NAT
    a === 127 || // 127.0.0.0/8: loopback
    (a === 169 && b === 254) || // 169.254.0.0/16: link-local
    (a === 172 && b >= 16 && b < 32) || // 172.16.0.0/12: private network
    (a === 192 && b === 0 && c === 0) || // 192.0.0.0/24: IETF Protocol Assignments
    (a === 192 && b === 0 && c === 2) || // 192.0.2.0/24: TEST-NET-1, documentation and examples
    (a === 192 && b === 88 && c === 99) || // 192.88.99.0/24: reserved, formerly 6to4 relay anycast
    (a === 192 && b === 168) || // 192.168.0.0/16: private network
    (a === 198 && (b === 18 || b === 19)) || // 198.18.0.0/15: inter-network benchmark testing
    (a === 198 && b === 51 && c === 100) || // 198.51.100.0/24: TEST-NET-2, documentation and examples
    (a === 203 && b === 0 && c === 113) || // 203.0.113.0/24: TEST-NET-3, documentation and examples
    a >= 224 // 224.0.0.0/4 multicast, 240.0.0.0/4 reserved, 255.255.255.255 limited broadcast
  );
}

/**
 * Reports whether an IPv6 address should be treated as non-public, i.e. the
 * scope reported by `Address6.getScope()` is anything other than "Global".
 *
 * If the address library rejects the input by throwing, the address is
 * treated as private so the check fails closed instead of raising from
 * inside the caller.
 *
 * @param ipv6 IPv6 address string.
 * @returns `true` when the address must not be treated as a public host.
 */
function isIPv6Private(ipv6: string): boolean {
  try {
    return new Address6(ipv6).getScope() !== "Global";
  } catch {
    // Unparseable address: refuse rather than guess.
    return true;
  }
}

/**
 * Creates an undici Agent that tears down any connection whose remote
 * address is classified as private by `isIPv4Private` / `isIPv6Private`.
 *
 * TLS certificate verification is disabled on the agent. Properties in
 * `options` are spread last, so they override the defaults set here.
 *
 * @param url Accepted for the callers' convenience; not read by this function.
 * @param options Extra undici Agent options merged over the defaults.
 * @returns The configured agent, for use as a `dispatcher`.
 */
export function makeSecureDispatcher(url: string, options?: undici.Agent.Options) {
  const dispatcher = new undici.Agent({
    connect: {
      rejectUnauthorized: false, // bypass SSL failures -- this is fine
      // lookup: secureLookup,
    },
    maxRedirections: 5000,
    ...options,
  });

  dispatcher.on("connect", (_origin, targets) => {
    // The last entry of `targets` is taken to be the client that owns the socket.
    const client = targets[targets.length - 1] as undici.Client;

    // The socket is stored on the client under a symbol described as "socket";
    // it is located by description because no public accessor is used here.
    const socketKey = Object.getOwnPropertySymbols(client).find(
      (sym) => sym.description === "socket",
    )!;
    const socket: Socket | TLSSocket = (client as any)[socketKey];

    const remoteAddress = socket.remoteAddress;
    if (!remoteAddress) {
      return; // no remote address available, nothing to check
    }

    const isBlocked =
      socket.remoteFamily === "IPv4"
        ? isIPv4Private(remoteAddress)
        : isIPv6Private(remoteAddress);

    if (isBlocked) {
      socket.destroy(new InsecureConnectionError());
    }
  });

  return dispatcher;
}

0 comments on commit 4d1f92f

Please sign in to comment.