
Commit

expose engine results tracker for ScrapeEvents implementation
mogery committed Nov 6, 2024
1 parent be40dcb commit 461eda8
Showing 2 changed files with 50 additions and 5 deletions.
47 changes: 44 additions & 3 deletions apps/api/src/scraper/scrapeURL/index.ts
@@ -18,6 +18,7 @@ export type ScrapeUrlResponse = ({
error: any,
}) & {
logs: any[],
engines: EngineResultsTracker,
}

export type Meta = {
@@ -116,7 +117,7 @@ export type InternalOptions = {
v0DisableJsDom?: boolean;
};

export type EngineResultsTracker = { [E in Engine]?: {
export type EngineResultsTracker = { [E in Engine]?: ({
state: "error",
error: any,
unexpected: boolean,
@@ -127,6 +128,9 @@ export type EngineResultsTracker = { [E in Engine]?: {
unsupportedFeatures: Set<FeatureFlag>,
} | {
state: "timeout",
}) & {
startedAt: number,
finishedAt: number,
} };

export type EngineScrapeResultWithContext = {
@@ -135,6 +139,16 @@ export type EngineScrapeResultWithContext = {
result: (EngineScrapeResult & { markdown: string }),
};

function safeguardCircularError<T>(error: T): T {
if (typeof error === "object" && error !== null && (error as any).results) {
const newError = structuredClone(error);
delete (newError as any).results;
return newError;
} else {
return error;
}
}
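
A minimal sketch (not part of the commit) of the self-reference the new safeguardCircularError helper guards against: an error that keeps a "results" back-reference to the tracker would make the tracker contain itself once stored in it. The engine key and error shape below are illustrative stand-ins.

const results: Record<string, { state: "error"; error: any }> = {};

const err = { message: "engine failed", results }; // error carries a back-reference to the tracker
results["example-engine"] = { state: "error", error: safeguardCircularError(err) };

// The stored copy has its "results" back-reference removed, so the tracker never
// ends up containing itself, and serializing it for logs (e.g. JSON.stringify(results))
// does not throw "Converting circular structure to JSON".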

async function scrapeURLLoop(
meta: Meta
): Promise<ScrapeUrlResponse> {
@@ -149,6 +163,7 @@ async function scrapeURLLoop(
let result: EngineScrapeResultWithContext | null = null;

for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now();
try {
meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine);
@@ -167,6 +182,8 @@ async function scrapeURLLoop(
result: engineResult,
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
unsupportedFeatures,
startedAt,
finishedAt: Date.now(),
};

// NOTE: TODO: what to do when status code is bad is tough...
@@ -186,23 +203,40 @@ async function scrapeURLLoop(
meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
results[engine] = {
state: "error",
error,
error: safeguardCircularError(error),
unexpected: false,
startedAt,
finishedAt: Date.now(),
};
} else if (error instanceof TimeoutError) {
meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
results[engine] = {
state: "timeout",
startedAt,
finishedAt: Date.now(),
};
} else if (error instanceof AddFeatureError) {
throw error;
} else if (error instanceof LLMRefusalError) {
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now(),
}
error.results = results;
meta.logger.warn("LLM refusal encountered", { error });
throw error;
} else {
Sentry.captureException(error);
meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
results[engine] = {
state: "error",
error,
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now(),
}
}
}
@@ -237,6 +271,7 @@ async function scrapeURLLoop(
success: true,
document,
logs: meta.logs,
engines: results,
};
}

@@ -261,19 +296,25 @@ export async function scrapeURL(
}
}
} catch (error) {
let results: EngineResultsTracker = {};

if (error instanceof NoEnginesLeftError) {
meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
results = error.results;
} else if (error instanceof LLMRefusalError) {
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
results = error.results!;
} else {
Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error });
// TODO: results?
}

return {
success: false,
error,
logs: meta.logs,
engines: results,
}
}
}
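
As a hypothetical illustration of why the tracker is now exposed (the commit message mentions a ScrapeEvents implementation, which is not part of this diff), a consumer could read the engines field of ScrapeUrlResponse and emit per-engine timing events. The function name recordEngineEvents and the import path are assumptions for this sketch only.

import type { EngineResultsTracker } from "./scraper/scrapeURL";

function recordEngineEvents(jobId: string, engines: EngineResultsTracker) {
  for (const [engine, attempt] of Object.entries(engines)) {
    if (attempt === undefined) continue;
    // Every entry now carries startedAt/finishedAt, whatever its state.
    const durationMs = attempt.finishedAt - attempt.startedAt;
    console.log(jobId, engine, attempt.state, durationMs + "ms");
  }
}

// e.g. after a scrape: recordEngineEvents(jobId, response.engines); this works for
// both success and failure responses, since both now include the tracker.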
8 changes: 6 additions & 2 deletions apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -3,14 +3,18 @@ import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Logger } from "winston";
import { Meta } from "..";
import { EngineResultsTracker, Meta } from "..";

const maxTokens = 32000;
const modifier = 4;

export class LLMRefusalError extends Error {
public refusal: string;
public results: EngineResultsTracker | undefined;

constructor(refusal: string) {
super("LLM refused to extract the website's content", { cause: { refusal } })
super("LLM refused to extract the website's content")
this.refusal = refusal;
}
}
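
Illustrative usage (not part of this commit) of the fields the error now exposes: a caller can surface the refusal text and see which engines were attempted before the refusal. The try body is a placeholder.

try {
  // ... run the LLM extract transformer on a scraped document ...
} catch (error) {
  if (error instanceof LLMRefusalError) {
    console.warn("LLM refusal:", error.refusal,
      "engines attempted:", Object.keys(error.results ?? {}));
  } else {
    throw error;
  }
}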

