feat(runWebScraper): retry a scrape max 3 times in a crawl if the status code is failure
mogery committed Dec 13, 2024
1 parent 6b17a53 commit e74e4bc
Showing 7 changed files with 108 additions and 72 deletions.
16 changes: 14 additions & 2 deletions apps/api/logview.js
@@ -1,7 +1,19 @@
const fs = require("fs");

const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
.split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
// METHOD: Winston log file
// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));

// METHOD: GCloud export
const logs = [
"downloaded-logs-20241213-225607.json",
"downloaded-logs-20241213-225654.json",
"downloaded-logs-20241213-225720.json",
"downloaded-logs-20241213-225758.json",
"downloaded-logs-20241213-225825.json",
"downloaded-logs-20241213-225843.json",
].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);


const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];

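For context, a minimal sketch of the entry shape this debugging script now assumes: each GCloud export file is a JSON array of entries, and only the structured `jsonPayload` (which carries fields such as `crawlId`) is kept. Field names other than `crawlId` are illustrative, not confirmed by this diff.

```ts
// Hypothetical shape of one entry in a GCloud log export file, inferred from
// how logview.js consumes it; only jsonPayload is read.
type ExportedLogEntry = {
  jsonPayload: {
    message?: string; // illustrative field
    crawlId?: string; // used below to group lines per crawl
    [key: string]: unknown;
  };
};

// Mirrors the crawlIds extraction in logview.js: group log lines per crawl.
function groupByCrawl(lines: ExportedLogEntry["jsonPayload"][]) {
  const byCrawl = new Map<string, ExportedLogEntry["jsonPayload"][]>();
  for (const line of lines) {
    if (typeof line.crawlId !== "string") continue;
    const bucket = byCrawl.get(line.crawlId) ?? [];
    bucket.push(line);
    byCrawl.set(line.crawlId, bucket);
  }
  return byCrawl;
}
```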
16 changes: 8 additions & 8 deletions apps/api/src/controllers/v0/scrape.ts
@@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
Document,
fromLegacyCombo,
toLegacyDocument,
url as urlSchema,
@@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions } from "../v1/types";
import { ZodError } from "zod";
import { Document as V0Document } from "./../../lib/entities";

export async function scrapeHelper(
jobId: string,
@@ -42,7 +42,7 @@ export async function scrapeHelper(
): Promise<{
success: boolean;
error?: string;
data?: Document | { url: string };
data?: V0Document | { url: string };
returnCode: number;
}> {
const url = urlSchema.parse(req.body.url);
@@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
result.data && (result.data as Document).markdown
result.data && (result.data as V0Document).markdown
? numTokensFromString(
(result.data as Document).markdown!,
(result.data as V0Document).markdown!,
"gpt-3.5-turbo",
)
: 0;
@@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) {

let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && (doc as Document).rawHtml) {
delete (doc as Document).rawHtml;
if (doc && (doc as V0Document).rawHtml) {
delete (doc as V0Document).rawHtml;
}
}

if (pageOptions && pageOptions.includeExtract) {
if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
delete (doc as Document).markdown;
if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
delete (doc as V0Document).markdown;
}
}

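The substance of this file's change is a rename-by-alias: the legacy entity type keeps its shape but is referred to as V0Document, so it can no longer be confused with the v1 Document type. A minimal sketch of the pattern, using the same import paths as this controller; the narrowing helper is illustrative and not part of the commit:

```ts
import { Document as V0Document } from "../../lib/entities"; // legacy v0 shape
import { Document } from "../v1/types"; // v1 shape

// Illustrative narrowing helper: v0 scrape results may be a bare { url }
// placeholder, so check for a markdown field before treating it as a document.
function isV0Document(data: V0Document | { url: string }): data is V0Document {
  return (data as V0Document).markdown !== undefined;
}
```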
4 changes: 2 additions & 2 deletions apps/api/src/controllers/v1/extract.ts
@@ -1,14 +1,14 @@
import { Request, Response } from "express";
import {
// Document,
Document,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
ExtractResponse,
MapDocument,
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
// import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v1/types.ts
@@ -396,7 +396,7 @@ export type Document = {
articleSection?: string;
url?: string;
sourceURL?: string;
statusCode?: number;
statusCode: number;
error?: string;
[key: string]: string | string[] | number | undefined;
};
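Making statusCode non-optional is what lets the new retry logic in runWebScraper read document.metadata.statusCode without a null check. A minimal illustration of the compile-time effect for a hypothetical consumer in the same directory (values are made up):

```ts
import { Document } from "./types";

const metadata: Document["metadata"] = {
  sourceURL: "https://example.com", // hypothetical value
  statusCode: 200, // now required; omitting it fails type-checking
};
```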
139 changes: 81 additions & 58 deletions apps/api/src/main/runWebScraper.ts
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
});
}

@@ -63,73 +64,63 @@ export async function runWebScraper({
bull_job_id,
priority,
is_scrape = false,
is_crawl = false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
const tries = is_crawl ? 3 : 1;

let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
}
let error: any = undefined;

if (is_scrape === false) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (scrapeOptions.extract) {
creditsToBeBilled = 5;
}

billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
for (let i = 0; i < tries; i++) {
if (i > 0) {
logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error });
}

// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);
response = undefined;
engines = {};
error = undefined;

engines = response.engines;
return response;
} catch (error) {
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
}

// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);

engines = response.engines;

if (response !== undefined) {
return {
...response,
success: false,
error,
};
} else {
return {
success: false,
error,
logs: ["no logs -- error coming from runWebScraper"],
engines,
};
if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) {
// status code is good -- do not attempt retry
break;
}
} catch (error) {
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
}
// onError(error);
} finally {
const engineOrder = Object.entries(engines)
}

const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];

@@ -158,6 +149,38 @@ export async function runWebScraper({
},
});
}

if (error === undefined && response?.success) {
if (is_scrape === false) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (scrapeOptions.extract) {
creditsToBeBilled = 5;
}

billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
}

return response;
} else {
if (response !== undefined) {
return {
...response,
success: false,
error,
};
} else {
return {
success: false,
error,
logs: ["no logs -- error coming from runWebScraper"],
engines,
};
}
}
}

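For readability, a condensed sketch of the control flow this commit introduces, paraphrased rather than verbatim (identifiers match the diff above; surrounding declarations and engine logging are omitted, and error handling is simplified): crawl jobs (is_crawl) get up to 3 attempts, a 2xx or 304 status stops retrying early, billing runs once after the loop and only on success, and anything else falls through to the failure return.

```ts
const tries = is_crawl ? 3 : 1;
let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
let error: any = undefined;

for (let i = 0; i < tries; i++) {
  if (i > 0) {
    logger.debug("Retrying scrape...", { scrapeId: bull_job_id, tries, i });
  }
  // Reset per-attempt state so a retry starts clean.
  response = undefined;
  engines = {};
  error = undefined;

  try {
    response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
    if (!response.success) throw new Error("scrapeURL error"); // error details elided here
    engines = response.engines;

    const status = response.document.metadata.statusCode;
    if ((status >= 200 && status < 300) || status === 304) {
      break; // status code is good -- do not attempt retry
    }
  } catch (e) {
    error = e; // simplified; the real code also recovers engine results from the error
  }
}

// ... per-engine result logging elided ...

if (error === undefined && response?.success) {
  if (is_scrape === false) {
    const creditsToBeBilled = scrapeOptions.extract ? 5 : 1;
    billTeam(team_id, undefined, creditsToBeBilled).catch(() => {
      // billing failures are logged, not fatal
    });
  }
  return response;
}
return response !== undefined
  ? { ...response, success: false, error }
  : { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
```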
2 changes: 1 addition & 1 deletion apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
@@ -5,7 +5,7 @@ import { Meta } from "..";
export function extractMetadata(
meta: Meta,
html: string,
): Document["metadata"] {
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let language: string | undefined = undefined;
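The return type loosens to Partial because the HTML alone cannot supply every metadata field -- notably the now-required statusCode, which comes from the HTTP response. A hedged sketch of how a caller might complete the object; the actual call site is not part of this diff, and the import paths assume a sibling file under scrapeURL/lib:

```ts
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { extractMetadata } from "./extractMetadata";

// Illustrative merge: HTML-derived fields plus transport-level fields.
function buildMetadata(
  meta: Meta,
  html: string,
  sourceURL: string,
  statusCode: number,
): Document["metadata"] {
  return {
    ...extractMetadata(meta, html),
    sourceURL,
    statusCode, // required on Document["metadata"], so the caller must supply it
  };
}
```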
1 change: 1 addition & 0 deletions apps/api/src/types.ts
@@ -55,6 +55,7 @@ export interface RunWebScraperParams {
bull_job_id: string;
priority?: number;
is_scrape?: boolean;
is_crawl?: boolean;
}

export type RunWebScraperResult =
