Skip to content

Commit

Permalink
ingest scrape events
Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Nov 7, 2024
1 parent 461eda8 commit 49801ac
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions apps/api/src/main/runWebScraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
import { scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();

export async function startWebScraperPipeline({
Expand Down Expand Up @@ -58,6 +59,7 @@ export async function runWebScraper({
is_scrape=false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
if (!response.success) {
Expand All @@ -83,22 +85,43 @@ export async function runWebScraper({
// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);

engines = response.engines;
return response;
} catch (error) {
engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));

if (response !== undefined) {
return {
...response,
success: false,
error,
}
} else {
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"] };
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
}
// onError(error);
} finally {
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];

for (const engine of engineOrder) {
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code: (result.state === "success" ? result.result.statusCode : undefined),
response_size: (result.state === "success" ? result.result.html.length : undefined),
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
time_taken: result.finishedAt - result.startedAt,
},
});
}
}
}

const saveJob = async (job: Job, result: any, token: string, mode: string) => {
const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
Expand Down

0 comments on commit 49801ac

Please sign in to comment.