Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
  • Loading branch information
nickscamara committed Dec 17, 2024
2 parents 0f8b8a7 + 4ad6665 commit e26a0a6
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 38 deletions.
14 changes: 7 additions & 7 deletions apps/api/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion apps/api/src/controllers/v1/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ export async function scrapeController(
try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")
Expand Down
4 changes: 2 additions & 2 deletions apps/api/src/main/runWebScraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ export async function runWebScraper({
...internalOptions,
});
if (!response.success) {
error = response.error;
if (response.error instanceof Error) {
throw response.error;
} else {
Expand Down Expand Up @@ -124,7 +123,8 @@ export async function runWebScraper({
// status code is good -- do not attempt retry
break;
}
} catch (error) {
} catch (_error) {
error = _error;
engines =
response !== undefined
? response.engines
Expand Down
19 changes: 10 additions & 9 deletions apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
// Include specified actions
...(meta.options.actions ?? []),
];

const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);

const timeout = timeToRun ?? 300000;
const timeout = (timeToRun ?? 300000) + totalWait;

const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
Expand All @@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
// TODO: scrollXPaths
};

const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);

let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request,
}),
request,
timeout + totalWait,
timeout,
);

specialtyScrapeCheck(
Expand Down Expand Up @@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000;
const totalWait = meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + totalWait;

const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
Expand All @@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request,
}),
request,
timeout + meta.options.waitFor,
timeout,
);

specialtyScrapeCheck(
Expand Down
37 changes: 20 additions & 17 deletions apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {

// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);


// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
try {
result = await scrapePDFWithLlamaParse(
const llamaResult = await scrapePDFWithLlamaParse(
{
...meta,
logger: meta.logger.child({
Expand All @@ -152,35 +166,24 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
tempFilePath,
timeToRun,
);
result = llamaResult; // Use LlamaParse result if successful
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
"LlamaParse failed to parse PDF -- using parse-pdf result",
{ error },
);
Sentry.captureException(error);
}
}
}

if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}

await fs.unlink(tempFilePath);

return {
Expand All @@ -190,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
html: result.html,
markdown: result.markdown,
};
}
}
2 changes: 1 addition & 1 deletion apps/api/src/scraper/scrapeURL/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
let result: EngineScrapeResultWithContext | null = null;

const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
: undefined

for (const { engine, unsupportedFeatures } of fallbackList) {
Expand Down
3 changes: 2 additions & 1 deletion apps/api/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"rootDir": "./src",
"lib": ["ES2022", "DOM"],


// or higher
"target": "ES2022",

Expand All @@ -18,7 +19,7 @@
"*": ["node_modules/*", "src/types/*"],
},

"inlineSources": true
"inlineSources": true,
},
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
}

0 comments on commit e26a0a6

Please sign in to comment.