This commit is contained in:
Nicolas 2024-12-17 15:19:53 -03:00
commit e26a0a65a7
7 changed files with 43 additions and 38 deletions

View File

@ -7478,7 +7478,7 @@ snapshots:
extract-zip@2.0.1:
dependencies:
debug: 4.3.4
debug: 4.3.5
get-stream: 5.2.0
yauzl: 2.10.0
optionalDependencies:
@ -7622,7 +7622,7 @@ snapshots:
dependencies:
basic-ftp: 5.0.5
data-uri-to-buffer: 6.0.2
debug: 4.3.4
debug: 4.3.5
fs-extra: 11.2.0
transitivePeerDependencies:
- supports-color
@ -7723,7 +7723,7 @@ snapshots:
http-proxy-agent@7.0.2:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
transitivePeerDependencies:
- supports-color
@ -7771,7 +7771,7 @@ snapshots:
https-proxy-agent@7.0.5:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
transitivePeerDependencies:
- supports-color
@ -8836,7 +8836,7 @@ snapshots:
dependencies:
'@tootallnate/quickjs-emscripten': 0.23.0
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
get-uri: 6.0.3
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5
@ -9031,7 +9031,7 @@ snapshots:
proxy-agent@6.4.0:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5
lru-cache: 7.18.3
@ -9338,7 +9338,7 @@ snapshots:
socks-proxy-agent@8.0.4:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
socks: 2.8.3
transitivePeerDependencies:
- supports-color

View File

@ -60,7 +60,7 @@ export async function scrapeController(
try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")

View File

@ -96,7 +96,6 @@ export async function runWebScraper({
...internalOptions,
});
if (!response.success) {
error = response.error;
if (response.error instanceof Error) {
throw response.error;
} else {
@ -124,7 +123,8 @@ export async function runWebScraper({
// status code is good -- do not attempt retry
break;
}
} catch (error) {
} catch (_error) {
error = _error;
engines =
response !== undefined
? response.engines

View File

@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
// Include specified actions
...(meta.options.actions ?? []),
];
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
const timeout = timeToRun ?? 300000;
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
// TODO: scrollXPaths
};
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request,
}),
request,
timeout + totalWait,
timeout,
);
specialtyScrapeCheck(
@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000;
const totalWait = meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request,
}),
request,
timeout + meta.options.waitFor,
timeout,
);
specialtyScrapeCheck(

View File

@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
try {
result = await scrapePDFWithLlamaParse(
const llamaResult = await scrapePDFWithLlamaParse(
{
...meta,
logger: meta.logger.child({
@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
tempFilePath,
timeToRun,
);
result = llamaResult; // Use LlamaParse result if successful
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
"LlamaParse failed to parse PDF -- using parse-pdf result",
{ error },
);
Sentry.captureException(error);
@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
}
}
if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}
await fs.unlink(tempFilePath);
return {
@ -190,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
html: result.html,
markdown: result.markdown,
};
}
}

View File

@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
: undefined
for (const { engine, unsupportedFeatures } of fallbackList) {

View File

@ -3,6 +3,7 @@
"rootDir": "./src",
"lib": ["ES2022", "DOM"],
// or higher
"target": "ES2022",
@ -18,7 +19,7 @@
"*": ["node_modules/*", "src/types/*"],
},
"inlineSources": true
"inlineSources": true,
},
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
}