mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
fixes
This commit is contained in:
parent
3cf22f9167
commit
5e760aacbb
@ -55,8 +55,8 @@ export async function scrapeURLWithFetch(
|
||||
} else {
|
||||
try {
|
||||
const x = await Promise.race([
|
||||
undici.fetch(meta.url, {
|
||||
dispatcher: await makeSecureDispatcher(meta.url),
|
||||
undici.fetch(meta.rewrittenUrl ?? meta.url, {
|
||||
dispatcher: await makeSecureDispatcher(meta.rewrittenUrl ?? meta.url),
|
||||
redirect: "follow",
|
||||
headers: meta.options.headers,
|
||||
signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout),
|
||||
|
@ -87,7 +87,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||
url: normalizedURL,
|
||||
url_hash: urlHash,
|
||||
original_url: document.metadata.sourceURL ?? meta.url,
|
||||
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
|
||||
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
|
||||
has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
|
||||
has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
|
||||
is_mobile: meta.options.mobile,
|
||||
|
@ -243,7 +243,7 @@ function safeguardCircularError<T>(error: T): T {
|
||||
}
|
||||
|
||||
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
|
||||
meta.logger.info(`Scraping URL ${JSON.stringify(meta.rewrittenUrl ?? meta.url)}...`);
|
||||
|
||||
// TODO: handle sitemap data, see WebScraper/index.ts:280
|
||||
// TODO: ScrapeEvents
|
||||
|
@ -12,7 +12,7 @@ export async function extractMetadataRust(
|
||||
return {
|
||||
...fromRust,
|
||||
...(fromRust.favicon ? {
|
||||
favicon: new URL(fromRust.favicon, meta.url)
|
||||
favicon: new URL(fromRust.favicon, meta.rewrittenUrl ?? meta.url)
|
||||
} : {}),
|
||||
scrapeId: meta.id,
|
||||
};
|
||||
@ -75,7 +75,7 @@ export async function extractMetadata(
|
||||
soup('link[rel*="icon"]').first().attr("href") ||
|
||||
undefined;
|
||||
if (faviconLink) {
|
||||
const baseUrl = new URL(meta.url).origin;
|
||||
const baseUrl = new URL(meta.rewrittenUrl ?? meta.url).origin;
|
||||
favicon = faviconLink.startsWith("http")
|
||||
? faviconLink
|
||||
: `${baseUrl}${faviconLink}`;
|
||||
|
@ -63,12 +63,12 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
|
||||
const res = await supabase_service
|
||||
.rpc("diff_get_last_scrape_4", {
|
||||
i_team_id: meta.internalOptions.teamId,
|
||||
i_url: document.metadata.sourceURL ?? meta.url,
|
||||
i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
|
||||
i_tag: meta.options.changeTrackingOptions?.tag ?? null,
|
||||
});
|
||||
const end = Date.now();
|
||||
if (end - start > 100) {
|
||||
meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url } });
|
||||
meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url } });
|
||||
}
|
||||
|
||||
const data: {
|
||||
|
@ -48,7 +48,7 @@ export async function deriveHTMLFromRawHTML(
|
||||
|
||||
document.html = await htmlTransform(
|
||||
document.rawHtml,
|
||||
document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
|
||||
document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
|
||||
meta.options,
|
||||
);
|
||||
return document;
|
||||
@ -88,7 +88,7 @@ export async function deriveLinksFromHTML(meta: Meta, document: Document): Promi
|
||||
);
|
||||
}
|
||||
|
||||
document.links = await extractLinks(document.html, meta.url);
|
||||
document.links = await extractLinks(document.html, document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url);
|
||||
}
|
||||
|
||||
return document;
|
||||
|
@ -684,7 +684,7 @@ export async function performLLMExtract(
|
||||
const { extractedDataArray, warning, costLimitExceededTokenUsage } =
|
||||
await extractData({
|
||||
extractOptions: generationOptions,
|
||||
urls: [meta.url],
|
||||
urls: [meta.rewrittenUrl ?? meta.url],
|
||||
useAgent: false,
|
||||
scrapeId: meta.id,
|
||||
});
|
||||
@ -760,7 +760,7 @@ export async function performLLMExtract(
|
||||
// // if (shouldUseSmartscrape && smartscrape_prompt) {
|
||||
// // meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
|
||||
// // // Call the smartScrape function (which needs to be implemented/imported)
|
||||
// // // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
|
||||
// // // const smartScrapedDocs = await smartScrape(meta.rewrittenUrl ?? meta.url, smartscrape_prompt);
|
||||
// // // Process/merge smartScrapedDocs with extractedData
|
||||
// // // ... potentially update finalExtract ...
|
||||
// // } else {
|
||||
|
Loading…
x
Reference in New Issue
Block a user