This commit is contained in:
Gergő Móricz 2025-06-20 11:28:22 +02:00
parent 3cf22f9167
commit 5e760aacbb
7 changed files with 12 additions and 12 deletions

View File

@ -55,8 +55,8 @@ export async function scrapeURLWithFetch(
} else {
try {
const x = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
undici.fetch(meta.rewrittenUrl ?? meta.url, {
dispatcher: await makeSecureDispatcher(meta.rewrittenUrl ?? meta.url),
redirect: "follow",
headers: meta.options.headers,
signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout),

View File

@ -87,7 +87,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
url: normalizedURL,
url_hash: urlHash,
original_url: document.metadata.sourceURL ?? meta.url,
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
is_mobile: meta.options.mobile,

View File

@ -243,7 +243,7 @@ function safeguardCircularError<T>(error: T): T {
}
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
meta.logger.info(`Scraping URL ${JSON.stringify(meta.rewrittenUrl ?? meta.url)}...`);
// TODO: handle sitemap data, see WebScraper/index.ts:280
// TODO: ScrapeEvents

View File

@ -12,7 +12,7 @@ export async function extractMetadataRust(
return {
...fromRust,
...(fromRust.favicon ? {
favicon: new URL(fromRust.favicon, meta.url)
favicon: new URL(fromRust.favicon, meta.rewrittenUrl ?? meta.url)
} : {}),
scrapeId: meta.id,
};
@ -75,7 +75,7 @@ export async function extractMetadata(
soup('link[rel*="icon"]').first().attr("href") ||
undefined;
if (faviconLink) {
const baseUrl = new URL(meta.url).origin;
const baseUrl = new URL(meta.rewrittenUrl ?? meta.url).origin;
favicon = faviconLink.startsWith("http")
? faviconLink
: `${baseUrl}${faviconLink}`;

View File

@ -63,12 +63,12 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
const res = await supabase_service
.rpc("diff_get_last_scrape_4", {
i_team_id: meta.internalOptions.teamId,
i_url: document.metadata.sourceURL ?? meta.url,
i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
i_tag: meta.options.changeTrackingOptions?.tag ?? null,
});
const end = Date.now();
if (end - start > 100) {
meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url } });
meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url } });
}
const data: {

View File

@ -48,7 +48,7 @@ export async function deriveHTMLFromRawHTML(
document.html = await htmlTransform(
document.rawHtml,
document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
meta.options,
);
return document;
@ -88,7 +88,7 @@ export async function deriveLinksFromHTML(meta: Meta, document: Document): Promi
);
}
document.links = await extractLinks(document.html, meta.url);
document.links = await extractLinks(document.html, document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url);
}
return document;

View File

@ -684,7 +684,7 @@ export async function performLLMExtract(
const { extractedDataArray, warning, costLimitExceededTokenUsage } =
await extractData({
extractOptions: generationOptions,
urls: [meta.url],
urls: [meta.rewrittenUrl ?? meta.url],
useAgent: false,
scrapeId: meta.id,
});
@ -760,7 +760,7 @@ export async function performLLMExtract(
// // if (shouldUseSmartscrape && smartscrape_prompt) {
// // meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
// // // Call the smartScrape function (which needs to be implemented/imported)
// // // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
// // // const smartScrapedDocs = await smartScrape(meta.rewrittenUrl ?? meta.url, smartscrape_prompt);
// // // Process/merge smartScrapedDocs with extractedData
// // // ... potentially update finalExtract ...
// // } else {