feat(scrapeURL): add unnormalizedSourceURL for url matching DX (FIR-2137) (#1601)

* feat(scrapeURL): add unnormalizedSourceURL for url matching DX

* fix(tests): fix
This commit is contained in:
Gergő Móricz 2025-05-27 21:33:44 +02:00 committed by GitHub
parent 474e5a0543
commit a36c6a4f40
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 26 additions and 9 deletions

View File

@ -48,4 +48,12 @@ describe("Batch scrape tests", () => {
}, 180000);
});
}
it.concurrent("sourceURL stays unnormalized", async () => {
  // Submit a URL carrying a query string (including a value-less parameter)
  // and check that the first result reports exactly the string we sent as
  // its metadata.sourceURL.
  const submittedURL = "https://firecrawl.dev/?pagewanted=all&et_blog";

  const { body } = await batchScrape({ urls: [submittedURL] });

  expect(body.data[0].metadata.sourceURL).toBe(submittedURL);
}, 35000);
});

View File

@ -366,4 +366,12 @@ describe("Scrape tests", () => {
}, 30000);
});
}
it.concurrent("sourceURL stays unnormalized", async () => {
  // Scrape a URL carrying a query string (including a value-less parameter)
  // and check that metadata.sourceURL is exactly the string we sent.
  const submittedURL = "https://firecrawl.dev/?pagewanted=all&et_blog";

  const { metadata } = await scrape({ url: submittedURL });

  expect(metadata.sourceURL).toBe(submittedURL);
}, 35000);
});

View File

@ -22,7 +22,6 @@ import { getJobPriority } from "../../lib/job-priority";
import { addScrapeJobs } from "../../services/queue-jobs";
import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger";
import { CostTracking } from "../../lib/extract/extraction-service";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@ -30,6 +29,8 @@ export async function batchScrapeController(
req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
res: Response<BatchScrapeResponse>,
) {
const preNormalizedBody = { ...req.body };
if (req.body?.ignoreInvalidURLs === true) {
req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
} else {
@ -46,6 +47,7 @@ export async function batchScrapeController(
});
let urls = req.body.urls;
let unnormalizedURLs = preNormalizedBody.urls;
let invalidURLs: string[] | undefined = undefined;
if (req.body.ignoreInvalidURLs) {
@ -53,11 +55,13 @@ export async function batchScrapeController(
let pendingURLs = urls;
urls = [];
unnormalizedURLs = [];
for (const u of pendingURLs) {
try {
const nu = urlSchema.parse(u);
if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
urls.push(nu);
unnormalizedURLs.push(u);
} else {
invalidURLs.push(u);
}
@ -86,12 +90,6 @@ export async function batchScrapeController(
await logCrawl(id, req.auth.team_id);
}
let { remainingCredits } = req.account!;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
if (!useDbAuthentication) {
remainingCredits = Infinity;
}
const sc: StoredCrawl = req.body.appendToId
? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
: {
@ -127,7 +125,7 @@ export async function batchScrapeController(
delete (scrapeOptions as any).urls;
delete (scrapeOptions as any).appendToId;
const jobs = urls.map((x) => {
const jobs = urls.map((x, i) => {
return {
data: {
url: x,
@ -142,6 +140,7 @@ export async function batchScrapeController(
webhook: req.body.webhook,
internalOptions: {
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
unnormalizedSourceURL: unnormalizedURLs[i],
},
},
opts: {

View File

@ -51,6 +51,7 @@ export async function scrapeController(
internalOptions: {
teamId: req.auth.team_id,
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
unnormalizedSourceURL: preNormalizedBody.url,
},
origin: req.body.origin,
is_scrape: true,

View File

@ -182,6 +182,7 @@ export type InternalOptions = {
fromCache?: boolean; // Indicates if the document was retrieved from cache
abort?: AbortSignal;
urlInvisibleInCurrentCrawl?: boolean;
unnormalizedSourceURL?: string;
saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
};
@ -373,7 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
screenshot: result.result.screenshot,
actions: result.result.actions,
metadata: {
sourceURL: meta.url,
sourceURL: meta.internalOptions.unnormalizedSourceURL ?? meta.url,
url: result.result.url,
statusCode: result.result.statusCode,
error: result.result.error,