mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
feat(scrapeURL): add unnormalizedSourceURL for url matching DX (FIR-2137) (#1601)
* feat(scrapeURL): add unnormalizedSourceURL for url matching DX
* fix(tests): fixc
This commit is contained in:
parent 474e5a0543
commit a36c6a4f40
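Context: Firecrawl normalizes URLs during request validation, so the `sourceURL` echoed back in `metadata` could differ from the exact string a caller submitted, making it awkward to match batch results back to inputs. A minimal TypeScript sketch of the problem, with a hypothetical `normalize` standing in for the real `urlSchema` transform:

// Hypothetical normalization: the WHATWG URL parser lowercases scheme/host
// and inserts a root path, as a stand-in for Firecrawl's urlSchema.
const normalize = (u: string) => new URL(u).href;

const requested = "HTTPS://Firecrawl.dev/?pagewanted=all&et_blog";
const scraped = normalize(requested); // "https://firecrawl.dev/?pagewanted=all&et_blog"

// Previously metadata.sourceURL carried the normalized form, so exact-string
// matching against the caller's input could fail:
console.log(scraped === requested); // false

// With unnormalizedSourceURL threaded through, metadata.sourceURL echoes the
// caller's original string and the comparison holds.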
@@ -48,4 +48,12 @@ describe("Batch scrape tests", () => {
       }, 180000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await batchScrape({
+      urls: ["https://firecrawl.dev/?pagewanted=all&et_blog"],
+    });
+
+    expect(response.body.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });
@@ -366,4 +366,12 @@ describe("Scrape tests", () => {
       }, 30000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await scrape({
+      url: "https://firecrawl.dev/?pagewanted=all&et_blog",
+    });
+
+    expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });
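Both tests pin the round-trip guarantee that makes client-side matching trivial. A sketch of the consumer pattern they protect, assuming the `batchScrape` test helper and the response shape used above (the second input URL is hypothetical):

// Assumes the batchScrape helper and response shape from the tests above.
const inputs = [
  "https://firecrawl.dev/?pagewanted=all&et_blog",
  "https://firecrawl.dev/pricing",
];
const response = await batchScrape({ urls: inputs });

// Because sourceURL is echoed verbatim, keying by the original input string
// reliably pairs each document with the request URL that produced it.
const byInput = new Map(
  response.body.data.map((doc) => [doc.metadata.sourceURL, doc]),
);
const unmatched = inputs.filter((u) => !byInput.has(u)); // should be empty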
@@ -22,7 +22,6 @@ import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
 import { CostTracking } from "../../lib/extract/extraction-service";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
@@ -30,6 +29,8 @@ export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
   res: Response<BatchScrapeResponse>,
 ) {
+  const preNormalizedBody = { ...req.body };
+
   if (req.body?.ignoreInvalidURLs === true) {
     req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
   } else {
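The snapshot is taken before validation because `req.body` is reassigned to the schema's output, whose URL strings are already normalized. A sketch of the pattern with an assumed transforming schema (not Firecrawl's real one):

import { z } from "zod";

// Assumed schema shape: the real urlSchema lives in Firecrawl's codebase.
const urlSchema = z.string().transform((s) => new URL(s).href);
const bodySchema = z.object({ urls: z.array(urlSchema) });

let body: any = { urls: ["HTTPS://Firecrawl.dev/?pagewanted=all&et_blog"] };
const preNormalizedBody = { ...body }; // shallow copy taken before parse
body = bodySchema.parse(body);         // reassignment, as in the controller

console.log(body.urls[0]);              // "https://firecrawl.dev/?pagewanted=all&et_blog"
console.log(preNormalizedBody.urls[0]); // "HTTPS://Firecrawl.dev/?pagewanted=all&et_blog"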
@@ -46,6 +47,7 @@ export async function batchScrapeController(
   });
 
   let urls = req.body.urls;
+  let unnormalizedURLs = preNormalizedBody.urls;
   let invalidURLs: string[] | undefined = undefined;
 
   if (req.body.ignoreInvalidURLs) {
@@ -53,11 +55,13 @@ export async function batchScrapeController(
 
     let pendingURLs = urls;
     urls = [];
+    unnormalizedURLs = [];
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
         if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
           urls.push(nu);
+          unnormalizedURLs.push(u);
         } else {
           invalidURLs.push(u);
         }
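When `ignoreInvalidURLs` filters the list, both arrays are rebuilt in lockstep so that `urls[i]` is always the normalized form of `unnormalizedURLs[i]`; the job-building `urls.map((x, i) => ...)` below relies on that alignment. A condensed sketch with stand-ins for the schema and blocklist checks:

// Stand-ins (assumptions) for urlSchema.parse and isUrlBlocked:
const toNormalized = (u: string) => new URL(u).href;
const isBlocked = (_u: string) => false;

const input = ["HTTPS://Firecrawl.dev/a", "not a url", "https://firecrawl.dev/b"];
const urls: string[] = [];
const unnormalizedURLs: string[] = [];
const invalidURLs: string[] = [];

for (const u of input) {
  try {
    const nu = toNormalized(u);
    if (!isBlocked(nu)) {
      urls.push(nu);            // pushed together: urls[i] is always the
      unnormalizedURLs.push(u); // normalized form of unnormalizedURLs[i]
    } else {
      invalidURLs.push(u);
    }
  } catch {
    invalidURLs.push(u); // "not a url" lands here
  }
}

// Later, each job built from urls[i] carries unnormalizedURLs[i], the
// caller's original string for that same URL.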
@@ -86,12 +90,6 @@
     await logCrawl(id, req.auth.team_id);
   }
 
-  let { remainingCredits } = req.account!;
-  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
-  if (!useDbAuthentication) {
-    remainingCredits = Infinity;
-  }
-
   const sc: StoredCrawl = req.body.appendToId
     ? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
     : {
@@ -127,7 +125,7 @@
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;
 
-  const jobs = urls.map((x) => {
+  const jobs = urls.map((x, i) => {
     return {
       data: {
         url: x,
@@ -142,6 +140,7 @@
         webhook: req.body.webhook,
         internalOptions: {
           saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+          unnormalizedSourceURL: unnormalizedURLs[i],
         },
       },
       opts: {
@@ -51,6 +51,7 @@ export async function scrapeController(
     internalOptions: {
       teamId: req.auth.team_id,
       saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+      unnormalizedSourceURL: preNormalizedBody.url,
     },
     origin: req.body.origin,
     is_scrape: true,
@@ -182,6 +182,7 @@ export type InternalOptions = {
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
   urlInvisibleInCurrentCrawl?: boolean;
+  unnormalizedSourceURL?: string;
 
   saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
 };
@@ -373,7 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       screenshot: result.result.screenshot,
       actions: result.result.actions,
       metadata: {
-        sourceURL: meta.url,
+        sourceURL: meta.internalOptions.unnormalizedSourceURL ?? meta.url,
         url: result.result.url,
         statusCode: result.result.statusCode,
         error: result.result.error,
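The `??` fallback preserves prior behavior for any caller that does not set the new field (presumably crawl and other internal job paths): `metadata.sourceURL` simply remains the normalized `meta.url`. A sketch of the semantics with minimal assumed types:

// Assumed minimal types mirroring the fields touched by this diff.
type InternalOptions = { unnormalizedSourceURL?: string };
type Meta = { url: string; internalOptions: InternalOptions };

const sourceURLFor = (meta: Meta) =>
  meta.internalOptions.unnormalizedSourceURL ?? meta.url;

// Scrape and batch-scrape requests thread the caller's string through:
console.log(sourceURLFor({
  url: "https://firecrawl.dev/",
  internalOptions: { unnormalizedSourceURL: "https://firecrawl.dev/?pagewanted=all&et_blog" },
})); // the caller's original string

// Paths that never set the field keep the old behavior:
console.log(sourceURLFor({ url: "https://firecrawl.dev/", internalOptions: {} }));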