Mirror of https://github.com/mendableai/firecrawl.git, synced 2025-11-20 20:29:27 +00:00
Revert "feat(queue-worker): always crawl links from content even if sitemapped"
This reverts commit 3c045c43a446bb7895892338c881cd7bc4f77cbf.
This commit is contained in:
parent
445fc432e9
commit
258c67ce67
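
In effect, the revert restores the if (!job.data.sitemapped) guard around link discovery in processJob: a page whose URL was seeded from the sitemap is no longer re-scanned for outgoing links, so only organically discovered pages fan the crawl out further. It also restores the comments around the getJobPriority call that the reverted commit had removed.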
@@ -300,41 +300,49 @@ async function processJob(job: Job, token: string) {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
-    if (!sc.cancelled) {
-      const crawler = crawlToCrawler(job.data.crawl_id, sc);
+    if (!job.data.sitemapped) {
+      if (!sc.cancelled) {
+        const crawler = crawlToCrawler(job.data.crawl_id, sc);
 
-      const links = crawler.filterLinks(
-        crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
-        Infinity,
-        sc.crawlerOptions?.maxDepth ?? 10
-      );
+        const links = crawler.filterLinks(
+          crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
+          Infinity,
+          sc.crawlerOptions?.maxDepth ?? 10
+        );
 
-      for (const link of links) {
-        if (await lockURL(job.data.crawl_id, sc, link)) {
-          const jobPriority = await getJobPriority({
-            plan: sc.plan as PlanType,
-            team_id: sc.team_id,
-            basePriority: job.data.crawl_id ? 20 : 10,
-          });
-          const jobId = uuidv4();
+        for (const link of links) {
+          if (await lockURL(job.data.crawl_id, sc, link)) {
+            // This seems to work really welel
+            const jobPriority = await getJobPriority({
+              plan: sc.plan as PlanType,
+              team_id: sc.team_id,
+              basePriority: job.data.crawl_id ? 20 : 10,
+            });
+            const jobId = uuidv4();
 
-          const newJob = await addScrapeJob(
-            {
-              url: link,
-              mode: "single_urls",
-              crawlerOptions: sc.crawlerOptions,
-              team_id: sc.team_id,
-              pageOptions: sc.pageOptions,
-              origin: job.data.origin,
-              crawl_id: job.data.crawl_id,
-              v1: job.data.v1,
-            },
-            {},
-            jobId,
-            jobPriority
-          );
+            // console.log("plan: ", sc.plan);
+            // console.log("team_id: ", sc.team_id)
+            // console.log("base priority: ", job.data.crawl_id ? 20 : 10)
+            // console.log("job priority: " , jobPriority, "\n\n\n")
+
+            const newJob = await addScrapeJob(
+              {
+                url: link,
+                mode: "single_urls",
+                crawlerOptions: sc.crawlerOptions,
+                team_id: sc.team_id,
+                pageOptions: sc.pageOptions,
+                origin: job.data.origin,
+                crawl_id: job.data.crawl_id,
+                v1: job.data.v1,
+              },
+              {},
+              jobId,
+              jobPriority
+            );
 
-          await addCrawlJob(job.data.crawl_id, newJob.id);
-        }
-      }
-    }
+            await addCrawlJob(job.data.crawl_id, newJob.id);
+          }
+        }
+      }
+    }
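
For illustration, a minimal, self-contained sketch of the control flow this revert restores. The names ScrapeJobData, discoverLinks, processJobSketch, and the in-memory queue are hypothetical stand-ins, not Firecrawl's actual API; only the job.data.sitemapped guard mirrors the diff, and the premise that sitemap-seeded jobs carry sitemapped: true is an assumption inferred from the flag's name and the commit title.

// Hypothetical sketch, not Firecrawl's actual API: only the `sitemapped`
// guard mirrors the diff above; everything else is a stand-in.

interface ScrapeJobData {
  url: string;
  crawl_id: string;
  sitemapped?: boolean; // assumed true for URLs seeded from the sitemap
}

// Stand-in for crawler.extractLinksFromHTML + crawler.filterLinks.
function discoverLinks(rawHtml: string): string[] {
  return [...rawHtml.matchAll(/href="([^"]+)"/g)].map((m) => m[1]);
}

const queue: ScrapeJobData[] = []; // stand-in for addScrapeJob's queue

function processJobSketch(job: { data: ScrapeJobData }, rawHtml: string): void {
  // Restored behavior: skip link discovery for sitemap-seeded pages, since
  // every sitemap URL was already enqueued when the crawl started. The
  // reverted feature removed this guard, so sitemapped pages fanned out too.
  if (!job.data.sitemapped) {
    for (const link of discoverLinks(rawHtml)) {
      queue.push({ url: link, crawl_id: job.data.crawl_id });
    }
  }
}

// A sitemapped job enqueues nothing; an organic job fans the crawl out.
const html = '<a href="https://example.com/b">b</a>';
processJobSketch({ data: { url: "https://example.com/a", crawl_id: "c1", sitemapped: true } }, html);
console.log(queue.length); // 0
processJobSketch({ data: { url: "https://example.com/a", crawl_id: "c1" } }, html);
console.log(queue.length); // 1

Under that assumption, the guard avoids double discovery: every sitemap URL is enqueued once up front, and only pages reached organically extend the crawl frontier.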