compare format (FIR-1560) (#1405)

commit 24f5199359
parent b3b63486f1
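This change introduces a "compare" scrape format. When requested alongside "markdown", each scraped Document gains a compare field ({ previousScrapeAt, changeStatus, visibility }) describing how the page changed since the team's last scrape of the same URL. To support it, teamId is threaded through InternalOptions at every scrapeURL call site, a new deriveDiff transformer looks up the previous scrape via a Supabase RPC, and crawl finishing is split into a pre-finish phase that re-enqueues URLs seen in the previous crawl but missing from the current one, so they can be reported as removed.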
@@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 
 {
-    "url": "https://firecrawl.dev"
+    "url":"https://firecrawl.dev"
 }
 
 ### Check Crawl Status
@@ -84,6 +84,18 @@ describe("Scrape tests", () => {
   //     expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
   //   }, 30000);
   // });
 
+  describe("Compare format", () => {
+    it.concurrent("works", async () => {
+      const response = await scrape({
+        url: "https://example.com",
+        formats: ["markdown", "compare"],
+      });
+
+      expect(response.compare).toBeDefined();
+      expect(response.compare?.previousScrapeAt).not.toBeNull();
+    });
+  });
+
   describe("Location API (f-e dependant)", () => {
     it.concurrent("works without specifying an explicit location", async () => {
@@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) {
     pageOptions,
     undefined,
     undefined,
+    team_id
   );
   internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
 
@@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
     pageOptions,
     undefined,
     undefined,
+    team_id
   );
 
   const sc: StoredCrawl = {
@@ -66,6 +66,7 @@ export async function scrapeHelper(
     extractorOptions,
     timeout,
     crawlerOptions,
+    team_id,
   );
 
   await addScrapeJob(
@@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) {
     pageOptions,
     extractorOptions,
     timeout,
+    team_id,
   );
 
   logJob({
@@ -72,6 +72,7 @@ export async function searchHelper(
     undefined,
     60000,
     crawlerOptions,
+    team_id,
   );
 
   if (justSearch) {
@@ -82,7 +82,7 @@ export async function batchScrapeController(
       : {
           crawlerOptions: null,
           scrapeOptions: req.body,
-          internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
+          internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
           team_id: req.auth.team_id,
           createdAt: Date.now(),
           plan: req.auth.plan,
@@ -81,7 +81,7 @@ export async function crawlController(
     originUrl: req.body.url,
     crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
     scrapeOptions,
-    internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
+    internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
     team_id: req.auth.team_id,
     createdAt: Date.now(),
     plan: req.auth.plan,
@@ -85,7 +85,7 @@ export async function getMapResults({
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
-    internalOptions: {},
+    internalOptions: { teamId },
     team_id: teamId,
     createdAt: Date.now(),
     plan: plan,
@@ -50,7 +50,7 @@ export async function scrapeController(
     mode: "single_urls",
     team_id: req.auth.team_id,
     scrapeOptions: req.body,
-    internalOptions: {},
+    internalOptions: { teamId: req.auth.team_id },
     plan: req.auth.plan!,
     origin: req.body.origin,
     is_scrape: true,
@@ -83,7 +83,7 @@ async function scrapeSearchResult(
     mode: "single_urls" as Mode,
     team_id: options.teamId,
     scrapeOptions: options.scrapeOptions,
-    internalOptions: {},
+    internalOptions: { teamId: options.teamId },
     plan: options.plan || "free",
     origin: options.origin,
     is_scrape: true,
@@ -20,7 +20,8 @@ export type Format =
   | "links"
   | "screenshot"
   | "screenshot@fullPage"
-  | "extract";
+  | "extract"
+  | "compare";
 
 export const url = z.preprocess(
   (x) => {
@@ -165,6 +166,7 @@ const baseScrapeOptions = z
       "screenshot@fullPage",
       "extract",
       "json",
+      "compare",
     ])
     .array()
     .optional()
@@ -172,6 +174,10 @@ const baseScrapeOptions = z
     .refine(
       (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
       "You may only specify either screenshot or screenshot@fullPage",
+    )
+    .refine(
+      (x) => !x.includes("compare") || x.includes("markdown"),
+      "The compare format requires the markdown format to be specified as well",
     ),
   headers: z.record(z.string(), z.string()).optional(),
   includeTags: z.string().array().optional(),
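The second refinement enforces that compare can only be requested together with markdown, since the diff is computed over the markdown output. A minimal standalone sketch of the rule (hypothetical schema name; the real baseScrapeOptions carries many more fields and refinements):

    import { z } from "zod";

    const formats = z
      .enum([
        "markdown", "html", "rawHtml", "links", "screenshot",
        "screenshot@fullPage", "extract", "json", "compare",
      ])
      .array()
      .refine(
        (x) => !x.includes("compare") || x.includes("markdown"),
        "The compare format requires the markdown format to be specified as well",
      );

    formats.parse(["markdown", "compare"]); // passes
    formats.parse(["compare"]); // throws a ZodError with the message above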
@@ -546,6 +552,11 @@ export type Document = {
       value: unknown
     }[];
   };
+  compare?: {
+    previousScrapeAt: string | null;
+    changeStatus: "new" | "same" | "changed" | "removed";
+    visibility: "visible" | "hidden";
+  }
   metadata: {
     title?: string;
     description?: string;
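For orientation, a v1 scrape response carrying the new field might look like this (illustrative values, trimmed to the relevant keys):

    {
      "success": true,
      "data": {
        "markdown": "# Example Domain ...",
        "compare": {
          "previousScrapeAt": "2025-01-01T00:00:00.000Z",
          "changeStatus": "changed",
          "visibility": "visible"
        },
        "metadata": { "sourceURL": "https://example.com", "statusCode": 200 }
      }
    }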
@@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
   };
 }
 
-export function fromLegacyCrawlerOptions(x: any): {
+export function fromLegacyCrawlerOptions(x: any, teamId: string): {
   crawlOptions: CrawlerOptions;
   internalOptions: InternalOptions;
 } {
@@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): {
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
+      teamId,
     },
   };
 }
@@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions(
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions | undefined,
   timeout: number | undefined,
+  teamId: string,
 ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
   return {
     scrapeOptions: scrapeOptions.parse({
@@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions(
     internalOptions: {
       atsv: pageOptions.atsv,
       v0DisableJsDom: pageOptions.disableJsDom,
+      teamId,
     },
     // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
   };
@@ -906,13 +920,15 @@ export function fromLegacyCombo(
   extractorOptions: ExtractorOptions | undefined,
   timeout: number | undefined,
   crawlerOptions: any,
+  teamId: string,
 ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
   const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
     pageOptions,
     extractorOptions,
     timeout,
+    teamId,
   );
-  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
+  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId);
   return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
 }
 
@@ -154,20 +154,20 @@ export async function finishCrawlKickoff(id: string) {
   );
 }
 
-export async function finishCrawl(id: string) {
+export async function finishCrawlPre(id: string) {
   if (await isCrawlFinished(id)) {
-    _logger.debug("Marking crawl as finished.", {
+    _logger.debug("Marking crawl as pre-finished.", {
       module: "crawl-redis",
-      method: "finishCrawl",
+      method: "finishCrawlPre",
       crawlId: id,
     });
-    const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
-    await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
+    const set = await redisConnection.setnx("crawl:" + id + ":finished_pre", "yes");
+    await redisConnection.expire("crawl:" + id + ":finished_pre", 24 * 60 * 60);
     return set === 1;
   } else {
-    _logger.debug("Crawl can not be finished yet, not marking as finished.", {
+    _logger.debug("Crawl can not be pre-finished yet, not marking as finished.", {
       module: "crawl-redis",
-      method: "finishCrawl",
+      method: "finishCrawlPre",
       crawlId: id,
       jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"),
       jobs: await redisConnection.scard("crawl:" + id + ":jobs"),
@@ -177,6 +177,16 @@ export async function finishCrawl(id: string) {
   }
 }
 
+export async function finishCrawl(id: string) {
+  _logger.debug("Marking crawl as finished.", {
+    module: "crawl-redis",
+    method: "finishCrawl",
+    crawlId: id,
+  });
+  await redisConnection.set("crawl:" + id + ":finish", "yes");
+  await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
+}
+
 export async function getCrawlJobs(id: string): Promise<string[]> {
   return await redisConnection.smembers("crawl:" + id + ":jobs");
 }
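finishCrawlPre keeps the SETNX semantics the old finishCrawl had, so exactly one worker wins the pre-finish and gets to run the removed-URL pass (see the queue worker below), while the new finishCrawl sets the final flag unconditionally. The idempotency idiom in isolation, as a sketch assuming an ioredis connection:

    import Redis from "ioredis";

    const redis = new Redis();

    // SETNX sets the key only if it does not exist yet and returns 1 on
    // success, so workers racing on the same crawl id agree on one winner.
    async function claimOnce(key: string): Promise<boolean> {
      const set = await redis.setnx(key, "yes");
      await redis.expire(key, 24 * 60 * 60); // expire like the crawl keys do
      return set === 1;
    }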
@@ -250,7 +260,7 @@ export function generateURLPermutations(url: string | URL): URL[] {
     return [urlWithHTML, urlWithPHP, urlWithSlash, urlWithBare];
   });
 
-  return permutations;
+  return [...new Set(permutations.map(x => x.href))].map(x => new URL(x));
 }
 
 export async function lockURL(
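The new return line dedupes by string href rather than by URL object identity, because two distinct URL instances never compare equal inside a Set. In isolation:

    const urls = [new URL("https://example.com/"), new URL("https://example.com/")];
    console.log(new Set(urls).size); // 2: distinct objects
    const unique = [...new Set(urls.map((x) => x.href))].map((x) => new URL(x));
    console.log(unique.length); // 1: hrefs collapse to a single entry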
@@ -44,6 +44,7 @@ export async function scrapeDocument(
     scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }),
     internalOptions: {
       useCache: true,
+      teamId: options.teamId,
     },
     plan: options.plan,
     origin: options.origin,
@@ -51,6 +51,7 @@ export async function startWebScraperPipeline({
     priority: job.opts.priority,
     is_scrape: job.data.is_scrape ?? false,
     is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
+    urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false,
   });
 }
 
@@ -66,6 +67,7 @@ export async function runWebScraper({
   priority,
   is_scrape = false,
   is_crawl = false,
+  urlInvisibleInCurrentCrawl = false,
 }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
   const logger = _logger.child({
     method: "runWebScraper",
@@ -97,6 +99,8 @@ export async function runWebScraper({
     response = await scrapeURL(bull_job_id, url, scrapeOptions, {
       priority,
       ...internalOptions,
+      urlInvisibleInCurrentCrawl,
+      teamId: internalOptions?.teamId ?? team_id,
     });
     if (!response.success) {
       if (response.error instanceof Error) {
@@ -47,6 +47,7 @@ export async function getLinksFromSitemap(
       ],
       v0DisableJsDom: true,
       abort,
+      teamId: "sitemap",
     },
   );
 
@@ -162,6 +162,8 @@ async function buildMetaObject(
 }
 
 export type InternalOptions = {
+  teamId: string;
+
   priority?: number; // Passed along to fire-engine
   forceEngine?: Engine | Engine[];
   atsv?: boolean; // anti-bot solver, beta
@@ -173,6 +175,7 @@ export type InternalOptions = {
   isBackgroundIndex?: boolean;
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
+  urlInvisibleInCurrentCrawl?: boolean;
 };
 
 export type EngineResultsTracker = {
@@ -383,7 +386,7 @@ export async function scrapeURL(
   id: string,
   url: string,
   options: ScrapeOptions,
-  internalOptions: InternalOptions = {},
+  internalOptions: InternalOptions,
 ): Promise<ScrapeUrlResponse> {
   const meta = await buildMetaObject(id, url, options, internalOptions);
   try {
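With the default value removed, internalOptions (and therefore teamId) becomes mandatory at every scrapeURL call site, which is why each standalone test below now passes an explicit teamId of "test".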
@@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-basic",
       "https://www.roastmywebsite.ai/",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => {
       scrapeOptions.parse({
         formats: ["markdown", "html"],
       }),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => {
       scrapeOptions.parse({
         onlyMainContent: false,
       }),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
         onlyMainContent: false,
         excludeTags: [".nav", "#footer", "strong"],
       }),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-400",
       "https://httpstat.us/400",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-401",
       "https://httpstat.us/401",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
    );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-403",
       "https://httpstat.us/403",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-404",
       "https://httpstat.us/404",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-405",
       "https://httpstat.us/405",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-500",
       "https://httpstat.us/500",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-redirect",
       "https://scrapethissite.com/",
       scrapeOptions.parse({}),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => {
       scrapeOptions.parse({
         formats: ["screenshot"],
       }),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => {
       scrapeOptions.parse({
         formats: ["screenshot@fullPage"],
       }),
-      { forceEngine },
+      { forceEngine, teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-pdf",
       "https://arxiv.org/pdf/astro-ph/9301001.pdf",
       scrapeOptions.parse({}),
+      { teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => {
       "test:scrape-docx",
       "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
       scrapeOptions.parse({}),
+      { teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => {
           },
         },
       }),
+      { teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => {
           },
         },
       }),
+      { teamId: "test" },
     );
 
     // expect(out.logs.length).toBeGreaterThan(0);
@@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => {
       async (i) => {
         const url = "https://www.scrapethissite.com/?i=" + i;
         const id = "test:concurrent:" + url;
-        const out = await scrapeURL(id, url, scrapeOptions.parse({}));
+        const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" });
 
         const replacer = (key: string, value: any) => {
           if (value instanceof Error) {
apps/api/src/scraper/scrapeURL/transformers/diff.ts (new file, 42 lines)
@@ -0,0 +1,42 @@
+import { supabase_service } from "../../../services/supabase";
+import { Document } from "../../../controllers/v1/types";
+import { Meta } from "../index";
+
+export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
+  if (meta.options.formats.includes("compare")) {
+    const res = await supabase_service
+      .rpc("diff_get_last_scrape_1", {
+        i_team_id: meta.internalOptions.teamId,
+        i_url: document.metadata.sourceURL ?? meta.url,
+      });
+
+    const data: {
+      o_docs: Document[],
+      o_date_added: string,
+    } | undefined | null = (res.data ?? [])[0] as any;
+
+    if (data && data.o_docs.length > 0) {
+      const previousMarkdown = data.o_docs[0].markdown!;
+      const currentMarkdown = document.markdown!;
+
+      const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
+
+      document.compare = {
+        previousScrapeAt: data.o_date_added,
+        changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed",
+        visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
+      }
+    } else if (!res.error) {
+      document.compare = {
+        previousScrapeAt: null,
+        changeStatus: document.metadata.statusCode === 404 ? "removed" : "new",
+        visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
+      }
+    } else {
+      meta.logger.error("Error fetching previous scrape", { error: res.error });
+      document.warning = "Comparing failed, please try again later." + (document.warning ? ` ${document.warning}` : "");
+    }
+  }
+
+  return document;
+}
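The change detection is deliberately loose: the transformer strips all whitespace and [iframe](...) links, then sorts the remaining characters, so content that is merely reordered counts as "same". The helper in isolation:

    const transformer = (x: string) =>
      [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");

    transformer("Hello  world") === transformer("world Hello"); // true: same characters
    transformer("Hello world") === transformer("Hello there"); // false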
@@ -8,7 +8,8 @@ import { performLLMExtract } from "./llmExtract";
 import { uploadScreenshot } from "./uploadScreenshot";
 import { removeBase64Images } from "./removeBase64Images";
 import { saveToCache } from "./cache";
+import { deriveDiff } from "./diff";
 
 export type Transformer = (
   meta: Meta,
   document: Document,
@@ -148,6 +148,17 @@ export function coerceFieldsToFormats(
     );
   }
 
+  if (!formats.has("compare") && document.compare !== undefined) {
+    meta.logger.warn(
+      "Removed compare from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
+    );
+    delete document.compare;
+  } else if (formats.has("compare") && document.compare === undefined) {
+    meta.logger.warn(
+      "Request had format compare, but there was no compare field in the result.",
+    );
+  }
+
   if (meta.options.actions === undefined || meta.options.actions.length === 0) {
     delete document.actions;
   }
@@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [
   deriveMetadataFromRawHTML,
   uploadScreenshot,
   performLLMExtract,
+  deriveDiff,
   coerceFieldsToFormats,
   removeBase64Images,
 ];
@@ -28,6 +28,7 @@ import {
   addCrawlJobs,
   crawlToCrawler,
   finishCrawl,
+  finishCrawlPre,
   finishCrawlKickoff,
   generateURLPermutations,
   getCrawl,
@@ -100,7 +101,77 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 const runningJobs: Set<string> = new Set();
 
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
-  if (await finishCrawl(job.data.crawl_id)) {
+  if (await finishCrawlPre(job.data.crawl_id)) {
+    if (job.data.crawlerOptions && !await redisConnection.exists("crawl:" + job.data.crawl_id + ":invisible_urls")) {
+      await redisConnection.set("crawl:" + job.data.crawl_id + ":invisible_urls", "done", "EX", 60 * 60 * 24);
+
+      const sc = (await getCrawl(job.data.crawl_id))!;
+
+      const visitedUrls = new Set(await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited_unique",
+      ));
+
+      const lastUrls: string[] = ((await supabase_service.rpc("diff_get_last_crawl_urls", {
+        i_team_id: job.data.team_id,
+        i_url: sc.originUrl!,
+      })).data ?? []).map(x => x.url);
+
+      const lastUrlsSet = new Set(lastUrls);
+
+      const univistedUrls = Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x));
+      const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id));
+
+      console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount);
+
+      if (univistedUrls.length !== 0 && addableJobCount > 0) {
+        const jobs = univistedUrls.slice(0, addableJobCount).map((url) => {
+          const uuid = uuidv4();
+          return {
+            name: uuid,
+            data: {
+              url,
+              mode: "single_urls" as const,
+              team_id: job.data.team_id,
+              plan: job.data.plan!,
+              crawlerOptions: {
+                ...job.data.crawlerOptions,
+                urlInvisibleInCurrentCrawl: true,
+              },
+              scrapeOptions: job.data.scrapeOptions,
+              internalOptions: sc.internalOptions,
+              origin: job.data.origin,
+              crawl_id: job.data.crawl_id,
+              sitemapped: true,
+              webhook: job.data.webhook,
+              v1: job.data.v1,
+            },
+            opts: {
+              jobId: uuid,
+              priority: 20,
+            },
+          };
+        });
+
+        const lockedIds = await lockURLsIndividually(
+          job.data.crawl_id,
+          sc,
+          jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
+        );
+        const lockedJobs = jobs.filter((x) =>
+          lockedIds.find((y) => y.id === x.opts.jobId),
+        );
+        await addCrawlJobs(
+          job.data.crawl_id,
+          lockedJobs.map((x) => x.opts.jobId),
+        );
+        await addScrapeJobs(lockedJobs);
+
+        return;
+      }
+    }
+
+    await finishCrawl(job.data.crawl_id);
+
     (async () => {
       const originUrl = sc.originUrl
         ? normalizeUrlOnlyHostname(sc.originUrl)
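Taken together: once a crawl's own jobs drain, the single worker that wins finishCrawlPre fetches the URL list from the previous crawl of the same origin (via the diff_get_last_crawl_urls RPC), enqueues any URLs this crawl did not visit with urlInvisibleInCurrentCrawl set, and returns early; those extra jobs let deriveDiff report the pages as visibility "hidden" (and changeStatus "removed" when they now 404). Only after that pass does finishCrawl mark the crawl finished.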
@@ -59,6 +59,7 @@ export interface RunWebScraperParams {
   priority?: number;
   is_scrape?: boolean;
   is_crawl?: boolean;
+  urlInvisibleInCurrentCrawl?: boolean;
 }
 
 export type RunWebScraperResult =
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.21.0",
+  "version": "1.21.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -69,6 +69,11 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
   actions: ActionsSchema;
+  compare?: {
+    previousScrapeAt: string | null;
+    changeStatus: "new" | "same" | "changed" | "removed";
+    visibility: "visible" | "hidden";
+  };
   // v1 search only
   title?: string;
   description?: string;
@@ -79,7 +84,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  * Defines the options and configurations available for scraping web content.
  */
 export interface CrawlScrapeOptions {
-  formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json")[];
+  formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json" | "compare")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
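With the SDK types updated, requesting a compare scrape from JavaScript might look like this (a sketch; assumes a valid API key and the published @mendable/firecrawl-js package):

    import FirecrawlApp from "@mendable/firecrawl-js";

    const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

    const doc = await app.scrapeUrl("https://example.com", {
      formats: ["markdown", "compare"], // compare requires markdown
    });

    if (doc.success) {
      // "new" | "same" | "changed" | "removed"
      console.log(doc.compare?.changeStatus);
    }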