feat(api): GET /crawl/ongoing (FIR-2189) (#1620)

* feat(api): GET /crawl/ongoing

* fix: routers in wrong order

* feat(api/crawl/ongoing): return more details

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Gergő Móricz 2025-06-04 18:14:23 +02:00 committed by GitHub
parent 077c5dd8ec
commit a05c4ae97d
6 changed files with 137 additions and 2 deletions
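For quick reference, the new endpoint can be exercised directly. A minimal sketch using Node 18+'s built-in fetch; the base URL and the TEST_API_KEY variable are assumptions mirroring the test setup below, not part of this commit:

// Sketch only: host/port and API-key handling are assumed, not taken from the diff.
const res = await fetch("http://localhost:3002/v1/crawl/ongoing", {
  headers: { Authorization: `Bearer ${process.env.TEST_API_KEY}` },
});

// On success the body matches OngoingCrawlsResponse:
// { success: true, crawls: [{ id, teamId, url, options }, ...] }
const body = await res.json();
if (body.success) {
  console.log(body.crawls.map((c: { id: string; url: string }) => `${c.id} ${c.url}`));
}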

View File

@@ -1,4 +1,4 @@
import { crawl } from "./lib";
import { asyncCrawl, asyncCrawlWaitForFinish, crawl, crawlOngoing } from "./lib";
import { describe, it, expect } from "@jest/globals";
describe("Crawl tests", () => {
@@ -45,6 +45,23 @@ describe("Crawl tests", () => {
delay: 5,
});
}, 300000);
it.concurrent("ongoing crawls endpoint works", async () => {
const res = await asyncCrawl({
url: "https://firecrawl.dev",
limit: 3,
});
const ongoing = await crawlOngoing();
expect(ongoing.crawls.find(x => x.id === res.id)).toBeDefined();
await asyncCrawlWaitForFinish(res.id);
const ongoing2 = await crawlOngoing();
expect(ongoing2.crawls.find(x => x.id === res.id)).toBeUndefined();
}, 120000);
// TEMP: Flaky
// it.concurrent("discovers URLs properly when origin is not included", async () => {

View File

@@ -1,7 +1,7 @@
import { configDotenv } from "dotenv";
configDotenv();
import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse, ErrorResponse } from "../../controllers/v1/types";
import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse, CrawlResponse, OngoingCrawlsResponse, ErrorResponse } from "../../controllers/v1/types";
import request from "supertest";
// =========================================
@@ -90,6 +90,20 @@ async function crawlStatus(id: string) {
.send();
}
async function crawlOngoingRaw() {
return await request(TEST_URL)
.get("/v1/crawl/ongoing")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.send();
}
export async function crawlOngoing(): Promise<Exclude<OngoingCrawlsResponse, ErrorResponse>> {
const res = await crawlOngoingRaw();
expect(res.statusCode).toBe(200);
expect(res.body.success).toBe(true);
return res.body;
}
function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
@@ -106,6 +120,25 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>)
expect(response.body.data.length).toBeGreaterThan(0);
}
export async function asyncCrawl(body: CrawlRequestInput): Promise<Exclude<CrawlResponse, ErrorResponse>> {
const cs = await crawlStart(body);
expectCrawlStartToSucceed(cs);
return cs.body;
}
export async function asyncCrawlWaitForFinish(id: string): Promise<Exclude<CrawlStatusResponse, ErrorResponse>> {
let x;
do {
x = await crawlStatus(id);
expect(x.statusCode).toBe(200);
expect(typeof x.body.status).toBe("string");
} while (x.body.status === "scraping");
expectCrawlToSucceed(x);
return x.body;
}
export async function crawl(body: CrawlRequestInput): Promise<Exclude<CrawlStatusResponse, ErrorResponse>> {
const cs = await crawlStart(body);
expectCrawlStartToSucceed(cs);

View File

@@ -0,0 +1,34 @@
import { Response } from "express";
import {
OngoingCrawlsResponse,
RequestWithAuth,
toNewCrawlerOptions,
} from "./types";
import {
getCrawl,
getCrawlsByTeamId,
} from "../../lib/crawl-redis";
import { configDotenv } from "dotenv";
configDotenv();
export async function ongoingCrawlsController(
req: RequestWithAuth<{}, undefined, OngoingCrawlsResponse>,
res: Response<OngoingCrawlsResponse>,
) {
const ids = await getCrawlsByTeamId(req.auth.team_id);
const crawls = (await Promise.all(ids.map(async id => ({ ...(await getCrawl(id)), id })))).filter((crawl) => crawl !== null && !crawl.cancelled);
res.status(200).json({
success: true,
crawls: crawls.map(x => ({
id: x.id,
teamId: x.team_id!,
url: x.originUrl!,
options: {
...toNewCrawlerOptions(x.crawlerOptions),
scrapeOptions: x.scrapeOptions,
},
})),
});
}
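For illustration, a successful response from this controller has the shape below; every value is invented, not taken from a real crawl:

// Illustrative OngoingCrawlsResponse payload; id, teamId, url, and options are made up.
const exampleResponse = {
  success: true,
  crawls: [
    {
      id: "00000000-0000-0000-0000-000000000000",
      teamId: "team_123",
      url: "https://firecrawl.dev",
      options: {
        limit: 3,
        scrapeOptions: { formats: ["markdown"] },
      },
    },
  ],
};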

View File

@@ -882,6 +882,18 @@ export type CrawlStatusResponse =
data: Document[];
};
export type OngoingCrawlsResponse =
| ErrorResponse
| {
success: true;
crawls: {
id: string;
teamId: string;
url: string;
options: CrawlerOptions;
}[];
};
export type CrawlErrorsResponse =
| ErrorResponse
| {
@@ -1016,6 +1028,25 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
};
}
export function toNewCrawlerOptions(x: any): CrawlerOptions {
return {
includePaths: x.includes,
excludePaths: x.excludes,
limit: x.limit,
maxDepth: x.maxDepth,
allowBackwardLinks: x.allowBackwardCrawling,
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
maxDiscoveryDepth: x.maxDiscoveryDepth,
delay: x.delay,
}
}
export function fromLegacyCrawlerOptions(x: any, teamId: string): {
crawlOptions: CrawlerOptions;
internalOptions: InternalOptions;
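To make toNewCrawlerOptions's key renames concrete, here is a hypothetical legacy options object (as it might sit in Redis) and what the helper returns for it:

// Hypothetical legacy crawlerOptions; only keys exercised by the mapping are shown.
const legacy = {
  includes: ["/blog/*"],
  excludes: ["/admin/*"],
  limit: 100,
  allowBackwardCrawling: true,
  allowExternalContentLinks: false,
};

const mapped = toNewCrawlerOptions(legacy);
// mapped.includePaths === ["/blog/*"], mapped.excludePaths === ["/admin/*"],
// mapped.allowBackwardLinks === true, mapped.allowExternalLinks === false;
// keys absent from the input simply come back undefined.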

View File

@@ -26,6 +26,13 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) {
});
await redisEvictConnection.set("crawl:" + id, JSON.stringify(crawl));
await redisEvictConnection.expire("crawl:" + id, 24 * 60 * 60);
await redisEvictConnection.sadd("crawls_by_team_id:" + crawl.team_id, id);
await redisEvictConnection.expire("crawls_by_team_id:" + crawl.team_id, 24 * 60 * 60);
}
export async function getCrawlsByTeamId(team_id: string): Promise<string[]> {
return await redisEvictConnection.smembers("crawls_by_team_id:" + team_id);
}
export async function getCrawl(id: string): Promise<StoredCrawl | null> {
@@ -184,6 +191,12 @@ export async function finishCrawl(id: string) {
});
await redisEvictConnection.set("crawl:" + id + ":finish", "yes");
await redisEvictConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
const crawl = await getCrawl(id);
if (crawl && crawl.team_id) {
await redisEvictConnection.srem("crawls_by_team_id:" + crawl.team_id, id);
await redisEvictConnection.expire("crawls_by_team_id:" + crawl.team_id, 24 * 60 * 60);
}
}
export async function getCrawlJobs(id: string): Promise<string[]> {
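Taken together, the two hunks above maintain a per-team index of in-flight crawls: saveCrawl adds the crawl id to the crawls_by_team_id:<team_id> set, finishCrawl removes it, and getCrawlsByTeamId reads it for the controller. Refreshing the 24-hour expiry on both paths keeps the set's TTL aligned with the crawl:<id> keys, so the index cannot outlive the crawl data it points to.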

View File

@@ -35,6 +35,7 @@ import { generateLLMsTextStatusController } from "../controllers/v1/generate-llm
import { deepResearchController } from "../controllers/v1/deep-research";
import { deepResearchStatusController } from "../controllers/v1/deep-research-status";
import { tokenUsageController } from "../controllers/v1/token-usage";
import { ongoingCrawlsController } from "../controllers/v1/crawl-ongoing";
function checkCreditsMiddleware(
minimum?: number,
@@ -213,6 +214,12 @@ v1Router.post(
wrap(mapController),
);
v1Router.get(
"/crawl/ongoing",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(ongoingCrawlsController),
);
v1Router.get(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
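Registration order here is presumably what the "fix: routers in wrong order" bullet in the commit message addresses: Express matches routes in the order they are registered, so the literal /crawl/ongoing path must come before the parameterized /crawl/:jobId, or a request for /v1/crawl/ongoing would be captured with jobId === "ongoing". A minimal sketch of the pitfall, assuming a bare Express app (the repo's router is Express-based, per the imports):

import express from "express";

const app = express();

// Correct: the literal route is registered first and wins the match.
app.get("/v1/crawl/ongoing", (_req, res) => {
  res.json({ route: "ongoing" });
});

// Registered second, so only real job ids reach this handler.
app.get("/v1/crawl/:jobId", (req, res) => {
  res.json({ route: "status", jobId: req.params.jobId });
});

app.listen(3002);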