2024-04-21 11:27:31 -07:00
import request from "supertest" ;
import { app } from "../../index" ;
import dotenv from "dotenv" ;
2024-05-07 15:29:27 -03:00
import { v4 as uuidv4 } from "uuid" ;
2024-04-18 16:28:01 -03:00
// Load variables from a .env file into process.env. The tests below read
// process.env.TEST_API_KEY (presumably defined in that file — confirm).
dotenv . config ( ) ;
2024-04-20 19:53:04 -07:00
// const TEST_URL = 'http://localhost:3002'
2024-04-21 11:27:31 -07:00
// Base URL that the request(TEST_URL) calls in this suite are sent to.
// NOTE(review): presumably the API server must already be listening on this
// address before the suite starts — confirm against the test runner setup.
const TEST_URL = "http://127.0.0.1:3002" ;
2024-04-20 19:53:04 -07:00
2024-04-28 17:38:20 -07:00
describe ( "E2E Tests for API Routes" , ( ) = > {
// Suite-wide environment setup: forces USE_DB_AUTHENTICATION to "true" while
// the tests run (presumably switching the API to database-backed auth — confirm).
// The value that was present before the suite is remembered so that teardown
// puts it back instead of unconditionally deleting it.
let previousUseDbAuthentication;

beforeAll(() => {
  previousUseDbAuthentication = process.env.USE_DB_AUTHENTICATION;
  process.env.USE_DB_AUTHENTICATION = "true";
});

afterAll(() => {
  // Leave process.env exactly as it was found: remove the variable only when it
  // was originally unset, otherwise restore the original value.
  if (previousUseDbAuthentication === undefined) {
    delete process.env.USE_DB_AUTHENTICATION;
  } else {
    process.env.USE_DB_AUTHENTICATION = previousUseDbAuthentication;
  }
});
// Root endpoint: must answer 200 with the service banner text.
describe("GET /", () => {
  it.concurrent("should return Hello, world! message", async () => {
    const { statusCode, text } = await request(TEST_URL).get("/");

    expect(statusCode).toBe(200);
    expect(text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
  });
});
2024-04-18 16:28:01 -03:00
2024-04-28 17:38:20 -07:00
// /test endpoint: must answer 200 with a greeting in the body.
describe("GET /test", () => {
  it.concurrent("should return Hello, world! message", async () => {
    const { statusCode, text } = await request(TEST_URL).get("/test");

    expect(statusCode).toBe(200);
    expect(text).toContain("Hello, world!");
  });
});
2024-04-18 16:28:01 -03:00
2024-04-28 17:38:20 -07:00
describe ( "POST /v0/scrape" , ( ) = > {
2024-06-06 15:36:20 -03:00
// A scrape request without an Authorization header must be rejected with 401.
// The request goes to TEST_URL, like every other test in this suite, instead of
// the in-process `app`, so all tests exercise the same server instance.
it.concurrent("should require authorization", async () => {
  const response = await request(TEST_URL).post("/v0/scrape");
  expect(response.statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// A bearer token that is not a valid API key must be rejected with 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const scrapeRequest = request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer invalid-api-key`)
    .set("Content-Type", "application/json");

  const { statusCode } = await scrapeRequest.send({ url: "https://firecrawl.dev" });

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// Scraping a blocklisted (social media) URL must return 403 together with the
// policy explanation in the error field.
it.concurrent("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://facebook.com/fake-test" };

  const { statusCode, body } = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(statusCode).toBe(403);
  expect(body.error).toContain(
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
  );
});
2024-06-06 15:36:20 -03:00
// tested on rate limit test
// it.concurrent("should return a successful response with a valid preview token", async () => {
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer this_is_just_a_preview_token`)
// .set("Content-Type", "application/json")
// .send({ url: "https://roastmywebsite.ai" });
// expect(response.statusCode).toBe(200);
// }, 30000); // 30 seconds timeout
2024-04-28 17:38:20 -07:00
2024-06-06 15:36:20 -03:00
// Happy path: a scrape with a valid key returns content, markdown and metadata,
// and does not include raw HTML unless it is asked for.
it.concurrent("should return a successful response with a valid API key", async () => {
  const { statusCode, body } = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({ url: "https://roastmywebsite.ai" });

  expect(statusCode).toBe(200);
  expect(body).toHaveProperty("data");

  const { data } = body;
  for (const field of ["content", "markdown", "metadata"]) {
    expect(data).toHaveProperty(field);
  }
  expect(data).not.toHaveProperty("html");
  expect(data.content).toContain("_Roast_");
}, 30000); // 30 seconds timeout
2024-05-06 11:36:44 -03:00
2024-06-06 15:36:20 -03:00
// With pageOptions.includeHtml the scrape result must also carry an html field.
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
  const payload = {
    url: "https://roastmywebsite.ai",
    pageOptions: { includeHtml: true },
  };

  const { statusCode, body } = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(statusCode).toBe(200);
  expect(body).toHaveProperty("data");

  const { data } = body;
  for (const field of ["content", "markdown", "html", "metadata"]) {
    expect(data).toHaveProperty(field);
  }
  expect(data.content).toContain("_Roast_");
  expect(data.markdown).toContain("_Roast_");
  expect(data.html).toContain("<h1");
}, 30000); // 30 seconds timeout
2024-05-10 15:53:13 -03:00
2024-06-06 15:36:20 -03:00
// Scraping a URL that ends in .pdf must return the extracted PDF text.
// The response is fully received once the awaited request resolves, so the
// assertions run immediately; no extra delay is needed before checking it.
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
  const response = await request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('data');
  expect(response.body.data).toHaveProperty('content');
  expect(response.body.data).toHaveProperty('metadata');
  expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
2024-05-10 15:53:13 -03:00
2024-06-06 15:36:20 -03:00
// Same PDF as the ".pdf" test, but addressed without the file extension; the
// extracted text must still be returned.
// The response is fully received once the awaited request resolves, so the
// assertions run immediately; no extra delay is needed before checking it.
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
  const response = await request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty('data');
  expect(response.body.data).toHaveProperty('content');
  expect(response.body.data).toHaveProperty('metadata');
  expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
2024-05-28 12:56:24 -07:00
2024-05-28 17:17:12 -07:00
// TODO: add this test back once we nail the waitFor option to be more deterministic
2024-06-06 15:36:20 -03:00
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
2024-05-28 17:17:12 -07:00
// const startTime = Date.now();
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
// const endTime = Date.now();
// const duration = endTime - startTime;
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("data");
// expect(response.body.data).toHaveProperty("content");
// expect(response.body.data).toHaveProperty("markdown");
// expect(response.body.data).toHaveProperty("metadata");
// expect(response.body.data).not.toHaveProperty("html");
// expect(response.body.data.content).toContain("🔥 Firecrawl");
2024-05-28 17:17:20 -07:00
// expect(duration).toBeGreaterThanOrEqual(7000);
2024-05-28 17:17:12 -07:00
// }, 12000); // 12 seconds timeout
2024-04-28 17:38:20 -07:00
} ) ;
describe ( "POST /v0/crawl" , ( ) = > {
2024-06-06 15:36:20 -03:00
// A crawl request without an Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const { statusCode } = await request(TEST_URL).post("/v0/crawl");

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// A bearer token that is not a valid API key must be rejected with 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const crawlRequest = request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer invalid-api-key`)
    .set("Content-Type", "application/json");

  const { statusCode } = await crawlRequest.send({ url: "https://firecrawl.dev" });

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// Crawling a blocklisted (social media) URL must return 403 together with the
// policy explanation in the error field.
it.concurrent("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://twitter.com/fake-test" };

  const { statusCode, body } = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(statusCode).toBe(403);
  expect(body.error).toContain(
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
  );
});
2024-04-23 16:56:09 -07:00
2024-06-06 15:36:20 -03:00
// Starting a crawl with a valid key returns 200 and a jobId that matches the
// UUID pattern below.
it.concurrent("should return a successful response with a valid API key for crawl", async () => {
  const uuidPattern =
    /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/;

  const { statusCode, body } = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({ url: "https://firecrawl.dev" });

  expect(statusCode).toBe(200);
  expect(body).toHaveProperty("jobId");
  expect(body.jobId).toMatch(uuidPattern);
});
2024-06-06 15:36:20 -03:00
// Two identical crawl requests that share one x-idempotency-key: the first must
// succeed, the second must be refused with 409.
it.concurrent('should prevent duplicate requests using the same idempotency key', async () => {
  const uniqueIdempotencyKey = uuidv4();

  // Builds the same crawl request each time it is called.
  const submitCrawl = () =>
    request(TEST_URL)
      .post('/v0/crawl')
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .set("x-idempotency-key", uniqueIdempotencyKey)
      .send({ url: 'https://mendable.ai' });

  const firstResponse = await submitCrawl();
  expect(firstResponse.statusCode).toBe(200);

  const secondResponse = await submitCrawl();
  expect(secondResponse.statusCode).toBe(409);
  expect(secondResponse.body.error).toBe('Idempotency key already used');
});
2024-04-23 16:56:09 -07:00
2024-06-06 15:36:20 -03:00
// Crawl restricted by crawlerOptions.includes: every returned page must live
// under the blog path.
it.concurrent("should return a successful response with a valid API key and valid includes option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      limit: 10,
      crawlerOptions: {
        includes: ["blog/*"],
      },
    });

  // Poll the status endpoint once per second until the job reports "completed".
  let statusResponse;
  for (;;) {
    statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

    expect(statusResponse.statusCode).toBe(200);
    expect(statusResponse.body).toHaveProperty("status");

    if (statusResponse.body.status === "completed") {
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  const completedResponse = statusResponse;
  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(5);
  for (const url of urls) {
    expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
  }

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");

  const firstDocument = completedResponse.body.data[0];
  for (const field of ["content", "markdown", "metadata"]) {
    expect(firstDocument).toHaveProperty(field);
  }
  expect(firstDocument.content).toContain("Mendable");
}, 60000); // 60 seconds
2024-06-06 15:36:20 -03:00
// Crawl restricted by crawlerOptions.excludes: none of the returned pages may
// live under the blog path.
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      limit: 10,
      crawlerOptions: {
        excludes: ["blog/*"],
      },
    });

  let isFinished = false;
  let response;

  // Poll the status endpoint once per second until the job reports "completed".
  while (!isFinished) {
    response = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("status");
    isFinished = response.body.status === "completed";

    if (!isFinished) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  const completedResponse = response;
  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(5);
  urls.forEach((url: string) => {
    // The prefix must be spelled with "www." exactly as in the includes test:
    // a misspelled host can never match, which would let this check pass for any URL.
    expect(url.startsWith("https://www.mendable.ai/blog/")).toBeFalsy();
  });
}, 90000); // 90 seconds
2024-05-15 15:50:50 -03:00
2024-06-06 15:36:20 -03:00
// Crawl capped by crawlerOptions.limit: exactly three documents must come back.
it.concurrent("should return a successful response with a valid API key and limit to 3", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      crawlerOptions: { limit: 3 },
    });

  // Poll the status endpoint once per second until the job reports "completed".
  let statusResponse;
  for (;;) {
    statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

    expect(statusResponse.statusCode).toBe(200);
    expect(statusResponse.body).toHaveProperty("status");

    if (statusResponse.body.status === "completed") {
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  const completedResponse = statusResponse;
  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data.length).toBe(3);

  const firstDocument = completedResponse.body.data[0];
  for (const field of ["content", "markdown", "metadata"]) {
    expect(firstDocument).toHaveProperty(field);
  }
  expect(firstDocument.content).toContain("Mendable");
}, 60000); // 60 seconds
2024-06-06 15:36:20 -03:00
// Crawl with crawlerOptions.maxDepth: once the job completes, every returned
// URL may have at most one path segment.
// Completion is detected by polling the status endpoint once per second (the
// same approach as the neighbouring crawl tests) rather than by sleeping for a
// fixed minute, so the test finishes as soon as the job does and still has the
// full Jest timeout available for slower runs.
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://www.scrapethissite.com",
      crawlerOptions: { maxDepth: 2 },
    });
  expect(crawlResponse.statusCode).toBe(200);

  // Right after submission the job is expected to still be running.
  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  let isCompleted = false;
  let completedResponse;
  while (!isCompleted) {
    completedResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

    expect(completedResponse.statusCode).toBe(200);
    expect(completedResponse.body).toHaveProperty("status");
    isCompleted = completedResponse.body.status === "completed";

    if (!isCompleted) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");

  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(1);

  // Check that no URL has more than one path segment.
  // NOTE(review): the request uses maxDepth 2 while this allows one segment —
  // presumably the root page counts as depth 1; confirm against the crawler.
  urls.forEach((url: string) => {
    const depth = new URL(url).pathname.split("/").filter(Boolean).length;
    expect(depth).toBeLessThanOrEqual(1);
  });
}, 120000); // 120 seconds
2024-06-06 15:36:20 -03:00
// it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
2024-05-15 17:40:46 -07:00
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://mendable.ai",
// crawlerOptions: { limit: 10 },
// });
2024-05-15 15:50:50 -03:00
2024-05-15 17:40:46 -07:00
// const response = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("status");
// expect(response.body.status).toBe("active");
// let isCompleted = false;
// while (!isCompleted) {
// const statusCheckResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusCheckResponse.statusCode).toBe(200);
// isCompleted = statusCheckResponse.body.status === "completed";
// if (!isCompleted) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const completedResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(completedResponse.statusCode).toBe(200);
// expect(completedResponse.body).toHaveProperty("status");
// expect(completedResponse.body.status).toBe("completed");
// expect(completedResponse.body).toHaveProperty("data");
// expect(completedResponse.body.data.length).toBe(10);
// expect(completedResponse.body.data[0]).toHaveProperty("content");
// expect(completedResponse.body.data[0]).toHaveProperty("markdown");
// expect(completedResponse.body.data[0]).toHaveProperty("metadata");
// expect(completedResponse.body.data[0].content).toContain("Mendable");
// expect(completedResponse.body.data[0].content).not.toContain("main menu");
// }, 60000); // 60 seconds
2024-05-15 15:50:50 -03:00
2024-06-06 15:36:20 -03:00
// Crawl with pageOptions.includeHtml: the first crawled document must carry
// content, markdown, html and metadata.
// The final polled status response is the one that is asserted on; it already
// holds the completed job, so no additional status request is issued.
it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://roastmywebsite.ai",
      pageOptions: { includeHtml: true },
    });
  expect(crawlResponse.statusCode).toBe(200);

  // Right after submission the job is expected to still be running.
  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  // Poll the status endpoint once per second until the job reports "completed".
  let isCompleted = false;
  let completedResponse;
  while (!isCompleted) {
    completedResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

    expect(completedResponse.statusCode).toBe(200);
    isCompleted = completedResponse.body.status === "completed";

    if (!isCompleted) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("html");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");
  expect(completedResponse.body.data[0].content).toContain("_Roast_");
  expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
  expect(completedResponse.body.data[0].html).toContain("<h1");
}, 60000); // 60 seconds
2024-04-28 17:38:20 -07:00
} ) ;
describe ( "POST /v0/crawlWebsitePreview" , ( ) = > {
2024-06-06 15:36:20 -03:00
// A preview-crawl request without an Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const { statusCode } = await request(TEST_URL).post("/v0/crawlWebsitePreview");

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// A bearer token that is not a valid API key must be rejected with 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const previewRequest = request(TEST_URL)
    .post("/v0/crawlWebsitePreview")
    .set("Authorization", `Bearer invalid-api-key`)
    .set("Content-Type", "application/json");

  const { statusCode } = await previewRequest.send({ url: "https://firecrawl.dev" });

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// it.concurrent("should return an error for a blocklisted URL", async () => {
2024-05-06 19:45:56 -03:00
// const blocklistedUrl = "https://instagram.com/fake-test";
// const response = await request(TEST_URL)
// .post("/v0/crawlWebsitePreview")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: blocklistedUrl });
// // is returning 429 instead of 403
// expect(response.statusCode).toBe(403);
// expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
// });
2024-04-28 17:38:20 -07:00
2024-06-06 15:36:20 -03:00
// A scrape submitted with a 1000 ms `timeout` in its payload is expected to be
// answered with 408.
// NOTE(review): this test posts to /v0/scrape although it sits inside the
// crawlWebsitePreview describe — presumably it belongs with the scrape tests; confirm.
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
  const payload = { url: "https://firecrawl.dev", timeout: 1000 };

  const { statusCode } = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(statusCode).toBe(408);
}, 3000);
2024-06-06 16:33:27 -03:00
// it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
// const response = await request(TEST_URL)
// .post("/v0/crawlWebsitePreview")
// .set("Authorization", `Bearer this_is_just_a_preview_token`)
// .set("Content-Type", "application/json")
// .send({ url: "https://firecrawl.dev" });
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("jobId");
// expect(response.body.jobId).toMatch(
// /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
// );
// });
2024-04-28 17:38:20 -07:00
} ) ;
describe ( "POST /v0/search" , ( ) = > {
2024-06-06 15:36:20 -03:00
// A search request without an Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const { statusCode } = await request(TEST_URL).post("/v0/search");

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// A bearer token that is not a valid API key must be rejected with 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const searchRequest = request(TEST_URL)
    .post("/v0/search")
    .set("Authorization", `Bearer invalid-api-key`)
    .set("Content-Type", "application/json");

  const { statusCode } = await searchRequest.send({ query: "test" });

  expect(statusCode).toBe(401);
});
2024-04-18 16:28:01 -03:00
2024-06-06 15:36:20 -03:00
// Happy path: a search with a valid key returns 200, success === true and a data field.
it.concurrent("should return a successful response with a valid API key for search", async () => {
  const { statusCode, body } = await request(TEST_URL)
    .post("/v0/search")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({ query: "test" });

  expect(statusCode).toBe(200);
  expect(body).toHaveProperty("success");
  expect(body.success).toBe(true);
  expect(body).toHaveProperty("data");
}, 30000); // 30 seconds timeout
} ) ;
describe ( "GET /v0/crawl/status/:jobId" , ( ) = > {
2024-06-06 15:36:20 -03:00
// A status lookup without an Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const { statusCode } = await request(TEST_URL).get("/v0/crawl/status/123");

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// A bearer token that is not a valid API key must be rejected with 401.
it.concurrent("should return an error response with an invalid API key", async () => {
  const statusRequest = request(TEST_URL).get("/v0/crawl/status/123");

  const { statusCode } = await statusRequest.set("Authorization", `Bearer invalid-api-key`);

  expect(statusCode).toBe(401);
});
2024-06-06 15:36:20 -03:00
// Looking up a job id that does not exist must return 404, even with a valid key.
it.concurrent("should return Job not found for invalid job ID", async () => {
  const statusRequest = request(TEST_URL).get("/v0/crawl/status/invalidJobId");

  const { statusCode } = await statusRequest.set(
    "Authorization",
    `Bearer ${process.env.TEST_API_KEY}`
  );

  expect(statusCode).toBe(404);
});
2024-06-06 15:36:20 -03:00
// Full round trip: start a crawl of the blog section, poll until it completes,
// then check the document shape and that every document's sourceURL is a blog URL.
it.concurrent("should return a successful crawl status response for a valid crawl job", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({ url: "https://mendable.ai/blog" });
  expect(crawlResponse.statusCode).toBe(200);

  // Poll the status endpoint once per second until the job reports "completed".
  let completedResponse;
  for (;;) {
    const poll = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(poll.statusCode).toBe(200);
    expect(poll.body).toHaveProperty("status");

    if (poll.body.status === "completed") {
      completedResponse = poll;
      break;
    }
    await new Promise((r) => setTimeout(r, 1000));
  }

  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");

  const documents = completedResponse.body.data;
  for (const field of ["content", "markdown", "metadata"]) {
    expect(documents[0]).toHaveProperty(field);
  }
  expect(documents[0].content).toContain("Mendable");

  // Keeps only documents whose metadata.sourceURL points into the blog section.
  const isBlogDocument = (doc: any) =>
    doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog");
  const childrenLinks = documents.filter(isBlogDocument);

  expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 120000); // 120 seconds
2024-05-10 15:53:13 -03:00
2024-06-06 15:36:20 -03:00
// Crawls an arXiv PDF URL that has no ".pdf" suffix (limit 10, with several
// path patterns excluded), waits for the job to report "completed", and
// expects exactly one document whose content contains a known sentence.
it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;
  const crawlRequest = {
    url: 'https://arxiv.org/pdf/astro-ph/9301001',
    crawlerOptions: {
      limit: 10,
      excludes: ['list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*'],
    },
  };

  const crawlResponse = await request(TEST_URL)
    .post('/v0/crawl')
    .set('Authorization', authHeader)
    .set('Content-Type', 'application/json')
    .send(crawlRequest);
  expect(crawlResponse.statusCode).toBe(200);

  let completedResponse;
  for (;;) {
    const statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set('Authorization', authHeader);
    expect(statusResponse.statusCode).toBe(200);
    expect(statusResponse.body).toHaveProperty('status');

    if (statusResponse.body.status === 'completed') {
      completedResponse = statusResponse;
      break;
    }
    // Job still running: wait for 1 second before checking again.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  const { body } = completedResponse;
  expect(body.status).toBe('completed');
  expect(body).toHaveProperty('data');
  expect(body.data.length).toEqual(1);

  const expectedDocument = expect.objectContaining({
    content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.'),
  });
  expect(body.data).toEqual(expect.arrayContaining([expectedDocument]));
}, 120000); // 120 seconds
2024-05-07 11:06:26 -03:00
2024-06-06 15:36:20 -03:00
// Starts a crawl of https://www.scrapethissite.com with crawlerOptions.maxDepth
// set to 2, polls the status endpoint until the job reports "completed", then
// checks the document shape and that no crawled URL has more than one
// non-empty path segment.
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://www.scrapethissite.com",
      crawlerOptions: { maxDepth: 2 },
    });
  expect(crawlResponse.statusCode).toBe(200);

  let isCompleted = false;
  let completedResponse;

  while (!isCompleted) {
    const response = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("status");

    if (response.body.status === "completed") {
      isCompleted = true;
      completedResponse = response;
    } else {
      // Pause between polls, as the sibling crawl tests do, so this loop does
      // not issue status requests back-to-back while the crawl is running.
      await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
    }
  }

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");

  const urls = completedResponse.body.data.map(
    (item) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(1);

  // Check if all URLs have a maximum depth of 1, where depth is the number of
  // non-empty segments in the URL's pathname.
  urls.forEach((url) => {
    const depth = new URL(url).pathname.split("/").filter(Boolean).length;
    expect(depth).toBeLessThanOrEqual(1);
  });
}, 180000);
2024-05-07 11:06:26 -03:00
2024-06-06 15:36:20 -03:00
// Starts a crawl of https://roastmywebsite.ai with pageOptions.includeHtml
// enabled, checks that the first status read reports "active", then polls
// once per second until "completed" and verifies that the first document
// carries content, markdown, metadata and html.
it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;

  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send({
      url: "https://roastmywebsite.ai",
      pageOptions: { includeHtml: true },
    });
  expect(crawlResponse.statusCode).toBe(200);

  // Builds a fresh status request for the job each time it is called.
  const fetchStatus = () =>
    request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", authHeader);

  // The status read immediately after submission is expected to be "active".
  const initialStatus = await fetchStatus();
  expect(initialStatus.statusCode).toBe(200);
  expect(initialStatus.body).toHaveProperty("status");
  expect(initialStatus.body.status).toBe("active");

  let completedResponse;
  for (;;) {
    const polled = await fetchStatus();
    expect(polled.statusCode).toBe(200);
    expect(polled.body).toHaveProperty("status");

    if (polled.body.status === "completed") {
      completedResponse = polled;
      break;
    }
    // Job still running: wait for 1 second before checking again.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  expect(completedResponse.statusCode).toBe(200);
  const { body } = completedResponse;
  expect(body).toHaveProperty("status");
  expect(body.status).toBe("completed");
  expect(body).toHaveProperty("data");
  expect(body.data[0]).toHaveProperty("content");
  expect(body.data[0]).toHaveProperty("markdown");
  expect(body.data[0]).toHaveProperty("metadata");
  expect(body.data[0]).toHaveProperty("html");
  expect(body.data[0].content).toContain("_Roast_");
  expect(body.data[0].markdown).toContain("_Roast_");
  expect(body.data[0].html).toContain("<h1");
}, 60000);
} ) ; // 60 seconds
2024-04-28 17:38:20 -07:00
2024-06-11 17:46:25 -03:00
// Starts a crawl of https://mendable.ai/blog with allowBackwardCrawling and
// includeHtml enabled, polls once per second until the job reports
// "completed", then checks the first document's shape and that the result
// holds more documents than just those whose sourceURL contains
// "mendable.ai/blog".
it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;

  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai/blog",
      pageOptions: { includeHtml: true },
      crawlerOptions: { allowBackwardCrawling: true },
    });
  expect(crawlResponse.statusCode).toBe(200);

  let completedResponse;
  for (;;) {
    const polled = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", authHeader);
    expect(polled.statusCode).toBe(200);
    expect(polled.body).toHaveProperty("status");

    if (polled.body.status === "completed") {
      completedResponse = polled;
      break;
    }
    // Job still running: wait for 1 second before checking again.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  expect(completedResponse.statusCode).toBe(200);
  const { body } = completedResponse;
  expect(body).toHaveProperty("status");
  expect(body.status).toBe("completed");
  expect(body).toHaveProperty("data");
  expect(body.data[0]).toHaveProperty("content");
  expect(body.data[0]).toHaveProperty("markdown");
  expect(body.data[0]).toHaveProperty("metadata");
  expect(body.data[0]).toHaveProperty("html");
  expect(body.data[0].content).toContain("Mendable");
  expect(body.data[0].markdown).toContain("Mendable");

  // Documents whose source URL lies under the starting blog path.
  const onlyChildrenLinks = body.data.filter((doc) => {
    const sourceURL = doc.metadata && doc.metadata.sourceURL;
    return sourceURL ? sourceURL.includes("mendable.ai/blog") : false;
  });

  // The full result set must be strictly larger than that subset.
  expect(body.data.length).toBeGreaterThan(onlyChildrenLinks.length);
}, 60000);
2024-06-11 17:46:56 -03:00
// Starts a crawl of https://jestjs.io, waits 20 seconds, cancels it through
// the cancel endpoint, waits another 10 seconds, and then expects the status
// endpoint to report "failed" with null data and a populated partial_data.
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send({ url: "https://jestjs.io" });
  expect(crawlResponse.statusCode).toBe(200);

  // Fixed 20 second wait between starting the crawl and cancelling it.
  await pause(20000);

  const responseCancel = await request(TEST_URL)
    .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
    .set("Authorization", authHeader);
  expect(responseCancel.statusCode).toBe(200);
  expect(responseCancel.body).toHaveProperty("status");
  expect(responseCancel.body.status).toBe("cancelled");

  // Fixed 10 second wait before reading the job status after cancellation.
  await pause(10000);

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", authHeader);
  expect(completedResponse.statusCode).toBe(200);

  const { body } = completedResponse;
  expect(body).toHaveProperty("status");
  expect(body.status).toBe("failed");
  expect(body).toHaveProperty("data");
  expect(body.data).toBeNull();
  expect(body).toHaveProperty("partial_data");
  expect(body.partial_data[0]).toHaveProperty("content");
  expect(body.partial_data[0]).toHaveProperty("markdown");
  expect(body.partial_data[0]).toHaveProperty("metadata");
}, 60000); // 60 seconds
2024-05-06 17:16:43 -07:00
2024-04-30 09:20:15 -07:00
// Exercises POST /v0/scrape in "llm-extraction" mode against
// https://mendable.ai and checks the structured result returned under
// `data.llm_extraction`.
describe("POST /v0/scrape with LLM Extraction", () => {
  it.concurrent("should extract data using LLM extraction mode", async () => {
    // Schema of the object the extraction is asked to produce.
    const extractionSchema = {
      type: "object",
      properties: {
        company_mission: {
          type: "string",
        },
        supports_sso: {
          type: "boolean",
        },
        is_open_source: {
          type: "boolean",
        },
      },
      required: ["company_mission", "supports_sso", "is_open_source"],
    };

    const scrapeRequest = {
      url: "https://mendable.ai",
      pageOptions: {
        onlyMainContent: true,
      },
      extractorOptions: {
        mode: "llm-extraction",
        extractionPrompt:
          "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        extractionSchema,
      },
    };

    const response = await request(TEST_URL)
      .post("/v0/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    // The request must succeed before the extraction result is inspected.
    expect(response.statusCode).toBe(200);

    // The extraction result is read from `data.llm_extraction` in the body.
    const extracted = response.body.data.llm_extraction;

    // Each required property must be present with the expected type and value.
    expect(extracted).toHaveProperty("company_mission");
    expect(typeof extracted.company_mission).toBe("string");
    expect(extracted).toHaveProperty("supports_sso");
    expect(extracted.supports_sso).toBe(true);
    expect(typeof extracted.supports_sso).toBe("boolean");
    expect(extracted).toHaveProperty("is_open_source");
    expect(extracted.is_open_source).toBe(false);
    expect(typeof extracted.is_open_source).toBe("boolean");
  }, 60000); // 60 secs
});
2024-04-28 15:52:09 -07:00
2024-04-30 10:22:09 -07:00
// describe("POST /v0/scrape for Top 100 Companies", () => {
2024-06-06 15:36:20 -03:00
// it.concurrent("should extract data for the top 100 companies", async () => {
2024-04-30 10:22:09 -07:00
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://companiesmarketcap.com/",
// pageOptions: {
// onlyMainContent: true
// },
// extractorOptions: {
// mode: "llm-extraction",
// extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
// extractionSchema: {
// type: "object",
// properties: {
// companies: {
// type: "array",
// items: {
// type: "object",
// properties: {
// rank: { type: "number" },
// name: { type: "string" },
// marketCap: { type: "string" },
// price: { type: "string" },
// todayChange: { type: "string" }
// },
// required: ["rank", "name", "marketCap", "price", "todayChange"]
// }
// }
// },
// required: ["companies"]
// }
// }
// });
// // Print the response body to the console for debugging purposes
// console.log("Response companies:", response.body.data.llm_extraction.companies);
// // Check if the response has the correct structure and data types
// expect(response.status).toBe(200);
// expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
// expect(response.body.data.llm_extraction.companies.length).toBe(40);
// // Sample check for the first company
// const firstCompany = response.body.data.llm_extraction.companies[0];
// expect(firstCompany).toHaveProperty("name");
// expect(typeof firstCompany.name).toBe("string");
// expect(firstCompany).toHaveProperty("marketCap");
// expect(typeof firstCompany.marketCap).toBe("string");
// expect(firstCompany).toHaveProperty("price");
// expect(typeof firstCompany.price).toBe("string");
// expect(firstCompany).toHaveProperty("todayChange");
// expect(typeof firstCompany.todayChange).toBe("string");
// }, 120000); // 120 secs
// });
2024-04-30 09:20:15 -07:00
2024-05-14 12:28:25 -07:00
describe ( "POST /v0/crawl with fast mode" , ( ) = > {
2024-06-06 15:36:20 -03:00
// Starts a crawl of https://flutterbricks.com with crawlerOptions.mode "fast",
// polls the status endpoint once per second until the job reports
// "completed", then checks the result shape and that between 10 and 15
// documents were returned. No elapsed time is measured inside the test; the
// 20000 ms timeout passed as the last argument is the only duration limit.
it.concurrent("should complete the crawl under 20 seconds", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://flutterbricks.com",
      crawlerOptions: {
        mode: "fast",
      },
    });
  expect(crawlResponse.statusCode).toBe(200);

  const jobId = crawlResponse.body.jobId;
  let statusResponse;
  let isFinished = false;

  while (!isFinished) {
    statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(statusResponse.statusCode).toBe(200);

    isFinished = statusResponse.body.status === "completed";
    if (!isFinished) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  expect(statusResponse.body.status).toBe("completed");
  expect(statusResponse.body).toHaveProperty("data");
  expect(statusResponse.body.data[0]).toHaveProperty("content");
  expect(statusResponse.body.data[0]).toHaveProperty("markdown");

  const results = statusResponse.body.data;
  expect(results.length).toBeGreaterThanOrEqual(10);
  expect(results.length).toBeLessThanOrEqual(15);
}, 20000);
2024-06-06 15:36:20 -03:00
// it.concurrent("should complete the crawl in more than 10 seconds", async () => {
2024-05-14 12:28:25 -07:00
// const startTime = Date.now();
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://flutterbricks.com",
// });
// expect(crawlResponse.statusCode).toBe(200);
// const jobId = crawlResponse.body.jobId;
// let statusResponse;
// let isFinished = false;
// while (!isFinished) {
// statusResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusResponse.statusCode).toBe(200);
// isFinished = statusResponse.body.status === "completed";
// if (!isFinished) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const endTime = Date.now();
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
// console.log(`Time elapsed: ${timeElapsed} seconds`);
// expect(statusResponse.body.status).toBe("completed");
// expect(statusResponse.body).toHaveProperty("data");
// expect(statusResponse.body.data[0]).toHaveProperty("content");
// expect(statusResponse.body.data[0]).toHaveProperty("markdown");
// const results = statusResponse.body.data;
// // results.forEach((result, i) => {
// // console.log(result.metadata.sourceURL);
// // });
// expect(results.length).toBeGreaterThanOrEqual(10);
// expect(results.length).toBeLessThanOrEqual(15);
// }, 50000);// 15 seconds timeout to account for network delays
} ) ;
2024-04-28 17:38:20 -07:00
// Checks that GET /is-production answers 200 with an `isProduction` field.
describe("GET /is-production", () => {
  it.concurrent("should return the production status", async () => {
    const productionStatus = await request(TEST_URL).get("/is-production");

    expect(productionStatus.statusCode).toBe(200);
    expect(productionStatus.body).toHaveProperty("isProduction");
  });
});
2024-05-17 15:37:47 -03:00
// Sends scrape requests with the preview token: the first five are expected
// to succeed with 200 and the sixth is expected to be rejected with 429.
describe("Rate Limiter", () => {
  it.concurrent("should return 429 when rate limit is exceeded for preview token", async () => {
    // Builds one scrape request authenticated with the preview token.
    const scrapeWithPreviewToken = () =>
      request(TEST_URL)
        .post("/v0/scrape")
        .set("Authorization", `Bearer this_is_just_a_preview_token`)
        .set("Content-Type", "application/json")
        .send({ url: "https://www.scrapethissite.com" });

    // Requests are issued one after another, each asserted before the next.
    for (let attempt = 0; attempt < 5; attempt += 1) {
      const allowed = await scrapeWithPreviewToken();
      expect(allowed.statusCode).toBe(200);
    }

    const rejected = await scrapeWithPreviewToken();
    expect(rejected.statusCode).toBe(429);
  }, 90000);
});
2024-06-06 15:36:20 -03:00
// it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
2024-05-19 12:50:06 -07:00
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
2024-05-17 15:37:47 -03:00
2024-05-19 12:50:06 -07:00
// expect(response.statusCode).toBe(200);
// }
2024-05-17 15:37:47 -03:00
2024-05-19 12:50:06 -07:00
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
2024-05-17 15:37:47 -03:00
2024-05-19 12:50:06 -07:00
// expect(response.statusCode).toBe(429);
// }, 60000);
2024-05-17 15:37:47 -03:00
2024-06-06 15:36:20 -03:00
// it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
2024-05-19 12:50:06 -07:00
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) {
// const response = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
2024-05-17 15:37:47 -03:00
2024-05-19 12:50:06 -07:00
// expect(response.statusCode).toBe(200);
// }
2024-05-17 15:37:47 -03:00
2024-05-19 12:50:06 -07:00
// const response = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
2024-05-17 15:37:47 -03:00
2024-05-19 12:50:06 -07:00
// expect(response.statusCode).toBe(429);
// }, 60000);
2024-04-28 17:38:20 -07:00
} ) ;