2024-09-19 22:22:57 +02:00
use dotenvy ::dotenv ;
2024-09-20 19:36:07 +02:00
use firecrawl ::scrape ::{ ExtractOptions , ScrapeFormats , ScrapeOptions } ;
2024-12-17 15:11:23 -08:00
use firecrawl ::{ FirecrawlApp , FirecrawlError } ;
2024-07-12 13:59:04 +02:00
use serde_json ::json ;
use std ::env ;
2024-12-20 18:09:49 -03:00
// #[tokio::test]
// async fn test_blocklisted_url() {
// dotenv().ok();
// let api_url = env::var("API_URL").unwrap();
// let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://facebook.com/fake-test";
// let result = app.scrape_url(blocklisted_url, None).await;
2024-07-12 14:41:53 +02:00
2024-12-20 18:09:49 -03:00
// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions")
// );
// }
2024-07-12 13:59:04 +02:00
#[ tokio::test ]
async fn test_successful_response_with_valid_preview_token ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2025-04-18 07:59:59 +03:00
let app =
FirecrawlApp ::new_selfhosted ( api_url , Some ( env ::var ( " PREVIEW_TOKEN " ) . unwrap ( ) ) ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://roastmywebsite.ai " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . markdown . unwrap ( ) . contains ( " _Roast_ " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_scrape_url_e2e ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://roastmywebsite.ai " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . markdown . unwrap ( ) . contains ( " _Roast_ " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_with_valid_api_key_and_include_html ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
let params = ScrapeOptions {
2025-04-18 07:59:59 +03:00
formats : vec ! [ ScrapeFormats ::Markdown , ScrapeFormats ::HTML ] . into ( ) ,
2024-09-20 19:36:07 +02:00
.. Default ::default ( )
} ;
2024-07-12 14:54:38 +02:00
let result = app
2024-09-20 19:36:07 +02:00
. scrape_url ( " https://roastmywebsite.ai " , params )
2024-07-12 14:54:38 +02:00
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . html . is_some ( ) ) ;
assert! ( result . markdown . unwrap ( ) . contains ( " _Roast_ " ) ) ;
assert! ( result . html . unwrap ( ) . contains ( " <h1 " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_for_valid_scrape_with_pdf_file ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://arxiv.org/pdf/astro-ph/9301001.pdf " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
2025-04-18 07:59:59 +03:00
assert! ( result
. markdown
2024-07-12 14:54:38 +02:00
. unwrap ( )
. contains ( " We present spectrophotometric observations of the Broad Line Radio Galaxy " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://arxiv.org/pdf/astro-ph/9301001 " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
2025-04-18 07:59:59 +03:00
assert! ( result
. markdown
2024-07-12 14:54:38 +02:00
. unwrap ( )
. contains ( " We present spectrophotometric observations of the Broad Line Radio Galaxy " ) ) ;
2024-07-12 13:59:04 +02:00
}
2024-12-20 18:09:49 -03:00
// #[tokio::test]
// async fn test_should_return_error_for_blocklisted_url() {
// dotenv().ok();
// let api_url = env::var("API_URL").unwrap();
// let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://twitter.com/fake-test";
// let result = app.crawl_url(blocklisted_url, None).await;
// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.")
// );
// }
2024-07-12 13:59:04 +02:00
#[ tokio::test ]
async fn test_llm_extraction ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
let options = ScrapeOptions {
formats : vec ! [ ScrapeFormats ::Extract ] . into ( ) ,
extract : ExtractOptions {
prompt : " Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source " . to_string ( ) . into ( ) ,
schema : json ! ( {
2024-07-12 13:59:04 +02:00
" type " : " object " ,
" properties " : {
" company_mission " : { " type " : " string " } ,
" supports_sso " : { " type " : " boolean " } ,
" is_open_source " : { " type " : " boolean " }
} ,
" required " : [ " company_mission " , " supports_sso " , " is_open_source " ]
2024-09-20 19:36:07 +02:00
} ) . into ( ) ,
.. Default ::default ( )
} . into ( ) ,
.. Default ::default ( )
} ;
2024-07-12 14:54:38 +02:00
let result = app
2024-09-20 19:36:07 +02:00
. scrape_url ( " https://mendable.ai " , options )
2024-07-12 14:54:38 +02:00
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . extract . is_some ( ) ) ;
let llm_extraction = & result . extract . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
assert! ( llm_extraction
. as_object ( )
. unwrap ( )
. contains_key ( " company_mission " ) ) ;
2024-07-12 13:59:04 +02:00
assert! ( llm_extraction [ " supports_sso " ] . is_boolean ( ) ) ;
assert! ( llm_extraction [ " is_open_source " ] . is_boolean ( ) ) ;
}
2024-12-17 15:11:23 -08:00
/// Checks when constructing a client requires an API key.
///
/// `API_URL` falls back to `http://localhost:3002` when unset, and `TEST_API_KEY` is
/// optional. Three cases are distinguished by whether the URL contains
/// `api.firecrawl.dev` and whether a key is available:
///
/// * other URL (any key state): construction without a key must succeed;
/// * `api.firecrawl.dev` URL, no key: construction must fail with
///   `FirecrawlError::APIError` whose message equals `"Configuration"`;
/// * `api.firecrawl.dev` URL, key present: construction with the key must succeed.
#[test]
fn test_api_key_requirements() {
    dotenv().ok();

    // Build the fallback lazily so the default string is only allocated when
    // `API_URL` is actually missing.
    let api_url = env::var("API_URL").unwrap_or_else(|_| "http://localhost:3002".to_string());
    let api_key = env::var("TEST_API_KEY").ok();

    match (api_url.contains("api.firecrawl.dev"), api_key) {
        (false, _) => {
            if let Err(err) = FirecrawlApp::new_selfhosted(&api_url, None::<String>) {
                panic!("Local setup failed: {:?}", err);
            }
        }
        (true, None) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
            assert!(matches!(
                result,
                Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
            ));
        }
        (true, Some(key)) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
            assert!(result.is_ok());
        }
    }
}