2024-07-12 14:54:38 +02:00
use assert_matches ::assert_matches ;
2024-09-19 22:22:57 +02:00
use dotenvy ::dotenv ;
2024-09-20 19:36:07 +02:00
use firecrawl ::scrape ::{ ExtractOptions , ScrapeFormats , ScrapeOptions } ;
2024-08-15 10:11:27 -03:00
use firecrawl ::FirecrawlApp ;
2024-07-12 13:59:04 +02:00
use serde_json ::json ;
use std ::env ;
#[ tokio::test ]
async fn test_blocklisted_url ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 13:59:04 +02:00
let blocklisted_url = " https://facebook.com/fake-test " ;
let result = app . scrape_url ( blocklisted_url , None ) . await ;
2024-07-12 14:41:53 +02:00
assert_matches! (
result ,
Err ( e ) if e . to_string ( ) . contains ( " Firecrawl currently does not support social media scraping due to policy restrictions " )
) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_with_valid_preview_token ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let app = FirecrawlApp ::new_selfhosted (
api_url ,
Some ( " this_is_just_a_preview_token " ) ,
2024-07-12 14:54:38 +02:00
)
. unwrap ( ) ;
let result = app
. scrape_url ( " https://roastmywebsite.ai " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . markdown . unwrap ( ) . contains ( " _Roast_ " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_scrape_url_e2e ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://roastmywebsite.ai " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . markdown . unwrap ( ) . contains ( " _Roast_ " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_with_valid_api_key_and_include_html ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
let params = ScrapeOptions {
formats : vec ! [ ScrapeFormats ::Markdown , ScrapeFormats ::HTML ] . into ( ) ,
.. Default ::default ( )
} ;
2024-07-12 14:54:38 +02:00
let result = app
2024-09-20 19:36:07 +02:00
. scrape_url ( " https://roastmywebsite.ai " , params )
2024-07-12 14:54:38 +02:00
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . html . is_some ( ) ) ;
assert! ( result . markdown . unwrap ( ) . contains ( " _Roast_ " ) ) ;
assert! ( result . html . unwrap ( ) . contains ( " <h1 " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_for_valid_scrape_with_pdf_file ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://arxiv.org/pdf/astro-ph/9301001.pdf " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . markdown
2024-07-12 14:54:38 +02:00
. unwrap ( )
. contains ( " We present spectrophotometric observations of the Broad Line Radio Galaxy " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
let result = app
. scrape_url ( " https://arxiv.org/pdf/astro-ph/9301001 " , None )
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . markdown . is_some ( ) ) ;
assert! ( result . markdown
2024-07-12 14:54:38 +02:00
. unwrap ( )
. contains ( " We present spectrophotometric observations of the Broad Line Radio Galaxy " ) ) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_should_return_error_for_blocklisted_url ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
2024-07-12 13:59:04 +02:00
let blocklisted_url = " https://twitter.com/fake-test " ;
2024-09-20 19:36:07 +02:00
let result = app . crawl_url ( blocklisted_url , None ) . await ;
2024-07-12 13:59:04 +02:00
2024-07-12 14:41:53 +02:00
assert_matches! (
result ,
2024-09-23 23:03:00 +02:00
Err ( e ) if e . to_string ( ) . contains ( " Firecrawl currently does not support social media scraping due to policy restrictions. " )
2024-07-12 14:41:53 +02:00
) ;
2024-07-12 13:59:04 +02:00
}
#[ tokio::test ]
async fn test_llm_extraction ( ) {
dotenv ( ) . ok ( ) ;
let api_url = env ::var ( " API_URL " ) . unwrap ( ) ;
2024-09-20 19:36:07 +02:00
let api_key = env ::var ( " TEST_API_KEY " ) . ok ( ) ;
let app = FirecrawlApp ::new_selfhosted ( api_url , api_key ) . unwrap ( ) ;
let options = ScrapeOptions {
formats : vec ! [ ScrapeFormats ::Extract ] . into ( ) ,
extract : ExtractOptions {
prompt : " Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source " . to_string ( ) . into ( ) ,
schema : json ! ( {
2024-07-12 13:59:04 +02:00
" type " : " object " ,
" properties " : {
" company_mission " : { " type " : " string " } ,
" supports_sso " : { " type " : " boolean " } ,
" is_open_source " : { " type " : " boolean " }
} ,
" required " : [ " company_mission " , " supports_sso " , " is_open_source " ]
2024-09-20 19:36:07 +02:00
} ) . into ( ) ,
.. Default ::default ( )
} . into ( ) ,
.. Default ::default ( )
} ;
2024-07-12 14:54:38 +02:00
let result = app
2024-09-20 19:36:07 +02:00
. scrape_url ( " https://mendable.ai " , options )
2024-07-12 14:54:38 +02:00
. await
. unwrap ( ) ;
2024-09-20 19:36:07 +02:00
assert! ( result . extract . is_some ( ) ) ;
let llm_extraction = & result . extract . unwrap ( ) ;
2024-07-12 14:54:38 +02:00
assert! ( llm_extraction
. as_object ( )
. unwrap ( )
. contains_key ( " company_mission " ) ) ;
2024-07-12 13:59:04 +02:00
assert! ( llm_extraction [ " supports_sso " ] . is_boolean ( ) ) ;
assert! ( llm_extraction [ " is_open_source " ] . is_boolean ( ) ) ;
}