mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-12 23:51:27 +00:00
feat(sdk/rust/crawl): paginate through results
This commit is contained in:
parent
a078cdbd9d
commit
3ec0bbe28d
@ -251,6 +251,18 @@ impl FirecrawlApp {
|
|||||||
self.monitor_job_status(&res.id, poll_interval).await
|
self.monitor_job_status(&res.id, poll_interval).await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn check_crawl_status_next(&self, next: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.get(next.as_ref())
|
||||||
|
.headers(self.prepare_headers(None))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpError(format!("Paginating crawl using URL {:?}", next.as_ref()), e))?;
|
||||||
|
|
||||||
|
self.handle_response(response, format!("Paginating crawl using URL {:?}", next.as_ref())).await
|
||||||
|
}
|
||||||
|
|
||||||
/// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
|
/// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
|
||||||
pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
|
pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
|
||||||
let response = self
|
let response = self
|
||||||
@ -272,26 +284,40 @@ impl FirecrawlApp {
|
|||||||
id: &str,
|
id: &str,
|
||||||
poll_interval: u64,
|
poll_interval: u64,
|
||||||
) -> Result<CrawlStatus, FirecrawlError> {
|
) -> Result<CrawlStatus, FirecrawlError> {
|
||||||
loop {
|
let result = loop {
|
||||||
let status_data = self.check_crawl_status(id).await?;
|
let status_data = self.check_crawl_status(id).await?;
|
||||||
match status_data.status {
|
match status_data.status {
|
||||||
CrawlStatusTypes::Completed => {
|
CrawlStatusTypes::Completed => {
|
||||||
return Ok(status_data);
|
break Ok(status_data);
|
||||||
}
|
}
|
||||||
CrawlStatusTypes::Scraping => {
|
CrawlStatusTypes::Scraping => {
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
|
tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
|
||||||
}
|
}
|
||||||
CrawlStatusTypes::Failed => {
|
CrawlStatusTypes::Failed => {
|
||||||
return Err(FirecrawlError::CrawlJobFailed(format!(
|
break Err(FirecrawlError::CrawlJobFailed(format!(
|
||||||
"Crawl job failed."
|
"Crawl job failed."
|
||||||
), status_data));
|
), status_data));
|
||||||
}
|
}
|
||||||
CrawlStatusTypes::Cancelled => {
|
CrawlStatusTypes::Cancelled => {
|
||||||
return Err(FirecrawlError::CrawlJobFailed(format!(
|
break Err(FirecrawlError::CrawlJobFailed(format!(
|
||||||
"Crawl job was cancelled."
|
"Crawl job was cancelled."
|
||||||
), status_data));
|
), status_data));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok(mut status) => {
|
||||||
|
// Paginate through results
|
||||||
|
while let Some(next) = status.next {
|
||||||
|
let new_status = self.check_crawl_status_next(next).await?;
|
||||||
|
status.data.extend_from_slice(&new_status.data);
|
||||||
|
status.next = new_status.next;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(status)
|
||||||
|
},
|
||||||
|
Err(_) => result,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user