mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-26 14:44:36 +00:00
feat(python-sdk): add parsePDF parameter support (#1713)
* feat(python-sdk): add parsePDF parameter support - Add parsePDF field to ScrapeOptions class for Search API usage - Add parse_pdf parameter to both sync and async scrape_url methods - Add parameter handling logic to pass parsePDF to API requests - Add comprehensive tests for parsePDF functionality - Maintain backward compatibility with existing API The parsePDF parameter controls PDF processing behavior: - When true (default): PDF content extracted and converted to markdown - When false: PDF returned in base64 encoding with flat credit rate Resolves missing parsePDF support in Python SDK v2.9.0 Co-Authored-By: Micah Stairs <micah@sideguide.dev> * Update __init__.py * Update test.py * Update __init__.py --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Micah Stairs <micah@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
89e57ace3c
commit
1919799bed
@@ -13,7 +13,7 @@ import os
|
||||
|
||||
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
||||
|
||||
__version__ = "2.9.0"
|
||||
__version__ = "2.10.0"
|
||||
|
||||
# Define the logger for the Firecrawl project
|
||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
|
||||
app.search("test query", {"invalid_param": "value"})
|
||||
assert "ValidationError" in str(e.value)
|
||||
|
||||
# def test_scrape_url_with_parse_pdf_true():
|
||||
# if TEST_API_KEY:
|
||||
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
# response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
|
||||
# assert response is not None
|
||||
# assert 'markdown' in response
|
||||
# assert len(response['markdown']) > 100
|
||||
|
||||
# def test_scrape_url_with_parse_pdf_false():
|
||||
# if TEST_API_KEY:
|
||||
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
# response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
|
||||
# assert response is not None
|
||||
# assert 'markdown' in response
|
||||
# assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
|
||||
|
||||
# def test_scrape_options_with_parse_pdf():
|
||||
# if TEST_API_KEY:
|
||||
# from firecrawl.firecrawl import ScrapeOptions
|
||||
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
# scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
|
||||
# response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
|
||||
# assert response is not None
|
||||
# assert 'data' in response
|
||||
|
||||
|
||||
|
||||
@@ -160,6 +160,7 @@ class ScrapeOptions(pydantic.BaseModel):
|
||||
changeTrackingOptions: Optional[ChangeTrackingOptions] = None
|
||||
maxAge: Optional[int] = None
|
||||
storeInCache: Optional[bool] = None
|
||||
parsePDF: Optional[bool] = None
|
||||
|
||||
class WaitAction(pydantic.BaseModel):
|
||||
"""Wait action to perform during scraping."""
|
||||
@@ -465,6 +466,7 @@ class FirecrawlApp:
|
||||
remove_base64_images: Optional[bool] = None,
|
||||
block_ads: Optional[bool] = None,
|
||||
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
||||
parse_pdf: Optional[bool] = None,
|
||||
extract: Optional[JsonConfig] = None,
|
||||
json_options: Optional[JsonConfig] = None,
|
||||
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
||||
@@ -538,6 +540,8 @@ class FirecrawlApp:
|
||||
scrape_params['blockAds'] = block_ads
|
||||
if proxy:
|
||||
scrape_params['proxy'] = proxy
|
||||
if parse_pdf is not None:
|
||||
scrape_params['parsePDF'] = parse_pdf
|
||||
if extract is not None:
|
||||
extract = self._ensure_schema_dict(extract)
|
||||
if isinstance(extract, dict) and "schema" in extract:
|
||||
@@ -2904,6 +2908,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
remove_base64_images: Optional[bool] = None,
|
||||
block_ads: Optional[bool] = None,
|
||||
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
||||
parse_pdf: Optional[bool] = None,
|
||||
extract: Optional[JsonConfig] = None,
|
||||
json_options: Optional[JsonConfig] = None,
|
||||
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
||||
@@ -2981,6 +2986,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
scrape_params['blockAds'] = block_ads
|
||||
if proxy:
|
||||
scrape_params['proxy'] = proxy
|
||||
if parse_pdf is not None:
|
||||
scrape_params['parsePDF'] = parse_pdf
|
||||
if extract is not None:
|
||||
extract = self._ensure_schema_dict(extract)
|
||||
if isinstance(extract, dict) and "schema" in extract:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user