feat(python-sdk): add parsePDF parameter support (#1713)

* feat(python-sdk): add parsePDF parameter support

- Add parsePDF field to ScrapeOptions class for Search API usage
- Add parse_pdf parameter to both sync and async scrape_url methods
- Add parameter handling logic to pass parsePDF to API requests
- Add comprehensive tests for parsePDF functionality
- Maintain backward compatibility with existing API

The parsePDF parameter controls PDF processing behavior:
- When true (default): PDF content extracted and converted to markdown
- When false: PDF returned in base64 encoding with flat credit rate

Resolves missing parsePDF support in Python SDK v2.9.0

Co-Authored-By: Micah Stairs <micah@sideguide.dev>

* Update __init__.py

* Update test.py

* Update __init__.py

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Micah Stairs <micah@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
devin-ai-integration[bot] 2025-06-26 16:34:43 +00:00 committed by GitHub
parent 89e57ace3c
commit 1919799bed
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 33 additions and 1 deletions

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
__version__ = "2.9.0"
__version__ = "2.10.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
app.search("test query", {"invalid_param": "value"})
assert "ValidationError" in str(e.value)
# def test_scrape_url_with_parse_pdf_true():
# if TEST_API_KEY:
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
# assert response is not None
# assert 'markdown' in response
# assert len(response['markdown']) > 100
# def test_scrape_url_with_parse_pdf_false():
# if TEST_API_KEY:
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
# assert response is not None
# assert 'markdown' in response
# assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
# def test_scrape_options_with_parse_pdf():
# if TEST_API_KEY:
# from firecrawl.firecrawl import ScrapeOptions
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
# response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
# assert response is not None
# assert 'data' in response

View File

@@ -160,6 +160,7 @@ class ScrapeOptions(pydantic.BaseModel):
changeTrackingOptions: Optional[ChangeTrackingOptions] = None
maxAge: Optional[int] = None
storeInCache: Optional[bool] = None
parsePDF: Optional[bool] = None
class WaitAction(pydantic.BaseModel):
"""Wait action to perform during scraping."""
@@ -465,6 +466,7 @@ class FirecrawlApp:
remove_base64_images: Optional[bool] = None,
block_ads: Optional[bool] = None,
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
parse_pdf: Optional[bool] = None,
extract: Optional[JsonConfig] = None,
json_options: Optional[JsonConfig] = None,
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -538,6 +540,8 @@ class FirecrawlApp:
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if parse_pdf is not None:
scrape_params['parsePDF'] = parse_pdf
if extract is not None:
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
@@ -2904,6 +2908,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
remove_base64_images: Optional[bool] = None,
block_ads: Optional[bool] = None,
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
parse_pdf: Optional[bool] = None,
extract: Optional[JsonConfig] = None,
json_options: Optional[JsonConfig] = None,
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -2981,6 +2986,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if parse_pdf is not None:
scrape_params['parsePDF'] = parse_pdf
if extract is not None:
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract: