
Make PyPDF2 an optional dependency and improve import handling in PDF processor. Move imports inside methods to allow for lazy loading and better error handling. Add new 'pdf' optional dependency group in pyproject.toml. Clean up unused imports and remove deprecated files. BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to install with 'pip install crawl4ai[pdf]' to use PDF processing features.
34 lines
997 B
Python
34 lines
997 B
Python
import asyncio
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
CacheMode,
|
|
DefaultMarkdownGenerator,
|
|
PruningContentFilter,
|
|
CrawlResult
|
|
)
|
|
|
|
|
|
async def main():
    """Crawl a single page and print the start of its raw markdown.

    Starts a headless, verbose browser, crawls the target URL with the
    cache bypassed, and prints the first 500 characters of the raw
    markdown produced by the default markdown generator configured with
    a pruning content filter. If the crawl is reported as unsuccessful,
    the error message is printed instead and nothing else is done.
    """
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Fixed-threshold pruning; min_word_threshold=0 applies no minimum
    # word count to the filter.
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48, threshold_type="fixed", min_word_threshold=0
            )
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Alternative target for quick experiments:
        #   url="https://www.helloworld.org"
        result: CrawlResult = await crawler.arun(
            url="https://www.kidocode.com", config=crawler_config
        )

        # A failed crawl carries no markdown; report the error rather than
        # failing with an AttributeError on a missing markdown result.
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
            return

        print(result.markdown_v2.raw_markdown[:500])
        # For the full result payload while debugging:
        #   print(result.model_dump())
|
|
|
|
|
|
# Script entry point: run the example crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|