
Adds a new BrowserProfiler class that provides comprehensive management of browser profiles for identity-based crawling.

Features include:
- Interactive profile creation and management
- Profile listing, retrieval, and deletion
- Guided console interface
- Migration of profile management from ManagedBrowser
- New example script for identity-based browsing

Also:
- Updates logging format in AsyncWebCrawler
- Removes content filter from hello_world example
- Relaxes httpx version constraint

BREAKING CHANGE: Profile management methods on ManagedBrowser are now deprecated and delegate to BrowserProfiler.
34 lines
1000 B
Python
34 lines
1000 B
Python
import asyncio
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
CacheMode,
|
|
DefaultMarkdownGenerator,
|
|
PruningContentFilter,
|
|
CrawlResult
|
|
)
|
|
|
|
|
|
async def main():
    """Crawl one page and print the first 500 characters of its raw markdown.

    The browser is started headless with verbose logging, and the crawl is
    run with the cache bypassed. If the crawl reports failure (or produced
    no markdown), the error message is printed instead of raising an
    ``AttributeError`` on the missing markdown.
    """
    browser_config = BrowserConfig(headless=True, verbose=True)

    # The run configuration does not depend on the crawler instance, so it is
    # built up front rather than inside the ``async with`` body.
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            # Content filtering is intentionally disabled in this example.
            # To re-enable it, pass:
            # content_filter=PruningContentFilter(
            #     threshold=0.48, threshold_type="fixed", min_word_threshold=0
            # )
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://www.kidocode.com", config=crawler_config
        )

        # A failed crawl carries no markdown; report the error rather than
        # crashing on ``None.raw_markdown``.
        if not result.success or result.markdown is None:
            print(f"Crawl failed: {result.error_message}")
            return

        print(result.markdown.raw_markdown[:500])
        # Uncomment to inspect every field of the result:
        # print(result.model_dump())
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the example only when executed as a script, not when imported.
    asyncio.run(main())
|