2025-01-01 19:39:42 +08:00
|
|
|
import asyncio
|
2025-02-23 21:23:41 +08:00
|
|
|
from crawl4ai import (
|
|
|
|
AsyncWebCrawler,
|
|
|
|
BrowserConfig,
|
|
|
|
CrawlerRunConfig,
|
|
|
|
CacheMode,
|
|
|
|
DefaultMarkdownGenerator,
|
|
|
|
PruningContentFilter,
|
2025-02-25 22:27:55 +08:00
|
|
|
CrawlResult
|
2025-02-23 21:23:41 +08:00
|
|
|
)
|
2025-01-01 19:39:42 +08:00
|
|
|
|
2025-03-21 22:50:00 +08:00
|
|
|
async def example_cdp():
    """Run a small JavaScript snippet through a crawler configured with a CDP URL.

    Builds a ``BrowserConfig`` with ``cdp_url="http://localhost:9223"`` and a
    ``CrawlerRunConfig`` bound to session ``"test"``, performs one ``arun``
    call against https://www.helloworld.org and prints the resulting
    ``js_execution_result``.

    NOTE(review): presumably a browser must already be listening on port 9223,
    and ``js_only=True`` presumably skips a fresh navigation in favour of
    running the script in the existing session -- confirm against the
    crawl4ai documentation.
    """
    cdp_browser = BrowserConfig(headless=False, cdp_url="http://localhost:9223")

    # Immediately-invoked arrow function; its return value is presumably what
    # ends up in ``js_execution_result`` -- TODO confirm.
    snippet = """(() => { return {"result": "Hello World!"} })()"""
    run_settings = CrawlerRunConfig(
        session_id="test",
        js_code=snippet,
        js_only=True,
    )

    async with AsyncWebCrawler(config=cdp_browser, verbose=True) as crawler:
        outcome: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org",
            config=run_settings,
        )
        print(outcome.js_execution_result)
|
|
|
|
|
2025-01-13 19:19:58 +08:00
|
|
|
|
2025-01-01 19:39:42 +08:00
|
|
|
async def main():
    """Crawl https://www.helloworld.org and print a preview of its markdown.

    Starts an ``AsyncWebCrawler`` with a non-headless, verbose
    ``BrowserConfig``, runs a single crawl with ``CacheMode.BYPASS`` and a
    markdown generator that uses a ``PruningContentFilter``, then prints the
    first 500 characters of ``result.markdown.raw_markdown``.
    """
    launch_options = BrowserConfig(headless=False, verbose=True)

    async with AsyncWebCrawler(config=launch_options) as crawler:
        # Assemble the run configuration from the inside out so each piece
        # has a name; this happens after the crawler context is entered.
        pruner = PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed",
            min_word_threshold=0,
        )
        md_generator = DefaultMarkdownGenerator(content_filter=pruner)
        run_settings = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=md_generator,
        )

        page: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org",
            config=run_settings,
        )

        # Only show a short preview rather than the full document.
        preview = page.markdown.raw_markdown[:500]
        print(preview)
|
2025-01-13 19:19:58 +08:00
|
|
|
|
2025-01-01 19:39:42 +08:00
|
|
|
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, not on import.
    asyncio.run(main())
|