# crawl4ai/docs/examples/quickstart_examples_set_1.py
import asyncio
import os
import json
import base64
from pathlib import Path
from typing import List

from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig
from crawl4ai import PruningContentFilter, BM25ContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
from crawl4ai import BrowserConfig

__cur_dir__ = Path(__file__).parent
# Several demos below write their output into this directory; create it up
# front so the file writes cannot fail on a missing path.
(__cur_dir__ / "tmp").mkdir(parents=True, exist_ok=True)


async def demo_basic_crawl():
    """Basic web crawling with markdown generation"""
    print("\n=== 1. Basic Web Crawling ===")
    async with AsyncWebCrawler(
        config=BrowserConfig(
            viewport_height=800,
            viewport_width=1200,
            headless=True,
            verbose=True,
        )
    ) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com/"
        )

        for i, result in enumerate(results):
            print(f"Result {i + 1}:")
            print(f"Success: {result.success}")
            if result.success:
                print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
                print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
            else:
                print("Failed to crawl the URL")


async def demo_parallel_crawl():
    """Crawl multiple URLs in parallel"""
    print("\n=== 2. Parallel Crawling ===")
    urls = [
        "https://news.ycombinator.com/",
        "https://example.com/",
        "https://httpbin.org/html",
    ]
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun_many(
            urls=urls,
        )

        print(f"Crawled {len(results)} URLs in parallel:")
        for i, result in enumerate(results):
            print(
                f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
            )


async def demo_fit_markdown():
    """Generate focused markdown with a pruning content filter"""
    print("\n=== 3. Fit Markdown with Content Filter ===")
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter()
                )
            ),
        )

        # Print stats for the raw and fit markdown
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(result.markdown.fit_markdown)} chars")


async def demo_llm_structured_extraction_no_schema():
    """Extract structured data with an LLM, without a pre-generated schema file"""
    print("\n=== 4. LLM-Based Structured Extraction ===")
    # Create a simple LLM extraction strategy (no schema required)
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        extract_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True,
    )
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://news.ycombinator.com/", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors"""
    print("\n=== 5. CSS-Based Structured Extraction ===")
    # Sample HTML for schema generation (one-time cost)
    sample_html = """
    <div class="body-post clear">
        <a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
            <div class="clear home-post-box cf">
                <div class="home-img clear">
                    <div class="img-ratio">
                        <img alt="..." src="...">
                    </div>
                </div>
                <div class="clear home-right">
                    <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
                    <div class="item-label">
                        <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
                        <span class="h-tags">Malware / Supply Chain Attack</span>
                    </div>
                    <div class="home-desc"> Cybersecurity researchers have...</div>
                </div>
            </div>
        </a>
    </div>
    """

    # Check if the schema file already exists
    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
    if os.path.exists(schema_file_path):
        with open(schema_file_path, "r") as f:
            schema = json.load(f)
    else:
        # Generate schema using LLM (one-time setup)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            llm_config=LLMConfig(
                provider="groq/qwen-2.5-32b",
                api_token="env:GROQ_API_KEY",
            ),
            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
        )

        print(f"Generated schema: {json.dumps(schema, indent=2)}")
        # Save the schema to a file and reuse it for future extractions,
        # so the LLM only needs to be called once.
        with open(schema_file_path, "w") as f:
            json.dump(schema, f, indent=2)

    # Create a no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    # Use the fast CSS extraction (no LLM calls during extraction)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://thehackernews.com", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_deep_crawl():
    """Deep crawling with BFS strategy"""
    print("\n=== 6. Deep Crawling ===")
    filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])

    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=5, filter_chain=filter_chain
    )

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
        )

        print(f"Deep crawl returned {len(results)} pages:")
        for i, result in enumerate(results):
            depth = result.metadata.get("depth", "unknown")
            print(f" {i + 1}. {result.url} (Depth: {depth})")


async def demo_js_interaction():
    """Execute JavaScript to load more content"""
    print("\n=== 7. JavaScript Interaction ===")

    # Hacker News: click the "More" link to reveal the next page of results
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        # Initial load
        news_schema = {
            "name": "news",
            "baseSelector": "tr.athing",
            "fields": [
                {
                    "name": "title",
                    "selector": "span.titleline",
                    "type": "text",
                }
            ],
        }
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com",
            config=CrawlerRunConfig(
                session_id="hn_session",  # Keep session
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )

        news = []
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Initial items: {len(news)}")

        # Click "More" link
        more_config = CrawlerRunConfig(
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,  # Continue in same page
            session_id="hn_session",  # Keep session
            extraction_strategy=JsonCssExtractionStrategy(
                schema=news_schema,
            ),
        )

        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com", config=more_config
        )

        # Extract the newly loaded items
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Total items: {len(news)}")


async def demo_media_and_links():
    """Extract media and links from a page"""
    print("\n=== 8. Media and Links Extraction ===")
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://en.wikipedia.org/wiki/Main_Page"
        )

        for result in results:
            # Extract and save all images
            images = result.media.get("images", [])
            print(f"Found {len(images)} images")

            # Extract and save all links (internal and external)
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

            # Print some of the images and links
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal link: {link['href']}")
            for link in external_links[:3]:
                print(f"External link: {link['href']}")

            # Save everything to files
            with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
                json.dump(images, f, indent=2)
            with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
                json.dump(
                    {"internal": internal_links, "external": external_links},
                    f,
                    indent=2,
                )


async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page"""
    print("\n=== 9. Screenshot and PDF Capture ===")
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://example.com",
            url="https://en.wikipedia.org/wiki/Giant_anteater",
            config=CrawlerRunConfig(screenshot=True, pdf=True),
        )

        for result in results:
            if result.screenshot:
                # Save screenshot (returned as base64-encoded PNG data)
                screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
                with open(screenshot_path, "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")

            if result.pdf:
                # Save PDF (returned as raw bytes)
                pdf_path = f"{__cur_dir__}/tmp/example.pdf"
                with open(pdf_path, "wb") as f:
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")


async def demo_proxy_rotation():
    """Proxy rotation for multiple requests"""
    print("\n=== 10. Proxy Rotation ===")
    # Example proxies (replace with real ones)
    proxies = [
        ProxyConfig(server="http://proxy1.example.com:8080"),
        ProxyConfig(server="http://proxy2.example.com:8080"),
    ]
    proxy_strategy = RoundRobinProxyStrategy(proxies)
    print(f"Using {len(proxies)} proxies in rotation")
    print(
        "Note: This example uses placeholder proxies - replace with real ones to test"
    )

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            proxy_rotation_strategy=proxy_strategy
        )

        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")


async def demo_raw_html_and_file():
    """Process raw HTML and local files"""
    print("\n=== 11. Raw HTML and Local Files ===")
    raw_html = """
    <html><body>
        <h1>Sample Article</h1>
        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
    </body></html>
    """

    # Save to a temporary file next to this script, so the demo does not
    # depend on the current working directory
    file_path = (__cur_dir__ / "tmp" / "sample.html").absolute()
    with open(file_path, "w") as f:
        f.write(raw_html)

    async with AsyncWebCrawler() as crawler:
        # Crawl raw HTML
        raw_result = await crawler.arun(
            url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        print("Raw HTML processing:")
        print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")

        # Crawl local file
        file_result = await crawler.arun(
            url=f"file://{file_path}",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("\nLocal file processing:")
        print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")

    # Clean up
    os.remove(file_path)
    print(f"Processed both raw HTML and local file ({file_path})")


async def main():
    """Run all demo functions sequentially"""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")

    # Run all demos
    await demo_basic_crawl()
    await demo_parallel_crawl()
    await demo_fit_markdown()
    await demo_llm_structured_extraction_no_schema()
    await demo_css_structured_extraction_no_schema()
    await demo_deep_crawl()
    await demo_js_interaction()
    await demo_media_and_links()
    await demo_screenshot_and_pdf()
    # await demo_proxy_rotation()  # Requires real proxy servers
    await demo_raw_html_and_file()

    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs, JSON) in the tmp/ directory next to this script")


if __name__ == "__main__":
    asyncio.run(main())