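"""
Comprehensive Crawl4AI demo script.

Walks through the library's main features in order: basic crawling, parallel
crawling, fit markdown with a content filter, LLM- and CSS-based structured
extraction, deep crawling, JavaScript interaction, media/link extraction,
screenshot and PDF capture, proxy rotation, and raw HTML / local file processing.
"""
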
import asyncio
import os
import json
import base64
from pathlib import Path
from typing import List
from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig
from crawl4ai import PruningContentFilter, BM25ContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
from crawl4ai import BrowserConfig
__cur_dir__ = Path(__file__).parent
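
# Several demos below write output files (schema, images/links JSON, screenshots,
# PDFs) into a tmp/ folder next to this script; create it up front so the
# open(...) calls below do not fail on a fresh checkout.
os.makedirs(__cur_dir__ / "tmp", exist_ok=True)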


async def demo_basic_crawl():
    """Basic web crawling with markdown generation"""
    print("\n=== 1. Basic Web Crawling ===")

    async with AsyncWebCrawler(
        config=BrowserConfig(
            viewport_height=800,
            viewport_width=1200,
            headless=True,
            verbose=True,
        )
    ) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com/"
        )

        for i, result in enumerate(results):
            print(f"Result {i + 1}:")
            print(f"Success: {result.success}")
            if result.success:
                print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
                print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
            else:
                print("Failed to crawl the URL")


async def demo_parallel_crawl():
    """Crawl multiple URLs in parallel"""
    print("\n=== 2. Parallel Crawling ===")

    urls = [
        "https://news.ycombinator.com/",
        "https://example.com/",
        "https://httpbin.org/html",
    ]

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun_many(
            urls=urls,
        )

        print(f"Crawled {len(results)} URLs in parallel:")
        for i, result in enumerate(results):
            print(
                f"{i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
            )


async def demo_fit_markdown():
    """Generate focused markdown with a pruning content filter"""
    print("\n=== 3. Fit Markdown with Pruning Content Filter ===")

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter()
                )
            ),
        )

        # Print the stats: fit markdown is the filtered, focused subset of the raw markdown
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(result.markdown.fit_markdown)} chars")


async def demo_llm_structured_extraction_no_schema():
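    """Extract structured news items with an LLM instead of a predefined CSS schema"""
    print("\n=== 4. LLM-Based Structured Extraction ===")
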
    # Create a simple LLM extraction strategy (no schema required)
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        extract_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True,
    )
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://news.ycombinator.com/", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors"""
    print("\n=== 5. CSS-Based Structured Extraction ===")

    # Sample HTML for schema generation (one-time cost)
    sample_html = """
    <div class="body-post clear">
        <a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
            <div class="clear home-post-box cf">
                <div class="home-img clear">
                    <div class="img-ratio">
                        <img alt="..." src="...">
                    </div>
                </div>
                <div class="clear home-right">
                    <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
                    <div class="item-label">
                        <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
                        <span class="h-tags">Malware / Supply Chain Attack</span>
                    </div>
                    <div class="home-desc">Cybersecurity researchers have ...</div>
                </div>
            </div>
        </a>
    </div>
    """

    # Check whether a previously generated schema file exists
    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
    if os.path.exists(schema_file_path):
        with open(schema_file_path, "r") as f:
            schema = json.load(f)
    else:
        # Generate the schema using an LLM (one-time setup)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            llm_config=LLMConfig(
                provider="groq/qwen-2.5-32b",
                api_token="env:GROQ_API_KEY",
            ),
            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
        )

        print(f"Generated schema: {json.dumps(schema, indent=2)}")
        # Save the schema to a file and reuse it for future extractions,
        # so the LLM only has to be called once
        with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
            json.dump(schema, f, indent=2)

    # Create no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    # Use the fast CSS extraction (no LLM calls during extraction)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://thehackernews.com", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_deep_crawl():
    """Deep crawling with BFS strategy"""
    print("\n=== 6. Deep Crawling ===")

    filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])

    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=5, filter_chain=filter_chain
    )

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
        )

        print(f"Deep crawl returned {len(results)} pages:")
        for i, result in enumerate(results):
            depth = result.metadata.get("depth", "unknown")
            print(f"{i + 1}. {result.url} (Depth: {depth})")


async def demo_js_interaction():
    """Execute JavaScript to load more content"""
    print("\n=== 7. JavaScript Interaction ===")

    # A simple page that needs JS to reveal content
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        # Initial load
        news_schema = {
            "name": "news",
            "baseSelector": "tr.athing",
            "fields": [
                {
                    "name": "title",
                    "selector": "span.titleline",
                    "type": "text",
                }
            ],
        }
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com",
            config=CrawlerRunConfig(
                session_id="hn_session",  # Keep session
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )

        news = []
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Initial items: {len(news)}")

        # Click "More" link
        more_config = CrawlerRunConfig(
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,  # Continue in same page
            session_id="hn_session",  # Keep session
            extraction_strategy=JsonCssExtractionStrategy(
                schema=news_schema,
            ),
        )

        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com", config=more_config
        )

        # Extract the newly loaded items from the second crawl
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Total items: {len(news)}")


async def demo_media_and_links():
    """Extract media and links from a page"""
    print("\n=== 8. Media and Links Extraction ===")

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page")

        for i, result in enumerate(results):
            # Extract all images
            images = result.media.get("images", [])
            print(f"Found {len(images)} images")

            # Extract all links (internal and external)
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

            # Print a few of the images and links
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal link: {link['href']}")
            for link in external_links[:3]:
                print(f"External link: {link['href']}")

            # Save everything to files
            with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
                json.dump(images, f, indent=2)

            with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
                json.dump(
                    {"internal": internal_links, "external": external_links},
                    f,
                    indent=2,
                )


async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page"""
    print("\n=== 9. Screenshot and PDF Capture ===")

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://example.com",
            url="https://en.wikipedia.org/wiki/Giant_anteater",
            config=CrawlerRunConfig(screenshot=True, pdf=True),
        )

        for i, result in enumerate(results):
            if result.screenshot:
                # Save the base64-encoded screenshot as a PNG
                screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
                with open(screenshot_path, "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")

            if result.pdf:
                # Save the PDF bytes
                pdf_path = f"{__cur_dir__}/tmp/example.pdf"
                with open(pdf_path, "wb") as f:
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")


async def demo_proxy_rotation():
    """Proxy rotation for multiple requests"""
    print("\n=== 10. Proxy Rotation ===")

    # Example proxies (replace with real ones)
    proxies = [
        ProxyConfig(server="http://proxy1.example.com:8080"),
        ProxyConfig(server="http://proxy2.example.com:8080"),
    ]
    proxy_strategy = RoundRobinProxyStrategy(proxies)
    print(f"Using {len(proxies)} proxies in rotation")
    print(
        "Note: This example uses placeholder proxies - replace with real ones to test"
    )

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            proxy_rotation_strategy=proxy_strategy
        )

        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")


async def demo_raw_html_and_file():
    """Process raw HTML and local files"""
    print("\n=== 11. Raw HTML and Local Files ===")

    raw_html = """
    <html><body>
        <h1>Sample Article</h1>
        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
    </body></html>
    """

    # Save the sample to a file next to this script so the demo works from any cwd
    file_path = (__cur_dir__ / "tmp" / "sample.html").absolute()
    with open(file_path, "w") as f:
        f.write(raw_html)

    async with AsyncWebCrawler() as crawler:
        # Crawl raw HTML
        raw_result = await crawler.arun(
            url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        print("Raw HTML processing:")
        print(f"Markdown: {raw_result.markdown.raw_markdown[:50]}...")

        # Crawl local file
        file_result = await crawler.arun(
            url=f"file://{file_path}",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("\nLocal file processing:")
        print(f"Markdown: {file_result.markdown.raw_markdown[:50]}...")

        # Clean up
        os.remove(file_path)
        print(f"Processed both raw HTML and local file ({file_path})")


async def main():
    """Run all demo functions sequentially"""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")

    # Run all demos
    await demo_basic_crawl()
    await demo_parallel_crawl()
    await demo_fit_markdown()
    await demo_llm_structured_extraction_no_schema()
    await demo_css_structured_extraction_no_schema()
    await demo_deep_crawl()
    await demo_js_interaction()
    await demo_media_and_links()
    await demo_screenshot_and_pdf()
    # await demo_proxy_rotation()  # Uses placeholder proxies; enable after adding real ones
    await demo_raw_html_and_file()

    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs, JSON) in the tmp/ directory next to this script")


if __name__ == "__main__":
    asyncio.run(main())