
import asyncio
from typing import List

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    BFSDeepCrawlStrategy,
    CrawlResult,
    FilterChain,
    DomainFilter,
    URLPatternFilter,
)

# Classes used from the crawl4ai library:
# - AsyncWebCrawler: the main class for web crawling.
# - CrawlerRunConfig: configuration class for crawler behavior.
# - BFSDeepCrawlStrategy: breadth-first search (BFS) deep crawling strategy.
# - CrawlResult: data model for individual crawl results.
# - FilterChain: chains multiple URL filters together.
# - DomainFilter: filters URLs by allowed and blocked domains.
# - URLPatternFilter: filters URLs based on patterns.
# Importing FilterChain and URLPatternFilter from crawl4ai.deep_crawling.filters
# also works; the top-level import is used here because both classes are
# re-exported in crawl4ai's __init__.py.


async def basic_deep_crawl():
    """
    Performs a basic deep crawl starting from a seed URL, demonstrating:

    - Breadth-First Search (BFS) deep crawling strategy.
    - Filtering URLs by URL pattern and by domain.
    - Accessing crawl results and metadata.
    """
    # 1. Define URL filters:
    # A URLPatternFilter keeps only URLs matching the given patterns; here it
    # restricts the crawl to URLs containing "text", which are likely to point
    # at textual content.
    url_filter = URLPatternFilter(
        patterns=[
            "*text*",  # Include URLs that contain "text" anywhere in the URL
        ]
    )

    # A DomainFilter restricts the crawl to the "groq.com" domain and
    # explicitly blocks "example.com".
    domain_filter = DomainFilter(
        allowed_domains=["groq.com"],
        blocked_domains=["example.com"],
    )
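
    # Other filters from crawl4ai.deep_crawling.filters can be chained the same
    # way. A minimal sketch, assuming ContentTypeFilter and its allowed_types
    # parameter as shown in the crawl4ai docs:
    # content_filter = ContentTypeFilter(allowed_types=["text/html"])
    # filter_chain = FilterChain(filters=[url_filter, domain_filter, content_filter])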

    # 2. Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,  # Crawl at most 2 levels below the start URL
            max_pages=10,  # Cap total pages at 10 to prevent excessive crawling
            include_external=False,  # Only follow URLs within the start URL's domain
            filter_chain=FilterChain(filters=[url_filter, domain_filter]),  # Apply both filters during the deep crawl
        ),
        verbose=True,  # Enable verbose logging for detailed output during crawling
    )
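
    # Note: to process pages as they are crawled instead of collecting them
    # into a list, the crawl4ai docs show passing stream=True to
    # CrawlerRunConfig and consuming arun() with
    # "async for result in await crawler.arun(...)".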

    # 3. Initialize and run AsyncWebCrawler:
    # Use AsyncWebCrawler as a context manager for automatic start and close.
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://docs.crawl4ai.com",  # Uncomment to use the crawl4ai docs as the start URL
            url="https://console.groq.com/docs",  # Start URL for the deep crawl: Groq documentation
            config=config,  # Pass the configured CrawlerRunConfig to arun
        )

    # 4. Process and print crawl results:
    # Iterate through the CrawlResult objects returned by the deep crawl and
    # print each URL with its crawl depth from the metadata.
    for result in results:
        print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")


if __name__ == "__main__":
    asyncio.run(basic_deep_crawl())