refactor(config): enhance serialization and config handling
- Add ignore_default_value option to to_serializable_dict
- Add viewport dict support in BrowserConfig
- Replace FastFilterChain with FilterChain
- Add deprecation warnings for unwanted properties
- Clean up unused imports
- Rename example files for consistency
- Add comprehensive Docker configuration tutorial

BREAKING CHANGE: FastFilterChain has been replaced with FilterChain
parent dad592c801
commit 3cb28875c3
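For downstream code hit by the BREAKING CHANGE, migration is essentially a rename. A minimal sketch (not part of this commit; the constructor arguments mirror the tutorial file added below):

# Before this commit (old API, now removed):
#   from crawl4ai.deep_crawling.filters import FastFilterChain, FastContentTypeFilter
#   chain = FastFilterChain(filters=[FastContentTypeFilter(allowed_types=["text/html"])])

# After this commit:
from crawl4ai.deep_crawling.filters import FilterChain, ContentTypeFilter

chain = FilterChain(filters=[ContentTypeFilter(allowed_types=["text/html"])])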
@@ -1,5 +1,3 @@
-from email import header
-from re import I
from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -23,7 +21,7 @@ import inspect
from typing import Any, Dict, Optional
from enum import Enum

-def to_serializable_dict(obj: Any) -> Dict:
+def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
    """
    Recursively convert an object to a serializable dictionary using {type, params} structure
    for complex objects.
@@ -60,7 +58,9 @@ def to_serializable_dict(obj: Any) -> Dict:
            "type": "dict", # Mark as plain dictionary
            "value": {str(k): to_serializable_dict(v) for k, v in obj.items()}
        }

+    _type = obj.__class__.__name__
+
    # Handle class instances
    if hasattr(obj, '__class__'):
        # Get constructor signature
@@ -77,10 +77,18 @@ def to_serializable_dict(obj: Any) -> Dict:

            # Only include if different from default, considering empty values
            if not (is_empty_value(value) and is_empty_value(param.default)):
-                if value != param.default:
+                if value != param.default and not ignore_default_value:
                    current_values[name] = to_serializable_dict(value)

+        _type = obj.__class__.__name__
+        if hasattr(obj, '__slots__'):
+            for slot in obj.__slots__:
+                if slot.startswith('_'): # Handle private slots
+                    attr_name = slot[1:] # Remove leading '_'
+                    value = getattr(obj, slot, None)
+                    if value is not None:
+                        current_values[attr_name] = to_serializable_dict(value)

        return {
            "type": obj.__class__.__name__,
@@ -169,6 +177,8 @@ class BrowserConfig():
            If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
+        viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
+            Default: None.
        verbose (bool): Enable verbose logging.
            Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
@@ -211,6 +221,7 @@ class BrowserConfig():
        proxy_config: dict = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
+        viewport: dict = None,
        accept_downloads: bool = False,
        downloads_path: str = None,
        storage_state : Union[str, dict, None]=None,
@@ -249,6 +260,10 @@ class BrowserConfig():
        self.proxy_config = proxy_config
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
+        self.viewport = viewport
+        if self.viewport is not None:
+            self.viewport_width = self.viewport.get("width", 1080)
+            self.viewport_height = self.viewport.get("height", 600)
        self.accept_downloads = accept_downloads
        self.downloads_path = downloads_path
        self.storage_state = storage_state
@@ -436,6 +451,13 @@ class HTTPCrawlerConfig():
        return HTTPCrawlerConfig.from_kwargs(config)

class CrawlerRunConfig():
+    _UNWANTED_PROPS = {
+        'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
+        'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
+        'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
+        'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
+    }
+
    """
    Configuration class for controlling how the crawler runs each crawl operation.
    This includes parameters for content extraction, page manipulation, waiting conditions,
@@ -680,6 +702,7 @@ class CrawlerRunConfig():
        deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,

    ):
+        # TODO: Planning to set properties dynamically based on the __init__ signature
        self.url = url

        # Content Processing Parameters
@@ -791,6 +814,24 @@ class CrawlerRunConfig():
        # Deep Crawl Parameters
        self.deep_crawl_strategy = deep_crawl_strategy

+    def __getattr__(self, name):
+        """Handle attribute access."""
+        if name in self._UNWANTED_PROPS:
+            raise AttributeError(f"Getting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
+        raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'")
+
+    def __setattr__(self, name, value):
+        """Handle attribute setting."""
+        # TODO: Planning to set properties dynamically based on the __init__ signature
+        sig = inspect.signature(self.__init__)
+        all_params = sig.parameters # Dictionary of parameter names and their details
+
+        if name in self._UNWANTED_PROPS and value is not all_params[name].default:
+            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
+
+        super().__setattr__(name, value)
+
    @staticmethod
    def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
        return CrawlerRunConfig(
@@ -988,3 +1029,5 @@ class CrawlerRunConfig():
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return CrawlerRunConfig.from_kwargs(config_dict)

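A minimal sketch (not part of this commit) of the behavior changed above; the import path crawl4ai.async_configs for the serializer is an assumption based on the imports later in this diff:

from crawl4ai import BrowserConfig                      # public import, as used in the tutorial below
from crawl4ai.async_configs import to_serializable_dict # assumed location of the serializer shown above

# A viewport dict now takes precedence over viewport_width / viewport_height.
cfg = BrowserConfig(viewport={"width": 1200, "height": 800})
print(cfg.viewport_width, cfg.viewport_height)  # 1200 800

# The serializer gained an optional flag (see the changed signature above).
snapshot = to_serializable_dict(cfg, ignore_default_value=True)

# Per _UNWANTED_PROPS, the legacy cache flags on CrawlerRunConfig (disable_cache,
# bypass_cache, no_cache_read, no_cache_write) now raise AttributeError with a
# hint to use cache_mode instead.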
@@ -10,7 +10,7 @@ import inspect
from crawl4ai import CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.models import CrawlResult, TraversalStats
-from crawl4ai.deep_crawling.filters import FastFilterChain
+from crawl4ai.deep_crawling.filters import FilterChain
from crawl4ai.async_webcrawler import AsyncWebCrawler
import time
import logging
@@ -313,7 +313,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):

    def __init__(self,
                 max_depth: int,
-                 filter_chain: FastFilterChain = FastFilterChain(),
+                 filter_chain: FilterChain = FilterChain(),
                 priority_fn: Callable[[str], Awaitable[float]] = lambda url: 1.0,
                 logger: logging.Logger = None):
        self.max_depth = max_depth
@@ -408,7 +408,7 @@ async def main():
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        priority_fn=lambda url: 1.0 / (len(url) + 1e-9), # Inverse length priority
-        # filter_chain=FastFilterChain(...)
+        # filter_chain=FilterChain(...)
    )

    config: CrawlerRunConfig = CrawlerRunConfig(
@@ -469,57 +469,40 @@ print(config.dump()) # Use this JSON in your API calls
  "type": "BFSDeepCrawlStrategy",
  "params": {
    "max_depth": 3,
    "max_pages": 100,
    "filter_chain": {
-      "type": "FastFilterChain",
+      "type": "FilterChain",
      "params": {
        "filters": [
          {
-            "type": "FastContentTypeFilter",
+            "type": "ContentTypeFilter",
            "params": {
              "allowed_types": ["text/html", "application/xhtml+xml"]
            }
          },
          {
-            "type": "FastDomainFilter",
+            "type": "DomainFilter",
            "params": {
              "allowed_domains": ["blog.*", "docs.*"],
              "blocked_domains": ["ads.*", "analytics.*"]
            }
          },
          {
            "type": "FastURLPatternFilter",
            "params": {
              "allowed_patterns": ["^/blog/", "^/docs/"],
              "blocked_patterns": [".*/ads/", ".*/sponsored/"]
            }
          }
        ]
      }
    },
    "url_scorer": {
-      "type": "FastCompositeScorer",
+      "type": "CompositeScorer",
      "params": {
        "scorers": [
          {
-            "type": "FastKeywordRelevanceScorer",
+            "type": "KeywordRelevanceScorer",
            "params": {
              "keywords": ["tutorial", "guide", "documentation"],
              "weight": 1.0
            }
          },
          {
-            "type": "FastPathDepthScorer",
+            "type": "PathDepthScorer",
            "params": {
              "weight": 0.5,
              "preferred_depth": 2
            }
          },
          {
            "type": "FastFreshnessScorer",
            "params": {
              "weight": 0.8,
              "max_age_days": 365
              "optimal_depth": 3
            }
          }
        ]
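The JSON above is the shape that config.dump() prints for a deep-crawl setup. As a rough cross-reference (not part of this commit), the renamed classes would be wired up roughly like this in Python, mirroring part 3 of the tutorial file added below; the URL-pattern and freshness entries are omitted because their replacements are not visible in this hunk:

from crawl4ai import CrawlerRunConfig
from crawl4ai.deep_crawling.bfs_strategy import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, ContentTypeFilter, DomainFilter
from crawl4ai.deep_crawling.scorers import CompositeScorer, KeywordRelevanceScorer, PathDepthScorer

config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(
        max_depth=3,
        filter_chain=FilterChain(filters=[
            ContentTypeFilter(allowed_types=["text/html", "application/xhtml+xml"]),
            DomainFilter(allowed_domains=["blog.*", "docs.*"]),
        ]),
        url_scorer=CompositeScorer(scorers=[
            KeywordRelevanceScorer(keywords=["tutorial", "guide", "documentation"]),
            PathDepthScorer(optimal_depth=2),
        ]),
    )
)
print(config.dump())  # emits the {type, params} JSON structure shown above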
docs/examples/docker_config_obj.py (new file, 249 lines)
@@ -0,0 +1,249 @@
from crawl4ai import BrowserConfig, CrawlerRunConfig, PruningContentFilter, DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import ContentTypeFilter, DomainFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer, PathDepthScorer
from crawl4ai.cache_context import CacheMode
from crawl4ai.deep_crawling.bfs_strategy import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain
from crawl4ai.deep_crawling.scorers import CompositeScorer
from crawl4ai.docker_client import Crawl4aiDockerClient
import json
from rich.console import Console
from rich.syntax import Syntax

console = Console()

def print_json(data: dict, title: str = None):
    """Helper to print JSON prettily with syntax highlighting"""
    if title:
        console.print(f"\n[bold blue]{title}[/bold blue]")
    json_str = json.dumps(data, indent=2)
    syntax = Syntax(json_str, "json", theme="monokai", line_numbers=True)
    console.print(syntax)

async def part1_basic_config():
    """PART 1: Understanding Basic Configuration Objects

    Here we create simple configuration objects and examine their structure.
    This helps understand the basic type-params pattern used throughout the API.
    """
    console.print("\n[bold green]Explanation:[/bold green] Configuration objects like BrowserConfig and CrawlerRunConfig are the foundation of Crawl4AI. They define how the crawler behaves—e.g., whether it runs headless or how it processes content. These objects use a 'type-params' pattern: 'type' identifies the object class, and 'params' holds its settings. This structure is key because it’s reusable and can be serialized into JSON for API calls.")

    # Create a simple browser config
    browser_config = BrowserConfig(
        headless=False,
        viewport_width=500,
        headers = {"User-Agent": "Mozilla/5.0"}
    )

    # Show its structure
    print_json(browser_config.dump(), "Simple Browser Config Structure")

    # Create a more complex config with nested objects
    crawler_config = CrawlerRunConfig(
        word_count_threshold=200,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.5)
        )
    )

    print_json(crawler_config.dump(), "Complex Config with Nested Objects")

async def part2_manual_json():
    """PART 2: Building JSON Manually

    Learn how to construct the JSON structure by hand.
    This demonstrates deep understanding of the configuration format.
    """
    console.print("\n[bold green]Explanation:[/bold green] Manually building JSON configurations mirrors how the API expects data. It’s a hands-on way to learn the exact structure—each object has a 'type' and 'params' section. This is useful when you’re troubleshooting or working without the SDK, as it forces you to understand every detail of the config format.")

    # Manual browser config
    manual_browser = {
        "type": "BrowserConfig",
        "params": {
            "headless": True,
            "viewport": {
                "type": "dict",
                "value": {
                    "width": 1200,
                    "height": 800
                }
            }
        }
    }

    # Validate by loading into BrowserConfig
    loaded_config = BrowserConfig.load(manual_browser)
    print_json(loaded_config.dump(), "Manually Created -> Loaded -> Dumped")

    # Show they're equivalent
    original = BrowserConfig(headless=True, viewport={"width": 1200, "height": 800})
    assert loaded_config.dump() == original.dump(), "Configs are equivalent!"

async def part3_complex_structures():
    """PART 3: Working with Complex Nested Structures

    Explore more complex configurations with multiple levels of nesting.
    This shows how the type-params pattern scales to complex scenarios.
    """
    console.print("\n[bold green]Explanation:[/bold green] Real-world crawling often requires detailed settings—like filtering content or customizing output. Here, we nest objects (e.g., a markdown generator with a content filter) using the same 'type-params' pattern. This nesting lets you fine-tune the crawler’s behavior at multiple levels, making it powerful and flexible.")

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter()
        ),
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=5,
            filter_chain=FilterChain(
                filters=[
                    ContentTypeFilter(allowed_types=["text/html"]),
                    DomainFilter(allowed_domains=["example.com"])
                ]
            ),
            url_scorer=CompositeScorer(
                scorers=[
                    KeywordRelevanceScorer(keywords=["data", "analysis"]),
                    PathDepthScorer(optimal_depth=3)
                ]
            )
        )
    )

    print_json(config.dump(), "Deep Nested Configuration")

async def part4_client_sdk():
    """PART 4: Using the Client SDK

    Demonstrate how the SDK makes working with the API simple by handling
    all the complex serialization automatically.
    """
    console.print("\n[bold green]Explanation:[/bold green] The Crawl4aiDockerClient SDK is a time-saver—it takes your configuration objects and turns them into API-ready JSON automatically. This means less manual work and fewer mistakes. You just define your settings, pass them to the SDK, and it handles the rest, making crawling easier and faster.")

    async with Crawl4aiDockerClient(base_url="http://localhost:8000") as client:
        # You would normally authenticate here if JWT is enabled
        await client.authenticate("user@example.com")

        # Create configs
        browser_config = BrowserConfig(headless=True)
        crawler_config = CrawlerRunConfig(stream=False)

        # SDK handles all serialization
        result = await client.crawl(
            urls=["https://example.com"],
            browser_config=browser_config,
            crawler_config=crawler_config
        )

        console.print("\n[bold green]🚀 Crawl completed successfully![/bold green]")
        console.print(f"Markdown length: {len(result.markdown)} characters")

async def part5_direct_api():
    """PART 5: Using the API Directly

    Learn how to make direct API calls without the SDK.
    This demonstrates the raw request structure and gives more control.
    """
    console.print("\n[bold green]Explanation:[/bold green] Skipping the SDK means you’re in full control—you build the JSON payload yourself and send it to the API. This is harder but gives you a deeper understanding of how Crawl4AI works under the hood. It’s also useful if you’re integrating with systems that don’t use the SDK.")

    import aiohttp
    from datetime import datetime

    # Prepare the request payload
    payload = {
        "urls": ["https://example.com"],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
                "viewport": {
                    "type": "dict",
                    "value": {
                        "width": 1200,
                        "height": 800
                    }
                }
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": "bypass",
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "PruningContentFilter",
                            "params": {
                                "threshold": 0.48,
                                "threshold_type": "fixed"
                            }
                        }
                    }
                }
            }
        }
    }

    print_json(payload, "Direct API Request Payload")

    async with aiohttp.ClientSession() as session:
        # If JWT is enabled, get token first
        token_response = await session.post(
            "http://localhost:8000/token",
            json={"email": "user@example.com"}
        )
        token = (await token_response.json())["access_token"]
        headers = {"Authorization": f"Bearer {token}"}

        # Make the crawl request
        start_time = datetime.now()
        async with session.post(
            "http://localhost:8000/crawl",
            json=payload,
            headers=headers # comment if using JWT
        ) as response:
            result = await response.json()
            duration = (datetime.now() - start_time).total_seconds()

            console.print(f"\n[bold green]✅ API call completed in {duration:.2f}s[/bold green]")
            print_json(result, "API Response")

async def part6_wrap_up():
    """PART 6: Wrap-Up and Key Takeaways

    Summarize the key concepts learned in this tutorial.
    """
    console.print("\n[bold yellow]🎓 Tutorial Wrap-Up[/bold yellow]")
    console.print("[italic]Key Takeaways:[/italic]\n")
    console.print("- **Configurations:** Use the type-params pattern to define settings flexibly.")
    console.print("- **Manual JSON:** Build configs by hand to master the structure.")
    console.print("- **Nesting:** Customize deeply with nested objects.")
    console.print("- **SDK:** Simplify API calls with automatic serialization.")
    console.print("- **Direct API:** Gain control by crafting raw requests.")
    console.print("\n[bold green]🚀 You’re ready to crawl with Crawl4AI![/bold green]")

async def main():
    """Main tutorial runner that executes each part in sequence"""
    console.print("\n[bold yellow]🎓 Crawl4AI Docker Tutorial[/bold yellow]")
    console.print("[italic]Learn how to work with configuration objects and the Docker API[/italic]\n")

    parts = [
        (part1_basic_config, "Understanding Basic Configurations"),
        (part2_manual_json, "Manual JSON Construction"),
        (part3_complex_structures, "Complex Nested Structures"),
        (part4_client_sdk, "Using the Client SDK"),
        (part5_direct_api, "Direct API Integration"),
        (part6_wrap_up, "Wrap-Up and Key Takeaways")
    ]

    for func, title in parts:
        console.print(f"\n[bold cyan]📚 {title}[/bold cyan]")
        console.print("[dim]" + func.__doc__.strip() + "[/dim]\n")
        await func()
        if func != part6_wrap_up: # No pause after wrap-up
            input("\nPress Enter to continue...\n")

# Run the tutorial
if __name__ == "__main__":
    import asyncio
    asyncio.run(main())