
Add new features to enhance browser automation and HTML extraction:

- Add CDP browser launch capability with customizable ports and profiles
- Implement JsonLxmlExtractionStrategy for faster HTML parsing
- Add CLI command 'crwl cdp' for launching standalone CDP browsers
- Support connecting to external CDP browsers via URL
- Optimize selector caching and context-sensitive queries

BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
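Below is a minimal, hypothetical sketch of using the new external-CDP support from Python. The cdp_url parameter name and the exact wiring are assumptions inferred from the bullets above (the commit only states that connecting via URL is supported), so verify against the released BrowserConfig API; the standalone browser itself would be launched with the new 'crwl cdp' command.

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    # Assumption: the external CDP browser is handed to BrowserConfig via a
    # cdp_url parameter; adjust the name if the released API differs.
    browser_config = BrowserConfig(cdp_url="http://localhost:9222")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com", config=CrawlerRunConfig())
        print(result.success)

asyncio.run(main())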
255 lines · 8.3 KiB · Python
import inspect
from typing import Any, Dict
from enum import Enum

from crawl4ai import LLMConfig


def to_serializable_dict(obj: Any) -> Dict:
    """
    Recursively convert an object to a serializable dictionary using {type, params} structure
    for complex objects.
    """
    if obj is None:
        return None

    # Handle basic types
    if isinstance(obj, (str, int, float, bool)):
        return obj

    # Handle Enum
    if isinstance(obj, Enum):
        return {
            "type": obj.__class__.__name__,
            "params": obj.value
        }

    # Handle datetime objects
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()

    # Handle lists, tuples, and sets
    if isinstance(obj, (list, tuple, set)):
        return [to_serializable_dict(item) for item in obj]

    # Handle dictionaries - preserve them as-is
    if isinstance(obj, dict):
        return {
            "type": "dict",  # Mark as plain dictionary
            "value": {str(k): to_serializable_dict(v) for k, v in obj.items()}
        }

    # Handle class instances
    if hasattr(obj, '__class__'):
        # Get constructor signature
        sig = inspect.signature(obj.__class__.__init__)
        params = sig.parameters

        # Get current values
        current_values = {}
        for name, param in params.items():
            if name == 'self':
                continue

            value = getattr(obj, name, param.default)

            # Only include if different from default, considering empty values
            if not (is_empty_value(value) and is_empty_value(param.default)):
                if value != param.default:
                    current_values[name] = to_serializable_dict(value)

        return {
            "type": obj.__class__.__name__,
            "params": current_values
        }

    return str(obj)
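
# Illustrative output shapes (descriptive comments only, grounded in the
# branches above; nothing here is executed):
#   to_serializable_dict({"a": 1})  ->  {"type": "dict", "value": {"a": 1}}
#   to_serializable_dict([1, "x"])  ->  [1, "x"]
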
def from_serializable_dict(data: Any) -> Any:
    """
    Recursively convert a serializable dictionary back to an object instance.
    """
    if data is None:
        return None

    # Handle basic types
    if isinstance(data, (str, int, float, bool)):
        return data

    # Handle typed data
    if isinstance(data, dict) and "type" in data:
        # Handle plain dictionaries
        if data["type"] == "dict":
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

        # Import from crawl4ai for class instances
        import crawl4ai
        cls = getattr(crawl4ai, data["type"])

        # Handle Enum
        if issubclass(cls, Enum):
            return cls(data["params"])

        # Handle class instances
        constructor_args = {
            k: from_serializable_dict(v) for k, v in data["params"].items()
        }
        return cls(**constructor_args)

    # Handle lists
    if isinstance(data, list):
        return [from_serializable_dict(item) for item in data]

    # Handle raw dictionaries (legacy support)
    if isinstance(data, dict):
        return {k: from_serializable_dict(v) for k, v in data.items()}

    return data
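
# Example round trip (descriptive comments only): an Enum serialized by
# to_serializable_dict comes back as the same member, e.g.
#   from_serializable_dict({"type": "CacheMode", "params": "bypass"})
#       -> CacheMode.BYPASS
# (assuming crawl4ai exposes CacheMode with that value, as in the sample
# output shown further below)
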
def is_empty_value(value: Any) -> bool:
    """Check if a value is effectively empty/null."""
    if value is None:
        return True
    if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
        return True
    return False
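
# Note the asymmetry with falsy scalars (descriptive comments only):
#   is_empty_value("")    -> True   # empty containers/strings count as empty
#   is_empty_value(0)     -> False  # 0 and False are real values, not "empty",
#   is_empty_value(False) -> False  # so they are still compared against defaults
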
# if __name__ == "__main__":
#     from crawl4ai import (
#         CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
#         PruningContentFilter, BM25ContentFilter, LLMContentFilter,
#         JsonCssExtractionStrategy, CosineStrategy, RegexChunking,
#         WebScrapingStrategy, LXMLWebScrapingStrategy
#     )

#     # Test Case 1: BM25 content filtering through markdown generator
#     config1 = CrawlerRunConfig(
#         cache_mode=CacheMode.BYPASS,
#         markdown_generator=DefaultMarkdownGenerator(
#             content_filter=BM25ContentFilter(
#                 user_query="technology articles",
#                 bm25_threshold=1.2,
#                 language="english"
#             )
#         ),
#         chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
#         excluded_tags=["nav", "footer", "aside"],
#         remove_overlay_elements=True
#     )

#     # Serialize
#     serialized = to_serializable_dict(config1)
#     print("\nSerialized Config:")
#     print(serialized)

#     # Example output structure would now look like:
#     """
#     {
#         "type": "CrawlerRunConfig",
#         "params": {
#             "cache_mode": {
#                 "type": "CacheMode",
#                 "params": "bypass"
#             },
#             "markdown_generator": {
#                 "type": "DefaultMarkdownGenerator",
#                 "params": {
#                     "content_filter": {
#                         "type": "BM25ContentFilter",
#                         "params": {
#                             "user_query": "technology articles",
#                             "bm25_threshold": 1.2,
#                             "language": "english"
#                         }
#                     }
#                 }
#             }
#         }
#     }
#     """

#     # Deserialize
#     deserialized = from_serializable_dict(serialized)
#     print("\nDeserialized Config:")
#     print(to_serializable_dict(deserialized))

#     # Verify they match
#     assert to_serializable_dict(config1) == to_serializable_dict(deserialized)
#     print("\nVerification passed: Configuration matches after serialization/deserialization!")

if __name__ == "__main__":
    from crawl4ai import (
        CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
        PruningContentFilter, BM25ContentFilter, LLMContentFilter,
        JsonCssExtractionStrategy, RegexChunking,
        WebScrapingStrategy, LXMLWebScrapingStrategy
    )

    # Test Case 1: BM25 content filtering through markdown generator
    config1 = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="technology articles",
                bm25_threshold=1.2,
                language="english"
            )
        ),
        chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True
    )

    # Test Case 2: LLM-based extraction with pruning filter
    schema = {
        "baseSelector": "article.post",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".content", "type": "html"}
        ]
    }
    config2 = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            ),
            options={"ignore_links": True}
        ),
        scraping_strategy=LXMLWebScrapingStrategy()
    )

    # Test Case 3: LLM content filter
    config3 = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=LLMContentFilter(
                llm_config=LLMConfig(provider="openai/gpt-4"),
                instruction="Extract key technical concepts",
                chunk_token_threshold=2000,
                overlap_rate=0.1
            ),
            options={"ignore_images": True}
        ),
        scraping_strategy=WebScrapingStrategy()
    )

    # Test all configurations
    test_configs = [config1, config2, config3]

    for i, config in enumerate(test_configs, 1):
        print(f"\nTesting Configuration {i}:")

        # Serialize
        serialized = to_serializable_dict(config)
        print(f"\nSerialized Config {i}:")
        print(serialized)

        # Deserialize
        deserialized = from_serializable_dict(serialized)
        print(f"\nDeserialized Config {i}:")
        print(to_serializable_dict(deserialized))  # Convert back to dict for comparison

        # Verify they match
        assert to_serializable_dict(config) == to_serializable_dict(deserialized)
        print(f"\nVerification passed: Configuration {i} matches after serialization/deserialization!")