crawl4ai/tests/docker/test_config_object.py
UncleCode 966fb47e64 feat(config): enhance serialization and add deep crawling exports
Improve configuration serialization with better handling of frozensets and slots.
Expand deep crawling module exports and documentation.
Add comprehensive API usage examples in Docker README.

- Add support for frozenset serialization
- Improve error handling in config loading
- Export additional deep crawling components
- Enhance Docker API documentation with detailed examples
- Fix ContentTypeFilter initialization
2025-02-13 21:45:19 +08:00

113 lines
4.1 KiB
Python

import json
from crawl4ai import (
CrawlerRunConfig,
DefaultMarkdownGenerator,
RegexChunking,
JsonCssExtractionStrategy,
BM25ContentFilter,
CacheMode
)
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FastFilterChain
from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
def create_test_config() -> CrawlerRunConfig:
    """Build a CrawlerRunConfig that exercises every major pluggable component.

    The returned config wires together a BM25-filtered markdown generator,
    a CSS-schema extraction strategy, and a BFS deep-crawl strategy with
    filtering and scoring, so a dump()/load() round trip touches all of the
    serialization paths under test.

    Returns:
        CrawlerRunConfig: a fully populated configuration object.
    """
    # Markdown pipeline: BM25 relevance filter feeding the default generator.
    bm25_filter = BM25ContentFilter(
        user_query="technology articles",
    )
    md_generator = DefaultMarkdownGenerator(
        content_filter=bm25_filter,
        options={"ignore_links": False, "body_width": 0},
    )

    # Structured extraction: CSS-selector schema targeting article pages.
    schema = {
        "name": "ArticleExtractor",
        "baseSelector": "article.content",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".article-body", "type": "html"},
        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=schema)

    # Deep crawling: content-type + domain filters plus a keyword scorer.
    chain = FastFilterChain([
        FastContentTypeFilter(["text/html"]),
        FastDomainFilter(blocked_domains=["ads.*"]),
    ])
    scorer = FastKeywordRelevanceScorer(
        keywords=["article", "blog"],
        weight=1.0,
    )
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=3,
        filter_chain=chain,
        url_scorer=scorer,
    )

    # Assemble everything into a single run configuration.
    return CrawlerRunConfig(
        word_count_threshold=200,
        extraction_strategy=css_strategy,
        chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
        markdown_generator=md_generator,
        css_selector="main.content",
        excluded_tags=["nav", "footer"],
        keep_attrs=["href", "src"],
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle",
        page_timeout=30000,
        scan_full_page=True,
        deep_crawl_strategy=bfs_strategy,
        verbose=True,
        stream=True,
    )
def test_config_serialization_cycle():
    """Round-trip a fully loaded CrawlerRunConfig through dump()/load().

    Verifies that scalar attributes survive serialization unchanged and that
    nested strategy objects are rehydrated to their concrete classes.
    """
    source = create_test_config()

    # Serialize to a plain dictionary and show the payload for debugging.
    payload = source.dump()
    print(json.dumps(payload, indent=2))

    # Rebuild a config object from the serialized form.
    restored = CrawlerRunConfig.load(payload)

    # Simple scalar/collection attributes must match after the round trip.
    for attr in (
        "word_count_threshold",
        "css_selector",
        "excluded_tags",
        "keep_attrs",
        "cache_mode",
        "wait_until",
        "page_timeout",
        "scan_full_page",
        "verbose",
        "stream",
    ):
        assert getattr(restored, attr) == getattr(source, attr)

    # Complex strategy objects must come back as the right concrete types.
    assert isinstance(restored.extraction_strategy, JsonCssExtractionStrategy)
    assert isinstance(restored.chunking_strategy, RegexChunking)
    assert isinstance(restored.markdown_generator, DefaultMarkdownGenerator)
    assert isinstance(restored.markdown_generator.content_filter, BM25ContentFilter)
    assert isinstance(restored.deep_crawl_strategy, BFSDeepCrawlStrategy)

    # Deep-crawl internals: depth plus filter/scorer components.
    strategy = restored.deep_crawl_strategy
    assert strategy.max_depth == 3
    assert isinstance(strategy.filter_chain, FastFilterChain)
    assert isinstance(strategy.url_scorer, FastKeywordRelevanceScorer)

    print("Serialization cycle test passed successfully!")


if __name__ == "__main__":
    test_config_serialization_cycle()