
Implement comprehensive network request and console message capturing functionality: - Add capture_network_requests and capture_console_messages config parameters - Add network_requests and console_messages fields to models - Implement Playwright event listeners to capture requests, responses, and console output - Create detailed documentation and examples - Add comprehensive tests This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
171 lines
5.9 KiB
Python
171 lines
5.9 KiB
Python
import asyncio
|
|
from typing import Dict
|
|
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
|
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
import time
|
|
|
|
# Test HTML samples
|
|
TEST_HTML_SAMPLES = {
|
|
"basic": """
|
|
<body>
|
|
<h1>Test Title</h1>
|
|
<p>This is a test paragraph with <a href="http://example.com">a link</a>.</p>
|
|
<div class="content">
|
|
<h2>Section 1</h2>
|
|
<p>More content here with <b>bold text</b>.</p>
|
|
</div>
|
|
</body>
|
|
""",
|
|
|
|
"complex": """
|
|
<body>
|
|
<nav>Navigation menu that should be removed</nav>
|
|
<header>Header content to remove</header>
|
|
<main>
|
|
<article>
|
|
<h1>Main Article</h1>
|
|
<p>Important content paragraph with <a href="http://test.com">useful link</a>.</p>
|
|
<section>
|
|
<h2>Key Section</h2>
|
|
<p>Detailed explanation with multiple sentences. This should be kept
|
|
in the final output. Very important information here.</p>
|
|
</section>
|
|
</article>
|
|
<aside>Sidebar content to remove</aside>
|
|
</main>
|
|
<footer>Footer content to remove</footer>
|
|
</body>
|
|
""",
|
|
|
|
"edge_cases": """
|
|
<body>
|
|
<div>
|
|
<p></p>
|
|
<p> </p>
|
|
<script>alert('remove me');</script>
|
|
<div class="advertisement">Ad content to remove</div>
|
|
<p class="social-share">Share buttons to remove</p>
|
|
<h1>!!Special>> Characters## Title!!</h1>
|
|
<pre><code>def test(): pass</code></pre>
|
|
</div>
|
|
</body>
|
|
""",
|
|
|
|
"links_citations": """
|
|
<body>
|
|
<h1>Document with Links</h1>
|
|
<p>First link to <a href="http://example.com/1">Example 1</a></p>
|
|
<p>Second link to <a href="http://example.com/2" title="Example 2">Test 2</a></p>
|
|
<p>Image link: <img src="test.jpg" alt="test image"></p>
|
|
<p>Repeated link to <a href="http://example.com/1">Example 1 again</a></p>
|
|
</body>
|
|
""",
|
|
}
|
|
|
|
def test_content_filters() -> Dict[str, Dict[str, int]]:
|
|
"""Test various content filtering strategies and return length comparisons."""
|
|
results = {}
|
|
|
|
# Initialize filters
|
|
pruning_filter = PruningContentFilter(
|
|
threshold=0.48,
|
|
threshold_type="fixed",
|
|
min_word_threshold=2
|
|
)
|
|
|
|
bm25_filter = BM25ContentFilter(
|
|
bm25_threshold=1.0,
|
|
user_query="test article content important"
|
|
)
|
|
|
|
# Test each HTML sample
|
|
for test_name, html in TEST_HTML_SAMPLES.items():
|
|
# Store results for this test case
|
|
results[test_name] = {}
|
|
|
|
# Test PruningContentFilter
|
|
start_time = time.time()
|
|
pruned_content = pruning_filter.filter_content(html)
|
|
pruning_time = time.time() - start_time
|
|
|
|
# Test BM25ContentFilter
|
|
start_time = time.time()
|
|
bm25_content = bm25_filter.filter_content(html)
|
|
bm25_time = time.time() - start_time
|
|
|
|
# Store results
|
|
results[test_name] = {
|
|
"original_length": len(html),
|
|
"pruned_length": sum(len(c) for c in pruned_content),
|
|
"bm25_length": sum(len(c) for c in bm25_content),
|
|
"pruning_time": pruning_time,
|
|
"bm25_time": bm25_time
|
|
}
|
|
|
|
return results
|
|
|
|
def test_markdown_generation():
|
|
"""Test markdown generation with different configurations."""
|
|
results = []
|
|
|
|
# Initialize generators with different configurations
|
|
generators = {
|
|
"no_filter": DefaultMarkdownGenerator(),
|
|
"pruning": DefaultMarkdownGenerator(
|
|
content_filter=PruningContentFilter(threshold=0.48)
|
|
),
|
|
"bm25": DefaultMarkdownGenerator(
|
|
content_filter=BM25ContentFilter(
|
|
user_query="test article content important"
|
|
)
|
|
)
|
|
}
|
|
|
|
# Test each generator with each HTML sample
|
|
for test_name, html in TEST_HTML_SAMPLES.items():
|
|
for gen_name, generator in generators.items():
|
|
start_time = time.time()
|
|
result = generator.generate_markdown(
|
|
html,
|
|
base_url="http://example.com",
|
|
citations=True
|
|
)
|
|
|
|
results.append({
|
|
"test_case": test_name,
|
|
"generator": gen_name,
|
|
"time": time.time() - start_time,
|
|
"raw_length": len(result.raw_markdown),
|
|
"fit_length": len(result.fit_markdown) if result.fit_markdown else 0,
|
|
"citations": len(result.references_markdown)
|
|
})
|
|
|
|
return results
|
|
|
|
def main():
|
|
"""Run all tests and print results."""
|
|
print("Starting content filter tests...")
|
|
filter_results = test_content_filters()
|
|
|
|
print("\nContent Filter Results:")
|
|
print("-" * 50)
|
|
for test_name, metrics in filter_results.items():
|
|
print(f"\nTest case: {test_name}")
|
|
print(f"Original length: {metrics['original_length']}")
|
|
print(f"Pruned length: {metrics['pruned_length']} ({metrics['pruning_time']:.3f}s)")
|
|
print(f"BM25 length: {metrics['bm25_length']} ({metrics['bm25_time']:.3f}s)")
|
|
|
|
print("\nStarting markdown generation tests...")
|
|
markdown_results = test_markdown_generation()
|
|
|
|
print("\nMarkdown Generation Results:")
|
|
print("-" * 50)
|
|
for result in markdown_results:
|
|
print(f"\nTest: {result['test_case']} - Generator: {result['generator']}")
|
|
print(f"Time: {result['time']:.3f}s")
|
|
print(f"Raw length: {result['raw_length']}")
|
|
print(f"Fit length: {result['fit_length']}")
|
|
print(f"Citations: {result['citations']}")
|
|
|
|
if __name__ == "__main__":
|
|
main() |