
Update example scripts to reflect latest API changes and improve demonstrations: - Increase test URLs in dispatcher example from 20 to 40 pages - Comment out unused dispatcher strategies for cleaner output - Fix scraping strategies performance script to use correct object notation - Update v0_4_3_features_demo with additional feature mentions and uncomment demo sections These changes make the examples more current and better aligned with the actual API.
135 lines
4.6 KiB
Python
135 lines
4.6 KiB
Python
import time, re
|
|
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
|
import time
|
|
import functools
|
|
from collections import defaultdict
|
|
|
|
class TimingStats:
|
|
def __init__(self):
|
|
self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0}))
|
|
|
|
def add(self, strategy_name, func_name, elapsed):
|
|
self.stats[strategy_name][func_name]["calls"] += 1
|
|
self.stats[strategy_name][func_name]["total_time"] += elapsed
|
|
|
|
def report(self):
|
|
for strategy_name, funcs in self.stats.items():
|
|
print(f"\n{strategy_name} Timing Breakdown:")
|
|
print("-" * 60)
|
|
print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}")
|
|
print("-" * 60)
|
|
|
|
for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True):
|
|
avg_ms = (data["total_time"] / data["calls"]) * 1000
|
|
print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
|
|
|
|
timing_stats = TimingStats()
|
|
|
|
# Modify timing decorator
|
|
def timing_decorator(strategy_name):
|
|
def decorator(func):
|
|
@functools.wraps(func)
|
|
def wrapper(*args, **kwargs):
|
|
start = time.time()
|
|
result = func(*args, **kwargs)
|
|
elapsed = time.time() - start
|
|
timing_stats.add(strategy_name, func.__name__, elapsed)
|
|
return result
|
|
return wrapper
|
|
return decorator
|
|
|
|
# Modified decorator application
|
|
def apply_decorators(cls, method_name, strategy_name):
|
|
try:
|
|
original_method = getattr(cls, method_name)
|
|
decorated_method = timing_decorator(strategy_name)(original_method)
|
|
setattr(cls, method_name, decorated_method)
|
|
except AttributeError:
|
|
print(f"Method {method_name} not found in class {cls.__name__}.")
|
|
|
|
# Apply to key methods
|
|
methods_to_profile = [
|
|
'_scrap',
|
|
# 'process_element',
|
|
'_process_element',
|
|
'process_image',
|
|
]
|
|
|
|
|
|
# Apply decorators to both strategies
|
|
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
|
for method in methods_to_profile:
|
|
apply_decorators(strategy, method, name)
|
|
|
|
|
|
def generate_large_html(n_elements=1000):
|
|
html = ['<!DOCTYPE html><html><head></head><body>']
|
|
for i in range(n_elements):
|
|
html.append(f'''
|
|
<div class="article">
|
|
<h2>Heading {i}</h2>
|
|
<div>
|
|
<div>
|
|
<p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
|
|
</div>
|
|
</div>
|
|
<img src="image{i}.jpg" alt="Image {i}">
|
|
<ul>
|
|
<li>List item {i}.1</li>
|
|
<li>List item {i}.2</li>
|
|
</ul>
|
|
</div>
|
|
''')
|
|
html.append('</body></html>')
|
|
return ''.join(html)
|
|
|
|
def test_scraping():
|
|
# Initialize both scrapers
|
|
original_scraper = WebScrapingStrategy()
|
|
selected_scraper = LXMLWebScrapingStrategy()
|
|
|
|
# Generate test HTML
|
|
print("Generating HTML...")
|
|
html = generate_large_html(5000)
|
|
print(f"HTML Size: {len(html)/1024:.2f} KB")
|
|
|
|
# Time the scraping
|
|
print("\nStarting scrape...")
|
|
start_time = time.time()
|
|
|
|
kwargs = {
|
|
"url": "http://example.com",
|
|
"html": html,
|
|
"word_count_threshold": 5,
|
|
"keep_data_attributes": True
|
|
}
|
|
|
|
t1 = time.perf_counter()
|
|
result_selected = selected_scraper.scrap(**kwargs)
|
|
t2 = time.perf_counter()
|
|
|
|
result_original = original_scraper.scrap(**kwargs)
|
|
t3 = time.perf_counter()
|
|
|
|
elapsed = t3 - start_time
|
|
print(f"\nScraping completed in {elapsed:.2f} seconds")
|
|
|
|
timing_stats.report()
|
|
|
|
# Print stats of LXML output
|
|
print("\Turbo Output:")
|
|
print(f"\nExtracted links: {len(result_selected.links.internal) + len(result_selected.links.external)}")
|
|
print(f"Extracted images: {len(result_selected.media.images)}")
|
|
print(f"Clean HTML size: {len(result_selected.cleaned_html)/1024:.2f} KB")
|
|
print(f"Scraping time: {t2 - t1:.2f} seconds")
|
|
|
|
# Print stats of original output
|
|
print("\nOriginal Output:")
|
|
print(f"\nExtracted links: {len(result_original.links.internal) + len(result_original.links.external)}")
|
|
print(f"Extracted images: {len(result_original.media.images)}")
|
|
print(f"Clean HTML size: {len(result_original.cleaned_html)/1024:.2f} KB")
|
|
print(f"Scraping time: {t3 - t1:.2f} seconds")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
test_scraping() |