crawl4ai/tests/general/test_cache_context.py

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from playwright.async_api import Page, BrowserContext
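
# Overview: this test checks that AsyncWebCrawler reuses one BrowserContext per
# distinct CrawlerRunConfig. If reuse works, repeated crawls with the same config
# should report one shared context ID, and configs A and B should report
# different IDs from each other.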
async def test_reuse_context_by_config():
    # We will store each context ID in these lists to confirm reuse
    context_ids_for_A = []
    context_ids_for_B = []

    # Create a small hook to track context creation
    async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
        c_id = id(context)
        print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
        # Distinguish which config we used via the label stored in shared_data
        config_label = config.shared_data.get("config_label", "unknown")
        if config_label == "A":
            context_ids_for_A.append(c_id)
        elif config_label == "B":
            context_ids_for_B.append(c_id)
        return page
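
    # Note: id(context) is only a process-local identity check; it is sufficient
    # for spotting reuse within a single run, but it is not a stable identifier
    # across runs.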

    # Browser config - headless, verbose so we see logs
    browser_config = BrowserConfig(headless=True, verbose=True)
    # Two crawler run configs that differ in one setting (here, only_text):
    configA = CrawlerRunConfig(
        only_text=True,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "A"},
    )
    configB = CrawlerRunConfig(
        only_text=False,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "B"},
    )
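
    # Premise: context reuse is expected to be keyed on the run config, so A and
    # B should each get their own BrowserContext. The shared_data payload is
    # assumed here to be just a side channel for labeling the hook calls, not
    # part of what distinguishes the contexts.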

    # Create the crawler
    crawler = AsyncWebCrawler(config=browser_config)

    # Attach our custom hook
    # Note: "on_page_context_created" will be called each time a new context+page is generated
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)

    # Start the crawler (launches the browser)
    await crawler.start()

    # For demonstration, we'll crawl a benign site multiple times with each config
    test_url = "https://example.com"

    print("\n--- Crawling with config A (only_text=True) ---")
    for _ in range(2):
        # The shared_data label on configA tells the hook which config is in use
        await crawler.arun(test_url, config=configA)

    print("\n--- Crawling with config B (only_text=False) ---")
    for _ in range(2):
        await crawler.arun(test_url, config=configB)
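
    # If reuse works as intended, the set of IDs recorded per config should
    # collapse to a single value. Whether each list holds one entry or two
    # identical entries depends on whether the hook fires per new context only
    # or per page as well; the checks below pass in either case.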

    # Close the crawler (shuts down the browser, closes contexts)
    await crawler.close()

    # Validate and show the results
    print("\n=== RESULTS ===")
    print(f"Config A context IDs: {context_ids_for_A}")
    print(f"Config B context IDs: {context_ids_for_B}")

    if len(set(context_ids_for_A)) == 1:
        print("✅ All config A crawls used the SAME BrowserContext.")
    else:
        print("❌ Config A crawls created multiple contexts unexpectedly.")

    if len(set(context_ids_for_B)) == 1:
        print("✅ All config B crawls used the SAME BrowserContext.")
    else:
        print("❌ Config B crawls created multiple contexts unexpectedly.")

    if set(context_ids_for_A).isdisjoint(context_ids_for_B):
        print("✅ Config A context is different from Config B context.")
    else:
        print("❌ A and B ended up sharing the same context somehow!")


if __name__ == "__main__":
    asyncio.run(test_reuse_context_by_config())