
Implement comprehensive network request and console message capturing functionality:

- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
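As a quick illustration, here is a minimal sketch of how these capture options might be used. It assumes the capture_network_requests/capture_console_messages parameters and the network_requests/console_messages result fields named in the commit message above; verify the exact names against your installed crawl4ai version.

import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode


async def capture_page_activity():
    # Enable the two capture flags described by this commit
    # (names taken from the commit message; assumed API)
    config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun("https://example.com", config=config)

        # The commit adds these fields to the result model; they hold
        # the captured request/response events and console output
        for event in result.network_requests or []:
            print("network:", event)
        for message in result.console_messages or []:
            print("console:", message)


if __name__ == "__main__":
    asyncio.run(capture_page_activity())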
Python · 86 lines · 3.2 KiB
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from playwright.async_api import Page, BrowserContext


async def test_reuse_context_by_config():
    # We will store each context ID in these lists to confirm reuse
    context_ids_for_A = []
    context_ids_for_B = []

    # Create a small hook to track context creation
    async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
        c_id = id(context)
        print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
        # Distinguish which config we used by checking the label stored in its shared_data
        config_label = config.shared_data.get("config_label", "unknown")
        if config_label == "A":
            context_ids_for_A.append(c_id)
        elif config_label == "B":
            context_ids_for_B.append(c_id)
        return page

    # Browser config - headless, verbose so we see logs
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Two crawler run configs that differ (here, only_text):
    configA = CrawlerRunConfig(
        only_text=True,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "A"},
    )
    configB = CrawlerRunConfig(
        only_text=False,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "B"},
    )

    # Create the crawler
    crawler = AsyncWebCrawler(config=browser_config)

    # Attach our custom hook
    # Note: "on_page_context_created" will be called each time a new context+page is generated
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)

    # Start the crawler (launches the browser)
    await crawler.start()

    # For demonstration, we'll crawl a benign site multiple times with each config
    test_url = "https://example.com"

    print("\n--- Crawling with config A (only_text=True) ---")
    for _ in range(2):
        # The shared_data label tells the hook which config is being used
        await crawler.arun(test_url, config=configA)

    print("\n--- Crawling with config B (only_text=False) ---")
    for _ in range(2):
        await crawler.arun(test_url, config=configB)

    # Close the crawler (shuts down the browser, closes contexts)
    await crawler.close()

    # Validate and show the results
    print("\n=== RESULTS ===")
    print(f"Config A context IDs: {context_ids_for_A}")
    print(f"Config B context IDs: {context_ids_for_B}")

    if len(set(context_ids_for_A)) == 1:
        print("✅ All config A crawls used the SAME BrowserContext.")
    else:
        print("❌ Config A crawls created multiple contexts unexpectedly.")

    if len(set(context_ids_for_B)) == 1:
        print("✅ All config B crawls used the SAME BrowserContext.")
    else:
        print("❌ Config B crawls created multiple contexts unexpectedly.")

    if set(context_ids_for_A).isdisjoint(context_ids_for_B):
        print("✅ Config A context is different from Config B context.")
    else:
        print("❌ A and B ended up sharing the same context somehow!")


if __name__ == "__main__":
    asyncio.run(test_reuse_context_by_config())