86 lines
3.2 KiB
Python
86 lines
3.2 KiB
Python
![]() |
import asyncio
|
|||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|||
|
from playwright.async_api import Page, BrowserContext
|
|||
|
|
|||
|
async def test_reuse_context_by_config():
    """Verify that BrowserContext instances are reused per CrawlerRunConfig.

    Crawls the same URL twice with each of two differing configs (A and B)
    and uses the ``on_page_context_created`` hook to record the ``id()`` of
    the context each crawl ran in.  Expected outcome: all config-A crawls
    share one context, all config-B crawls share another, and the two
    contexts are distinct.
    """
    # We will store each context ID in these lists to confirm reuse.
    context_ids_for_A = []
    context_ids_for_B = []

    # Small hook to track context creation.  Which config triggered the
    # hook is carried in config.shared_data["config_label"] (no extra
    # kwargs are passed to the hook).
    async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
        c_id = id(context)
        print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
        # Distinguish which config we used by checking the shared_data label.
        config_label = config.shared_data.get("config_label", "unknown")
        if config_label == "A":
            context_ids_for_A.append(c_id)
        elif config_label == "B":
            context_ids_for_B.append(c_id)
        return page

    # Browser config - Headless, verbose so we see logs.
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Two crawler run configs that differ (here: only_text):
    configA = CrawlerRunConfig(
        only_text=True,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "A"},
    )
    configB = CrawlerRunConfig(
        only_text=False,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "B"},
    )

    # Create the crawler.
    crawler = AsyncWebCrawler(config=browser_config)

    # Attach our custom hook.
    # Note: "on_page_context_created" will be called each time a new
    # context+page pair is generated.
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)

    # Start the crawler (launches the browser).
    await crawler.start()

    try:
        # For demonstration, we'll crawl a benign site multiple times with
        # each config.
        test_url = "https://example.com"

        print("\n--- Crawling with config A (only_text=True) ---")
        for _ in range(2):
            await crawler.arun(test_url, config=configA)

        print("\n--- Crawling with config B (only_text=False) ---")
        for _ in range(2):
            await crawler.arun(test_url, config=configB)
    finally:
        # Close the crawler (shuts down the browser, closes contexts) even
        # when a crawl raised — otherwise the browser process would leak.
        await crawler.close()

    # Validate and show the results.
    print("\n=== RESULTS ===")
    print(f"Config A context IDs: {context_ids_for_A}")
    print(f"Config B context IDs: {context_ids_for_B}")

    if len(set(context_ids_for_A)) == 1:
        print("✅ All config A crawls used the SAME BrowserContext.")
    else:
        print("❌ Config A crawls created multiple contexts unexpectedly.")

    if len(set(context_ids_for_B)) == 1:
        print("✅ All config B crawls used the SAME BrowserContext.")
    else:
        print("❌ Config B crawls created multiple contexts unexpectedly.")

    if set(context_ids_for_A).isdisjoint(context_ids_for_B):
        print("✅ Config A context is different from Config B context.")
    else:
        print("❌ A and B ended up sharing the same context somehow!")
|
# Script entry point: run the context-reuse demo on a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(test_reuse_context_by_config())