
"""
|
|
This example demonstrates optimal browser usage patterns in Crawl4AI:
|
|
1. Sequential crawling with session reuse
|
|
2. Parallel crawling with browser instance reuse
|
|
3. Performance optimization settings
|
|
"""
|
|
|
|
import asyncio
|
|
from typing import List
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
|
|
|
|
async def crawl_sequential(urls: List[str]):
|
|
"""
|
|
Sequential crawling using session reuse - most efficient for moderate workloads
|
|
"""
|
|
print("\n=== Sequential Crawling with Session Reuse ===")
|
|
|
|
# Configure browser with optimized settings
|
|
browser_config = BrowserConfig(
|
|
headless=True,
|
|
browser_args=[
|
|
"--disable-gpu", # Disable GPU acceleration
|
|
"--disable-dev-shm-usage", # Disable /dev/shm usage
|
|
"--no-sandbox", # Required for Docker
|
|
],
|
|
viewport={
|
|
"width": 800,
|
|
"height": 600,
|
|
}, # Smaller viewport for better performance
|
|
)
|
|
|
|
# Configure crawl settings
|
|
crawl_config = CrawlerRunConfig(
|
|
markdown_generator=DefaultMarkdownGenerator(
|
|
# content_filter=PruningContentFilter(), In case you need fit_markdown
|
|
),
|
|
)
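    # If you enable PruningContentFilter above, the filtered output should be
    # exposed on the markdown object as result.markdown.fit_markdown (and
    # result.markdown.fit_html); the older result.fit_markdown / result.fit_html
    # accessors are deprecated. The filter import path is assumed to be
    # crawl4ai.content_filter_strategy -- check your installed version.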
    # Create single crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        session_id = "session1"  # Use same session for all URLs
        for url in urls:
            result = await crawler.arun(
                url=url,
                config=crawl_config,
                session_id=session_id,  # Reuse same browser tab
            )
            if result.success:
                print(f"Successfully crawled {url}")
                print(f"Content length: {len(result.markdown.raw_markdown)}")
    finally:
        await crawler.close()
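

# Note (hedged): AsyncWebCrawler can also be used as an async context manager
# ("async with AsyncWebCrawler(config=browser_config) as crawler: ..."), which
# handles start-up and shutdown automatically. The explicit start()/close()
# calls in this file are kept to make the crawler's lifetime easy to see.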


async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
    """
    Parallel crawling while reusing browser instance - best for large workloads
    """
    print("\n=== Parallel Crawling with Browser Reuse ===")

    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        viewport={"width": 800, "height": 600},
    )

    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(),  # In case you need fit_markdown
        ),
    )

    # Create single crawler instance for all parallel tasks
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        # Create tasks in batches to control concurrency
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []

            for j, url in enumerate(batch):
                # Different session per concurrent task
                session_id = f"parallel_session_{j}"
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)

            # Wait for batch to complete
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results
            for url, result in zip(batch, results):
                if isinstance(result, Exception):
                    print(f"Error crawling {url}: {str(result)}")
                elif result.success:
                    print(f"Successfully crawled {url}")
                    print(f"Content length: {len(result.markdown.raw_markdown)}")
    finally:
        await crawler.close()
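

# --- Alternative concurrency sketch -------------------------------------------
# The batch loop in crawl_parallel() waits for the slowest URL in each batch
# before starting the next batch. The function below is a hedged alternative,
# not part of the original example: a small worker pool in which each worker
# owns one session (browser tab) and pulls URLs from a shared queue, so all
# concurrency slots stay busy. It reuses the same AsyncWebCrawler calls as the
# code above; the function and session names are illustrative only.
async def crawl_parallel_worker_pool(urls: List[str], max_concurrent: int = 3):
    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        viewport={"width": 800, "height": 600},
    )
    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(),
    )

    # Pre-fill a queue with all URLs; the workers drain it concurrently.
    queue = asyncio.Queue()
    for url in urls:
        queue.put_nowait(url)

    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    async def worker(worker_id: int) -> None:
        # Each worker keeps one session (tab) and reuses it for every URL it pulls.
        session_id = f"worker_session_{worker_id}"
        while True:
            try:
                url = queue.get_nowait()
            except asyncio.QueueEmpty:
                return  # Queue drained, worker exits
            try:
                result = await crawler.arun(
                    url=url, config=crawl_config, session_id=session_id
                )
            except Exception as exc:  # Keep the worker alive on per-URL failures
                print(f"[worker {worker_id}] error crawling {url}: {exc}")
                continue
            if result.success:
                print(
                    f"[worker {worker_id}] crawled {url} "
                    f"({len(result.markdown.raw_markdown)} chars)"
                )
            else:
                print(f"[worker {worker_id}] failed to crawl {url}")

    try:
        await asyncio.gather(*(worker(i) for i in range(max_concurrent)))
    finally:
        await crawler.close()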


async def main():
    # Example URLs
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
        "https://example.com/page4",
    ]

    # Demo sequential crawling
    await crawl_sequential(urls)

    # Demo parallel crawling
    await crawl_parallel(urls, max_concurrent=2)


if __name__ == "__main__":
    asyncio.run(main())