# File: async_webcrawler_multiple_urls_example.py
import os
import sys

# go up two parent directories and add that path to sys.path so crawl4ai can be imported
parent_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
sys.path.append(parent_dir)

import asyncio
from crawl4ai import AsyncWebCrawler


async def main():
    # Initialize the AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
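        # Entering the context manager starts the underlying browser session;
        # it is torn down automatically on exit, so no manual cleanup is needed.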
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
            "https://news.ycombinator.com",
        ]

        # Set up crawling parameters
        word_count_threshold = 100
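        # word_count_threshold is the minimum number of words a text block
        # needs in order to be kept during extraction; shorter blocks are
        # filtered out (see the crawl4ai docs for the exact semantics).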

        # Run the crawling process for multiple URLs
        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True,
        )
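        # arun_many() returns one result per URL; on success each result
        # carries metadata, markdown, links, and media, and on failure an
        # error_message, as inspected below.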

        # Process the results
        for result in results:
            if result.success:
                print(f"Successfully crawled: {result.url}")
                print(f"Title: {result.metadata.get('title', 'N/A')}")
                print(f"Word count: {len(result.markdown.split())}")
                print(
                    f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}"
                )
                print(f"Number of images: {len(result.media.get('images', []))}")
                print("---")
            else:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")
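

# For comparison: the same crawl expressed with arun() and an asyncio.Semaphore
# to cap how many pages are fetched at once. This is a minimal sketch, not part
# of the original example, and it assumes arun() accepts the same bypass_cache
# keyword that arun_many() takes above.
async def crawl_with_limit(urls, limit=3):
    semaphore = asyncio.Semaphore(limit)
    async with AsyncWebCrawler(verbose=True) as crawler:

        async def crawl_one(url):
            async with semaphore:  # at most `limit` crawls in flight
                return await crawler.arun(url=url, bypass_cache=True)

        # gather preserves input order in the returned list of results
        return await asyncio.gather(*(crawl_one(u) for u in urls))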


if __name__ == "__main__":
    asyncio.run(main())