crawl4ai/tests/general/tets_robot.py
unclecode 66ac07b4f3 feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging,
security analysis, performance profiling, and API discovery in web applications.
2025-04-10 16:03:48 +08:00
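
The commit message above describes the new capture options; as a minimal sketch of how they might be used (the parameter and result-field names are taken from the commit message itself, not independently verified against the library source), a call could look like:

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def capture_example():
    # Sketch based on the commit message: enable request and console capture
    # via the new CrawlerRunConfig flags (names as given in the commit).
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        capture_network_requests=True,    # record requests/responses via Playwright listeners
        capture_console_messages=True,    # record console output from the page
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # New result fields per the commit: network_requests / console_messages
        print(f"Network events captured: {len(result.network_requests or [])}")
        print(f"Console messages captured: {len(result.console_messages or [])}")

if __name__ == "__main__":
    asyncio.run(capture_example())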


import asyncio
from crawl4ai import *


async def test_real_websites():
    print("\n=== Testing Real Website Robots.txt Compliance ===\n")

    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Test cases with URLs
        test_cases = [
            # Public sites that should be allowed
            ("https://example.com", True),                 # Simple public site
            ("https://httpbin.org/get", True),             # API endpoint

            # Sites with known strict robots.txt
            ("https://www.facebook.com/robots.txt", False),  # Social media
            ("https://www.google.com/search", False),        # Search pages

            # Edge cases
            ("https://api.github.com", True),              # API service
            ("https://raw.githubusercontent.com", True),   # Content delivery

            # Non-existent/error cases
            ("https://thisisnotarealwebsite.com", True),   # Non-existent domain
            ("https://localhost:12345", True),             # Invalid port
        ]

        for url, expected in test_cases:
            print(f"\nTesting: {url}")
            try:
                config = CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    check_robots_txt=True,  # Enable robots.txt checking
                    verbose=True
                )

                result = await crawler.arun(url=url, config=config)

                allowed = result.success and not result.error_message
                print(f"Expected: {'allowed' if expected else 'denied'}")
                print(f"Actual: {'allowed' if allowed else 'denied'}")
                print(f"Status Code: {result.status_code}")

                if result.error_message:
                    print(f"Error: {result.error_message}")

                # Optional: Print robots.txt content if available
                if result.metadata and 'robots_txt' in result.metadata:
                    print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")

            except Exception as e:
                print(f"Test failed with error: {str(e)}")


async def main():
    try:
        await test_real_websites()
    except Exception as e:
        print(f"Test suite failed: {str(e)}")
        raise


if __name__ == "__main__":
    asyncio.run(main())