crawl4ai/tests/general/tets_robot.py

import asyncio

from crawl4ai import *


async def test_real_websites():
    print("\n=== Testing Real Website Robots.txt Compliance ===\n")

    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Test cases: (URL, whether robots.txt is expected to allow crawling)
        test_cases = [
            # Public sites that should be allowed
            ("https://example.com", True),      # Simple public site
            ("https://httpbin.org/get", True),  # API endpoint

            # Sites with known strict robots.txt
            ("https://www.facebook.com/robots.txt", False),  # Social media
            ("https://www.google.com/search", False),        # Search pages

            # Edge cases
            ("https://api.github.com", True),             # API service
            ("https://raw.githubusercontent.com", True),  # Content delivery

            # Non-existent/error cases
            ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
            ("https://localhost:12345", True),            # Invalid port
        ]
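
        # Note: for the unreachable hosts above, expected=True means the
        # robots.txt check itself should not block the request; the crawl may
        # still fail for other reasons, so the printed "Actual" value can
        # legitimately differ. This script compares expected vs. actual for
        # manual inspection rather than asserting.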
        # Identical run config for every URL; hoisted out of the loop since
        # it never changes between iterations
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            check_robots_txt=True,  # Enable robots.txt checking
            verbose=True
        )

        for url, expected in test_cases:
            print(f"\nTesting: {url}")
            try:
                result = await crawler.arun(url=url, config=config)
                # Treat the crawl as allowed only if it succeeded cleanly
                allowed = result.success and not result.error_message

                print(f"Expected: {'allowed' if expected else 'denied'}")
                print(f"Actual: {'allowed' if allowed else 'denied'}")
                print(f"Status Code: {result.status_code}")

                if result.error_message:
                    print(f"Error: {result.error_message}")

                # Optional: print robots.txt content if available
                if result.metadata and 'robots_txt' in result.metadata:
                    print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")

            except Exception as e:
                print(f"Test failed with error: {str(e)}")

async def main():
    try:
        await test_real_websites()
    except Exception as e:
        print(f"Test suite failed: {str(e)}")
        raise


if __name__ == "__main__":
    asyncio.run(main())
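
# A minimal pytest wrapper for this script might look like the sketch below.
# This is hypothetical and not part of the original file; it assumes the
# pytest-asyncio plugin is installed.
#
#   import pytest
#
#   @pytest.mark.asyncio
#   async def test_robots_compliance():
#       await test_real_websites()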