# crawl4ai/tests/general/test_robot_parser.py
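"""Smoke tests for crawl4ai's RobotsParser.

Covers initialization, cache hits and TTL expiry, edge-case URLs, and
rule handling against a local aiohttp server that serves normal,
malformed, slow, empty, and oversized robots.txt files.
"""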


from crawl4ai.utils import RobotsParser
import asyncio
import aiohttp
from aiohttp import web
import tempfile
import shutil
import os, sys, time, json


async def test_robots_parser():
    """Exercise RobotsParser initialization, caching, edge-case URLs,
    robots.txt rule handling, and cache manipulation."""
    print("\n=== Testing RobotsParser ===\n")

    # Set up a temporary directory for the parser's cache database
    temp_dir = tempfile.mkdtemp()
    try:
        # 1. Basic setup test
        print("1. Testing basic initialization...")
        parser = RobotsParser(cache_dir=temp_dir)
        assert os.path.exists(parser.db_path), "Database file not created"
        print("✓ Basic initialization passed")

        # 2. Test common cases
        print("\n2. Testing common cases...")
        allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        print(f"✓ Regular website fetch: {'allowed' if allowed else 'denied'}")

        # Test caching: the second lookup should be served from the local
        # cache rather than the network, so it must be near-instant
        print("Testing cache...")
        start = time.time()
        await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        duration = time.time() - start
        print(f"✓ Cached lookup took: {duration*1000:.2f}ms")
        assert duration < 0.03, "Cache lookup too slow"

        # 3. Edge cases
        print("\n3. Testing edge cases...")

        # Empty URL
        result = await parser.can_fetch("", "MyBot/1.0")
        print(f"✓ Empty URL handled: {'allowed' if result else 'denied'}")

        # Invalid URL
        result = await parser.can_fetch("not_a_url", "MyBot/1.0")
        print(f"✓ Invalid URL handled: {'allowed' if result else 'denied'}")

        # URL without scheme
        result = await parser.can_fetch("example.com/page", "MyBot/1.0")
        print(f"✓ URL without scheme handled: {'allowed' if result else 'denied'}")

        # 4. Test with a local aiohttp server
        async def start_test_server():
            app = web.Application()

            async def robots_txt(request):
                return web.Response(text="""User-agent: *
Disallow: /private/
Allow: /public/
""")

            async def malformed_robots(request):
                return web.Response(text="<<<malformed>>>")

            async def timeout_robots(request):
                # Sleep longer than the parser's fetch timeout
                await asyncio.sleep(5)
                return web.Response(text="Should timeout")

            async def empty_robots(request):
                return web.Response(text="")

            async def giant_robots(request):
                # ~260 KB of repeated rules
                return web.Response(text="User-agent: *\nDisallow: /\n" * 10000)

            # Mount all handlers at root level
            app.router.add_get('/robots.txt', robots_txt)
            app.router.add_get('/malformed/robots.txt', malformed_robots)
            app.router.add_get('/timeout/robots.txt', timeout_robots)
            app.router.add_get('/empty/robots.txt', empty_robots)
            app.router.add_get('/giant/robots.txt', giant_robots)

            runner = web.AppRunner(app)
            await runner.setup()
            site = web.TCPSite(runner, 'localhost', 8080)
            await site.start()
            return runner

        runner = await start_test_server()
        try:
            print("\n4. Testing robots.txt rules...")
            base_url = "http://localhost:8080"

            # Test public access
            result = await parser.can_fetch(f"{base_url}/public/page", "bot")
            print(f"Public access (/public/page): {'allowed' if result else 'denied'}")
            assert result, "Public path should be allowed"

            # Test private access
            result = await parser.can_fetch(f"{base_url}/private/secret", "bot")
            print(f"Private access (/private/secret): {'allowed' if result else 'denied'}")
            assert not result, "Private path should be denied"

            # Test malformed robots.txt
            result = await parser.can_fetch(f"{base_url}/malformed/page", "bot")
            print(f"✓ Malformed robots.txt handled: {'allowed' if result else 'denied'}")

            # Test timeout: the handler sleeps for 5s, so the parser's own
            # fetch timeout must kick in well before then
            start = time.time()
            result = await parser.can_fetch(f"{base_url}/timeout/page", "bot")
            duration = time.time() - start
            print(f"✓ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
            assert duration < 3, "Timeout not working"

            # Test empty robots.txt
            result = await parser.can_fetch(f"{base_url}/empty/page", "bot")
            print(f"✓ Empty robots.txt handled: {'allowed' if result else 'denied'}")

            # Test giant robots.txt
            start = time.time()
            result = await parser.can_fetch(f"{base_url}/giant/page", "bot")
            duration = time.time() - start
            print(f"✓ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
        finally:
            await runner.cleanup()

        # 5. Cache manipulation
        print("\n5. Testing cache manipulation...")

        # Clear expired entries
        parser.clear_expired()
        print("✓ Clear expired entries completed")

        # Clear the whole cache
        parser.clear_cache()
        print("✓ Clear all cache completed")

        # Test with a custom TTL
        custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1)  # 1 second TTL
        await custom_parser.can_fetch("https://www.example.com", "bot")
        print("✓ Custom TTL fetch completed")

        # Wait for the 1s TTL to lapse, then confirm the entry is refetched
        await asyncio.sleep(1.1)
        start = time.time()
        await custom_parser.can_fetch("https://www.example.com", "bot")
        print(f"✓ TTL expiry working (refetched after {time.time() - start:.2f}s)")
    finally:
        # Cleanup
        shutil.rmtree(temp_dir)
        print("\nTest cleanup completed")


async def main():
    try:
        await test_robots_parser()
    except Exception as e:
        print(f"Test failed: {str(e)}")
        raise
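
# The test doubles as a standalone script: running this file directly
# executes the full suite via asyncio.run().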
if __name__ == "__main__":
    asyncio.run(main())