# crawl4ai/docs/examples/docker_python_rest_api.py
import asyncio
import json
from typing import Optional
from urllib.parse import quote

import aiohttp
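
# This script exercises the REST API of a locally running Crawl4AI server.
# It assumes the server (for example, the project's Docker container) is
# already listening on http://localhost:8000; adjust the URLs if yours differs.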


async def get_token(session, email: str = "test@example.com") -> str:
    """Fetch a JWT token from the /token endpoint."""
    url = "http://localhost:8000/token"
    payload = {"email": email}
    print(f"\nFetching token from {url} with email: {email}")
    try:
        async with session.post(url, json=payload) as response:
            status = response.status
            data = await response.json()
            print(f"Token Response Status: {status}")
            print(f"Token Response: {json.dumps(data, indent=2)}")
            if status == 200:
                return data["access_token"]
            raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
    except Exception as e:
        print(f"Error fetching token: {str(e)}")
        raise
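
# Illustrative usage when JWT auth is enabled on the server:
#     async with aiohttp.ClientSession() as session:
#         token = await get_token(session, email="you@example.com")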


async def test_endpoint(
    session,
    endpoint: str,
    url: str,
    token: str,
    params: Optional[dict] = None,
    expected_status: int = 200,
) -> Optional[dict]:
    """Test an endpoint with token and print results."""
    params = params or {}
    param_str = "&".join(f"{k}={v}" for k, v in params.items())
    full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
    if param_str:
        full_url += f"?{param_str}"
    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting: {full_url}")
    try:
        async with session.get(full_url, headers=headers) as response:
            status = response.status
            try:
                data = await response.json()
            except (aiohttp.ContentTypeError, json.JSONDecodeError):
                # Fall back to plain text when the body is not valid JSON
                data = await response.text()
            print(f"Status: {status} (Expected: {expected_status})")
            if isinstance(data, dict):
                print(f"Response: {json.dumps(data, indent=2)}")
            else:
                print(f"Response: {data[:500]}...")  # First 500 chars
            assert status == expected_status, f"Expected {expected_status}, got {status}"
            return data
    except Exception as e:
        print(f"Error: {str(e)}")
        return None
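
# Illustrative usage, fetching fit-filtered markdown for a page:
#     data = await test_endpoint(session, "md", "example.com", token, {"f": "fit"})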


async def test_stream_crawl(session, token: str):
    """Test the /crawl/stream endpoint with multiple URLs."""
    url = "http://localhost:8000/crawl/stream"
    payload = {
        "urls": [
            "https://example.com",
            # Variations on example.com to simulate multiple pages
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3",
            # "https://www.python.org",
            # "https://news.ycombinator.com/news",
        ],
        "browser_config": {"headless": True, "viewport": {"width": 1200}},
        "crawler_config": {"stream": True, "cache_mode": "aggressive"},
    }
    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting Streaming Crawl: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")
    try:
        async with session.post(url, json=payload, headers=headers) as response:
            status = response.status
            print(f"Status: {status} (Expected: 200)")
            assert status == 200, f"Expected 200, got {status}"
            # Read the streaming response line by line (NDJSON); aiohttp's
            # StreamReader yields complete lines under `async for`.
            async for line in response.content:
                if line:
                    data = json.loads(line.decode("utf-8").strip())
                    print(f"Streamed Result: {json.dumps(data, indent=2)}")
    except Exception as e:
        print(f"Error in streaming crawl test: {str(e)}")


async def run_tests():
    print("Starting API Tests...")
    # Test URLs
    urls = [
        "example.com",
        "https://www.python.org",
        "https://news.ycombinator.com/news",
        "https://github.com/trending",
    ]
    async with aiohttp.ClientSession() as session:
        token = "test_token"
        # If JWT is enabled on the server, authenticate first:
        # fetch the token once and reuse it for every request.
        # token = await get_token(session)
        # if not token:
        #     print("Aborting tests due to token failure!")
        #     return

        print("\n=== Testing Crawl Endpoint ===")
        crawl_payload = {
            "urls": ["https://example.com"],
            "browser_config": {"headless": True},
            "crawler_config": {"stream": False},
        }
        async with session.post(
            "http://localhost:8000/crawl",
            json=crawl_payload,
            headers={"Authorization": f"Bearer {token}"},
        ) as response:
            status = response.status
            data = await response.json()
            print(f"\nCrawl Endpoint Status: {status}")
            print(f"Crawl Response: {json.dumps(data, indent=2)}")

        print("\n=== Testing Crawl Stream Endpoint ===")
        await test_stream_crawl(session, token)

        print("\n=== Testing Markdown Endpoint ===")
        for url in []:  # Change `[]` to `urls` to enable the markdown tests
            for filter_type in ["raw", "fit", "bm25", "llm"]:
                params = {"f": filter_type}
                if filter_type in ["bm25", "llm"]:
                    params["q"] = "extract main content"
                for cache in ["0", "1"]:
                    params["c"] = cache
                    await test_endpoint(session, "md", url, token, params)
                    await asyncio.sleep(1)  # Be nice to the server

        print("\n=== Testing LLM Endpoint ===")
        for url in urls:
            # Test basic extraction (direct response now)
            await test_endpoint(
                session,
                "llm",
                url,
                token,
                {"q": "Extract title and main content"},
            )
            # Test with schema (direct response)
            schema = {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "content": {"type": "string"},
                    "links": {"type": "array", "items": {"type": "string"}},
                },
            }
            await test_endpoint(
                session,
                "llm",
                url,
                token,
                {
                    "q": "Extract content with links",
                    "s": json.dumps(schema),
                    "c": "1",  # Test with cache
                },
            )
            await asyncio.sleep(2)  # Be nice to the server

        print("\n=== Testing Error Cases ===")
        # Test an invalid URL
        await test_endpoint(
            session,
            "md",
            "not_a_real_url",
            token,
            expected_status=500,
        )
        # Test an invalid filter type
        await test_endpoint(
            session,
            "md",
            "example.com",
            token,
            {"f": "invalid"},
            expected_status=422,
        )
        # Test LLM without a query (should fail per the server's validation)
        await test_endpoint(
            session,
            "llm",
            "example.com",
            token,
            expected_status=400,
        )

    print("\nAll tests completed!")


if __name__ == "__main__":
    asyncio.run(run_tests())