import asyncio
import time
from typing import Dict, Any

import httpx
import requests

from crawl4ai import (
    BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
    PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
)
from crawl4ai import LLMConfig
from crawl4ai.docker_client import Crawl4aiDockerClient


class Crawl4AiTester:
    """Helper that submits a crawl job to the Docker server over HTTP and polls until it finishes."""

    def __init__(self, base_url: str = "http://localhost:11235"):
        self.base_url = base_url

    def submit_and_wait(
        self, request_data: Dict[str, Any], timeout: int = 300
    ) -> Dict[str, Any]:
        # Submit crawl job
        response = requests.post(f"{self.base_url}/crawl", json=request_data)
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")

        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(
                    f"Task {task_id} did not complete within {timeout} seconds"
                )

            result = requests.get(f"{self.base_url}/task/{task_id}")
            status = result.json()

            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")

            if status["status"] == "completed":
                return status

            time.sleep(2)
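

# Illustrative sketch, not wired into main(): how the Crawl4AiTester helper above
# could be driven on its own. The payload shape mirrors the request_data built in
# test_direct_api() below; the URL and configs are placeholders.
def example_submit_and_wait() -> None:
    tester = Crawl4AiTester()
    request = {
        "urls": ["https://example.com"],
        "browser_config": BrowserConfig(headless=True).dump(),
        "crawler_config": CrawlerRunConfig(cache_mode=CacheMode.BYPASS).dump(),
    }
    status = tester.submit_and_wait(request, timeout=120)
    print("Polled task finished with status:", status["status"])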


async def test_direct_api():
    """Test direct API endpoints without using the client SDK"""
    print("\n=== Testing Direct API Calls ===")

    # Test 1: Basic crawl with content filtering
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1200,
        viewport_height=800
    )

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            ),
            options={"ignore_links": True}
        )
    )

    # Serialize both configs with .dump() so they can travel as JSON in the request body
    request_data = {
        "urls": ["https://example.com"],
        "browser_config": browser_config.dump(),
        "crawler_config": crawler_config.dump()
    }

    # Make direct API call
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl",
            json=request_data,
            timeout=300
        )
        assert response.status_code == 200
        result = response.json()
        print("Basic crawl result:", result["success"])

    # Test 2: Structured extraction with JSON CSS
    schema = {
        "baseSelector": "article.post",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".content", "type": "html"}
        ]
    }

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema=schema)
    )

    request_data["crawler_config"] = crawler_config.dump()

    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl",
            json=request_data
        )
        assert response.status_code == 200
        result = response.json()
        print("Structured extraction result:", result["success"])

    # Test 3: Get schema
    # async with httpx.AsyncClient() as client:
    #     response = await client.get("http://localhost:8000/schema")
    #     assert response.status_code == 200
    #     schemas = response.json()
    #     print("Retrieved schemas for:", list(schemas.keys()))


async def test_with_client():
    """Test using the Crawl4AI Docker client SDK"""
    print("\n=== Testing Client SDK ===")

    async with Crawl4aiDockerClient(verbose=True) as client:
        # Test 1: Basic crawl
        browser_config = BrowserConfig(headless=True)
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48,
                    threshold_type="fixed"
                )
            )
        )

        result = await client.crawl(
            urls=["https://example.com"],
            browser_config=browser_config,
            crawler_config=crawler_config
        )
        print("Client SDK basic crawl:", result.success)

        # Test 2: LLM extraction with streaming
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=LLMContentFilter(
                    llm_config=LLMConfig(provider="openai/gpt-4o"),
                    instruction="Extract key technical concepts"
                )
            ),
            stream=True
        )

        # With stream=True the crawl yields results as they complete
        async for result in await client.crawl(
            urls=["https://example.com"],
            browser_config=browser_config,
            crawler_config=crawler_config
        ):
            print(f"Streaming result for: {result.url}")

        # Test 3: Get schema
        # schemas = await client.get_schema()
        # print("Retrieved client schemas for:", list(schemas.keys()))


async def main():
    """Run all tests"""
    # Test direct API
    print("Testing direct API calls...")
    await test_direct_api()

    # Test client SDK
    print("\nTesting client SDK...")
    await test_with_client()


if __name__ == "__main__":
    asyncio.run(main())