feat(models): add dedicated tables field to CrawlResult

- Add tables field to CrawlResult model while maintaining backward compatibility
- Update async_webcrawler.py to extract tables from media and pass to tables field
- Update crypto_analysis_example.py to use the new tables field
- Add /config/dump examples to demo_docker_api.py
- Bump version to 0.6.1
2025-04-24 18:36:25 +08:00 · 2025-04-24 18:36:25 +08:00 · ccec40ed17
commit ccec40ed17
parent ad4dfb21e1
7 changed files with 287 additions and 26 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.6.1] - 2025-04-24
+
+### Added
+- New dedicated `tables` field in `CrawlResult` model for better table extraction handling
+- Updated crypto_analysis_example.py to use the new tables field with backward compatibility
+
+### Changed
+- Improved playground UI in Docker deployment with better endpoint handling and UI feedback
+
 ## [0.6.0] ‑ 2025‑04‑22

 ### Added
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@ -1,3 +1,3 @@
 # crawl4ai/_version.py
-__version__ = "0.6.0"
+__version__ = "0.6.1"

--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@ -496,11 +496,13 @@ class AsyncWebCrawler:
            cleaned_html = sanitize_input_encode(
                result.get("cleaned_html", ""))
            media = result.get("media", {})
+            tables = media.pop("tables", []) if isinstance(media, dict) else []
            links = result.get("links", {})
            metadata = result.get("metadata", {})
        else:
            cleaned_html = sanitize_input_encode(result.cleaned_html)
            media = result.media.model_dump()
+            tables = media.pop("tables", [])
            links = result.links.model_dump()
            metadata = result.metadata

@ -627,6 +629,7 @@ class AsyncWebCrawler:
            cleaned_html=cleaned_html,
            markdown=markdown_result,
            media=media,
+            tables=tables,                       # NEW
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@ -1,4 +1,4 @@
-from pydantic import BaseModel, HttpUrl, PrivateAttr
+from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
 from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
 from typing import AsyncGenerator
 from typing import Generic, TypeVar
@ -150,6 +150,7 @@ class CrawlResult(BaseModel):
    redirected_url: Optional[str] = None
    network_requests: Optional[List[Dict[str, Any]]] = None
    console_messages: Optional[List[Dict[str, Any]]] = None
+    tables: List[Dict] = Field(default_factory=list)  # NEW – [{headers,rows,caption,summary}]

    class Config:
        arbitrary_types_allowed = True
--- a/deploy/docker/static/playground/index.html
+++ b/deploy/docker/static/playground/index.html
@ -193,7 +193,48 @@
                <textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4"
                    spellcheck="false">https://example.com</textarea>

-                <details class="mb-4">
+                <!-- Specific options for /md endpoint -->
+                <details id="md-options" class="mb-4 hidden">
+                    <summary class="text-sm text-secondary cursor-pointer">/md Options</summary>
+                    <div class="mt-2 space-y-3 p-2 border border-border rounded">
+                        <div>
+                            <label for="md-filter" class="block text-xs text-secondary mb-1">Filter Type</label>
+                            <select id="md-filter" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                                <option value="fit">fit - Adaptive content filtering</option>
+                                <option value="raw">raw - No filtering</option>
+                                <option value="bm25">bm25 - BM25 keyword relevance</option>
+                                <option value="llm">llm - LLM-based filtering</option>
+                            </select>
+                        </div>
+                        <div>
+                            <label for="md-query" class="block text-xs text-secondary mb-1">Query (for BM25/LLM filters)</label>
+                            <input id="md-query" type="text" placeholder="Enter search terms or instructions" 
+                                class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                        </div>
+                        <div>
+                            <label for="md-cache" class="block text-xs text-secondary mb-1">Cache Mode</label>
+                            <select id="md-cache" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                                <option value="0">Write-Only (0)</option>
+                                <option value="1">Enabled (1)</option>
+                            </select>
+                        </div>
+                    </div>
+                </details>
+
+                <!-- Specific options for /llm endpoint -->
+                <details id="llm-options" class="mb-4 hidden">
+                    <summary class="text-sm text-secondary cursor-pointer">/llm Options</summary>
+                    <div class="mt-2 space-y-3 p-2 border border-border rounded">
+                        <div>
+                            <label for="llm-question" class="block text-xs text-secondary mb-1">Question</label>
+                            <input id="llm-question" type="text" value="What is this page about?" 
+                                class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                        </div>
+                    </div>
+                </details>
+
+                <!-- Advanced config for /crawl endpoints -->
+                <details id="adv-config" class="mb-4">
                    <summary class="text-sm text-secondary cursor-pointer">Advanced Config <span
                        class="text-xs text-primary">(Python → auto‑JSON)</span></summary>

@ -437,6 +478,33 @@
            cm.setValue(TEMPLATES[e.target.value]);
            document.getElementById('cfg-status').textContent = '';
        });
+        
+        // Handle endpoint selection change to show appropriate options.
+        // After this handler runs, exactly one of the three option panels
+        // (#md-options, #llm-options, #adv-config) is left without the
+        // `hidden` class; which one depends on the <select id="endpoint"> value.
+        document.getElementById('endpoint').addEventListener('change', function(e) {
+            const endpoint = e.target.value;
+            const mdOptions = document.getElementById('md-options');
+            const llmOptions = document.getElementById('llm-options');
+            const advConfig = document.getElementById('adv-config');
+            
+            // Hide all option sections first
+            mdOptions.classList.add('hidden');
+            llmOptions.classList.add('hidden');
+            advConfig.classList.add('hidden');
+            
+            // Show the appropriate section based on endpoint
+            if (endpoint === 'md') {
+                mdOptions.classList.remove('hidden');
+                // Auto-open the /md options
+                // (the `open` attribute is only ever set here, never removed)
+                mdOptions.setAttribute('open', '');
+            } else if (endpoint === 'llm') {
+                llmOptions.classList.remove('hidden');
+                // Auto-open the /llm options
+                llmOptions.setAttribute('open', '');
+            } else {
+                // For /crawl endpoints, show the advanced config
+                // (any value other than 'md' or 'llm' lands in this branch)
+                advConfig.classList.remove('hidden');
+            }
+        });

        async function pyConfigToJson() {
            const code = cm.getValue().trim();
@ -494,10 +562,18 @@
        }

        // Generate code snippets
-        function generateSnippets(api, payload) {
+        function generateSnippets(api, payload, method = 'POST') {
            // Python snippet
            const pyCodeEl = document.querySelector('#python-content code');
-            const pySnippet = `import httpx\n\nasync def crawl():\n    async with httpx.AsyncClient() as client:\n        response = await client.post(\n            "${window.location.origin}${api}",\n            json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n            ')}\n        )\n        return response.json()`;
+            let pySnippet;
+            
+            if (method === 'GET') {
+                // GET request (for /llm endpoint)
+                pySnippet = `import httpx\n\nasync def crawl():\n    async with httpx.AsyncClient() as client:\n        response = await client.get(\n            "${window.location.origin}${api}"\n        )\n        return response.json()`;
+            } else {
+                // POST request (for /crawl and /md endpoints)
+                pySnippet = `import httpx\n\nasync def crawl():\n    async with httpx.AsyncClient() as client:\n        response = await client.post(\n            "${window.location.origin}${api}",\n            json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n            ')}\n        )\n        return response.json()`;
+            }

            pyCodeEl.textContent = pySnippet;
            pyCodeEl.className = 'python hljs'; // Reset classes
@ -505,7 +581,15 @@

            // cURL snippet
            const curlCodeEl = document.querySelector('#curl-content code');
-            const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n  -H "Content-Type: application/json" \\\n  -d '${JSON.stringify(payload)}'`;
+            let curlSnippet;
+            
+            if (method === 'GET') {
+                // GET request (for /llm endpoint)
+                curlSnippet = `curl -X GET "${window.location.origin}${api}"`;
+            } else {
+                // POST request (for /crawl and /md endpoints)
+                curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n  -H "Content-Type: application/json" \\\n  -d '${JSON.stringify(payload)}'`;
+            }

            curlCodeEl.textContent = curlSnippet;
            curlCodeEl.className = 'bash hljs'; // Reset classes
@ -536,20 +620,39 @@

            const endpointMap = {
                crawl: '/crawl',
-            };
-
-            /*const endpointMap = {
-                crawl: '/crawl',
-                crawl_stream: '/crawl/stream',
+                // crawl_stream: '/crawl/stream',
                md: '/md',
                llm: '/llm'
-            };*/
+            };

            const api = endpointMap[endpoint];
-            const payload = {
-                urls,
-                ...advConfig
-            };
+            let payload;
+            
+            // Create appropriate payload based on endpoint type
+            if (endpoint === 'md') {
+                // Get values from the /md specific inputs
+                const filterType = document.getElementById('md-filter').value;
+                const query = document.getElementById('md-query').value.trim();
+                const cache = document.getElementById('md-cache').value;
+                
+                // MD endpoint expects: { url, f, q, c }
+                payload = {
+                    url: urls[0], // Take first URL
+                    f: filterType, // Lowercase filter type as required by server
+                    q: query || null, // Use the query if provided, otherwise null
+                    c: cache
+                };
+            } else if (endpoint === 'llm') {
+                // LLM endpoint has a different URL pattern and uses query params
+                // This will be handled directly in the fetch below
+                payload = null;
+            } else {
+                // Default payload for /crawl and /crawl/stream
+                payload = {
+                    urls,
+                    ...advConfig
+                };
+            }

            updateStatus('processing');

@ -557,7 +660,18 @@
                const startTime = performance.now();
                let response, responseData;

-                if (endpoint === 'crawl_stream') {
+                if (endpoint === 'llm') {
+                    // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
+                    const url = urls[0];
+                    const encodedUrl = encodeURIComponent(url);
+                    // Get the question from the LLM-specific input
+                    const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
+                    
+                    response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, {
+                        method: 'GET',
+                        headers: { 'Accept': 'application/json' }
+                    });
+                } else if (endpoint === 'crawl_stream') {
                    // Stream processing
                    response = await fetch(api, {
                        method: 'POST',
@ -597,7 +711,7 @@
                    document.querySelector('#response-content code').className = 'json hljs'; // Reset classes
                    forceHighlightElement(document.querySelector('#response-content code'));
                } else {
-                    // Regular request
+                    // Regular request (handles /crawl and /md)
                    response = await fetch(api, {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
@ -625,7 +739,16 @@
                }

                forceHighlightElement(document.querySelector('#response-content code'));
-                generateSnippets(api, payload);
+                
+                // For generateSnippets, handle the LLM case specially
+                if (endpoint === 'llm') {
+                    const url = urls[0];
+                    const encodedUrl = encodeURIComponent(url);
+                    const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
+                    generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
+                } else {
+                    generateSnippets(api, payload);
+                }
            } catch (error) {
                console.error('Error:', error);
                updateStatus('error');
@ -807,9 +930,24 @@
                });
            });
        }
+        
+        // Function to initialize UI based on selected endpoint.
+        // Takes no arguments and returns nothing. It dispatches a synthetic
+        // 'change' event on #endpoint so the endpoint change listener runs
+        // against the currently selected value, then calls initCopyButtons().
+        function initUI() {
+            // Trigger the endpoint change handler to set initial UI state
+            const endpointSelect = document.getElementById('endpoint');
+            const event = new Event('change');
+            endpointSelect.dispatchEvent(event);
+            
+            // Initialize copy buttons
+            initCopyButtons();
+        }

-        // Call this in your DOMContentLoaded or initialization
-        initCopyButtons();
+        // Initialize exactly once: defer while the DOM is still parsing, else run now.
+        // (Registering the listener unconditionally AND calling initUI() whenever
+        // readyState !== 'loading' would run it twice if the script executes in the
+        // 'interactive' state, i.e. before DOMContentLoaded has fired.)
+        if (document.readyState === 'loading') {
+            document.addEventListener('DOMContentLoaded', initUI);
+        } else {
+            initUI();
+        }

    </script>
 </body>
--- a/docs/examples/crypto_analysis_example.py
+++ b/docs/examples/crypto_analysis_example.py
@ -391,12 +391,14 @@ async def main():
        # Process results
        raw_df = pd.DataFrame()
        for result in results:
-            if result.success and result.media["tables"]:
+            # Use the new tables field, falling back to media["tables"] for backward compatibility
+            tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
+            if result.success and tables:
                # Extract primary market table
                # DataFrame
                raw_df = pd.DataFrame(
-                    result.media["tables"][0]["rows"],
-                    columns=result.media["tables"][0]["headers"],
+                    tables[0]["rows"],
+                    columns=tables[0]["headers"],
                )
                break

--- a/docs/examples/docker/demo_docker_api.py
+++ b/docs/examples/docker/demo_docker_api.py
@ -4,6 +4,8 @@ import json
 import os
 import time
 from typing import List, Dict, Any, AsyncGenerator, Optional
+import textwrap          # ← new: for pretty code literals
+import urllib.parse  # ← needed for URL-safe /llm calls
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.syntax import Syntax
@ -969,13 +971,111 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
            else:
                 console.print(f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")

+# 7a. Markdown helper endpoint
+async def demo_markdown_endpoint(client: httpx.AsyncClient):
+    """
+    One-shot helper around /md.
+
+    POSTs PYTHON_URL to /md with the "fit" filter, no query and cache flag "0",
+    then prints at most the first 500 characters of the returned Markdown.
+    Any exception (including a non-2xx status raised by raise_for_status) is
+    caught and printed, so the function itself never raises from the request.
+
+    Args:
+        client: httpx.AsyncClient used to issue the request; the path "/md" is
+            relative, so the client is expected to carry a base URL.
+    """
+    target_url = PYTHON_URL
+    # Short keys: f = filter type, q = query, c = cache flag.
+    # NOTE(review): the meaning of cache flag "0" is not visible here — confirm
+    # against the server's /md handler.
+    payload = {"url": target_url, "f": "fit", "q": None, "c": "0"}
+
+    console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue")
+    print_payload(payload)
+
+    try:
+        t0 = time.time()
+        resp = await client.post("/md", json=payload)
+        dt = time.time() - t0
+        # Print the status before raising so a failing status code is still shown.
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        resp.raise_for_status()
+        # A missing "markdown" key is treated as empty output rather than an error.
+        md = resp.json().get("markdown", "")
+        # Truncate long output and mark the cut with an ellipsis.
+        snippet = (md[:500] + "...") if len(md) > 500 else md
+        console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
+    except Exception as e:
+        console.print(f"[bold red]Error hitting /md:[/] {e}")
+
+# 7b. LLM QA helper endpoint
+async def demo_llm_endpoint(client: httpx.AsyncClient):
+    """
+    Quick QA round-trip with /llm.
+
+    Asks a trivial question against SIMPLE_URL just to show wiring: issues
+    GET /llm/{percent-encoded page URL}?q={question} and prints the "answer"
+    field of the JSON response. Any exception (including a non-2xx status
+    raised by raise_for_status) is caught and printed.
+
+    Args:
+        client: httpx.AsyncClient used to issue the request; the path is
+            relative, so the client is expected to carry a base URL.
+    """
+    page_url = SIMPLE_URL
+    question = "What is the title of this page?"
+
+    console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")
+    # The page URL travels as a *path segment*, so use quote() rather than
+    # quote_plus(): "+" stands for a space only in query strings, while in a
+    # path it is a literal plus sign. safe="" also escapes "/" so the whole
+    # URL stays a single segment.
+    enc = urllib.parse.quote(page_url, safe="")
+    # Display only: the question is shown unencoded here; httpx encodes it
+    # when it is passed via params= below.
+    console.print(f"GET /llm/{enc}?q={question}")
+
+    try:
+        t0 = time.time()
+        resp = await client.get(f"/llm/{enc}", params={"q": question})
+        dt = time.time() - t0
+        # Print the status before raising so a failing status code is still shown.
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        resp.raise_for_status()
+        answer = resp.json().get("answer", "")
+        console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
+    except Exception as e:
+        console.print(f"[bold red]Error hitting /llm:[/] {e}")
+
+
+# 8. /config/dump helpers --------------------------------------------------
+
+async def demo_config_dump_valid(client: httpx.AsyncClient):
+    """
+    Send a single top-level CrawlerRunConfig(...) expression and show the dump.
+
+    POSTs {"code": "<expression>"} to /config/dump and pretty-prints the JSON
+    body of the response. Any exception (including a non-2xx status raised by
+    raise_for_status) is caught and printed.
+
+    Args:
+        client: httpx.AsyncClient used to issue the request; the path is
+            relative, so the client is expected to carry a base URL.
+    """
+    # The config is sent as Python source text, not as a JSON object.
+    code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"
+    payload = {"code": code_snippet}
+
+    console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
+    print_payload(payload)
+
+    try:
+        t0 = time.time()
+        resp = await client.post("/config/dump", json=payload)
+        dt = time.time() - t0
+        # Print the status before raising so a failing status code is still shown.
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        resp.raise_for_status()
+        dump_json = resp.json()
+        console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
+    except Exception as e:
+        console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
+
+
+async def demo_config_dump_invalid(client: httpx.AsyncClient):
+    """
+    Purposely break the single-expression rule to show the error response.
+
+    Sends two top-level calls separated by ";" (where demo_config_dump_valid
+    sends exactly one expression) to /config/dump. The request is expected to
+    fail with an HTTP error status (presumably 400 — confirm against the
+    server), and the error payload is printed.
+
+    Args:
+        client: httpx.AsyncClient used to issue the request; the path is
+            relative, so the client is expected to carry a base URL.
+    """
+    # dedent + strip leave just the one-line snippet without the literal's
+    # surrounding newlines and indentation.
+    bad_code = textwrap.dedent("""
+        BrowserConfig(headless=True); CrawlerRunConfig()
+    """).strip()
+    payload = {"code": bad_code}
+
+    console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
+    print_payload(payload)
+
+    try:
+        resp = await client.post("/config/dump", json=payload)
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
+        # If the server unexpectedly accepts the snippet, nothing is raised and
+        # only the status line above is printed.
+        resp.raise_for_status()   # should throw -> except
+    except httpx.HTTPStatusError as e:
+        console.print("[cyan]Expected parse/validation failure captured:[/]")
+        try:
+            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
+        except Exception:
+            # Error body was not valid JSON (or could not be rendered): show it raw.
+            console.print(e.response.text)
+    except Exception as e:
+        console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
+

 # --- Update Main Runner to include new demo ---
 async def main_demo():
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
        if not await check_server_health(client):
            return
-
+        
        # --- Run Demos ---
        await demo_basic_single_url(client)
        await demo_basic_multi_url(client)
@ -1001,7 +1101,15 @@ async def main_demo():
        await demo_deep_with_css_extraction(client)
        await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var
        await demo_deep_with_proxy(client) # Skips if no PROXIES env var
-        await demo_deep_with_ssl(client) # Added the new demo
+        await demo_deep_with_ssl(client)   # Added the new demo
+
+        # --- Helper endpoints ---
+        await demo_markdown_endpoint(client)
+        await demo_llm_endpoint(client)
+
+        # --- /config/dump sanity checks ---
+        await demo_config_dump_valid(client)
+        await demo_config_dump_invalid(client)

        console.rule("[bold green]Demo Complete[/]", style="green")