feat(models): add dedicated tables field to CrawlResult

- Add tables field to CrawlResult model while maintaining backward compatibility
- Update async_webcrawler.py to extract tables from media and pass to tables field
- Update crypto_analysis_example.py to use the new tables field
- Add /config/dump examples to demo_docker_api.py
- Bump version to 0.6.1
UncleCode 2025-04-24 18:36:25 +08:00
parent ad4dfb21e1
commit ccec40ed17
7 changed files with 287 additions and 26 deletions
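
Taken together, the intended consumption pattern for the new field looks roughly like the sketch below. It assumes the usual `AsyncWebCrawler` context-manager flow; the fallback to `result.media.get("tables", [])` mirrors the backward-compatibility handling in the crypto example further down, and none of this should be read as the library's documented API.

```python
# Sketch: read the new CrawlResult.tables field, falling back to the legacy
# media["tables"] location for results produced by older crawl4ai versions.
import asyncio

from crawl4ai import AsyncWebCrawler


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")

    tables = getattr(result, "tables", None) or result.media.get("tables", [])
    for table in tables:
        # Per models.py in this commit, each entry is roughly
        # {"headers": [...], "rows": [[...], ...], "caption": ..., "summary": ...}.
        print(table.get("caption"), "-", len(table.get("rows", [])), "row(s)")


if __name__ == "__main__":
    asyncio.run(main())
```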

View File

@@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.1] - 2025-04-24
### Added
- New dedicated `tables` field in `CrawlResult` model for better table extraction handling
- Updated crypto_analysis_example.py to use the new tables field with backward compatibility
### Changed
- Improved playground UI in Docker deployment with better endpoint handling and UI feedback
## [0.6.0] - 2025-04-22
### Added

View File

@@ -1,3 +1,3 @@
# crawl4ai/_version.py
__version__ = "0.6.0"
__version__ = "0.6.1"

View File

@@ -496,11 +496,13 @@ class AsyncWebCrawler:
cleaned_html = sanitize_input_encode(
result.get("cleaned_html", ""))
media = result.get("media", {})
tables = media.pop("tables", []) if isinstance(media, dict) else []
links = result.get("links", {})
metadata = result.get("metadata", {})
else:
cleaned_html = sanitize_input_encode(result.cleaned_html)
media = result.media.model_dump()
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata
@@ -627,6 +629,7 @@ class AsyncWebCrawler:
cleaned_html=cleaned_html,
markdown=markdown_result,
media=media,
tables=tables, # NEW
links=links,
metadata=metadata,
screenshot=screenshot_data,
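
One nuance worth spelling out: `media.pop("tables", [])` moves the entry out of the media dict before the result is built, which is exactly why the crypto example later in this commit falls back with `media.get("tables", [])` rather than indexing `media["tables"]`. A self-contained sketch with made-up data:

```python
# dict.pop removes the key: after this, the tables live only in `tables`,
# and media no longer carries them. Sample data is made up.
media = {
    "images": [{"src": "https://example.com/chart.png"}],
    "tables": [{
        "headers": ["Symbol", "Price"],
        "rows": [["BTC", "64000"]],
        "caption": "Markets",
        "summary": "",
    }],
}

tables = media.pop("tables", []) if isinstance(media, dict) else []

print(tables[0]["headers"])      # ['Symbol', 'Price']
print("tables" in media)         # False
print(media.get("tables", []))   # [] -- the safe fallback for older readers
```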

View File

@@ -1,4 +1,4 @@
from pydantic import BaseModel, HttpUrl, PrivateAttr
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from typing import AsyncGenerator
from typing import Generic, TypeVar
@@ -150,6 +150,7 @@ class CrawlResult(BaseModel):
redirected_url: Optional[str] = None
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}]
class Config:
arbitrary_types_allowed = True
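
A short note on `Field(default_factory=list)`: every `CrawlResult` gets its own fresh empty list, so code can iterate `result.tables` without a `None` check. A minimal, self-contained sketch using a stand-in model (not the real `CrawlResult`), with made-up rows in the commented `{headers, rows, caption, summary}` shape:

```python
# Stand-in model illustrating the new field's behaviour: default_factory=list
# means every instance starts with its own empty list.
from typing import Dict, List

from pydantic import BaseModel, Field


class ResultSketch(BaseModel):
    tables: List[Dict] = Field(default_factory=list)


a = ResultSketch()
b = ResultSketch(tables=[{
    "headers": ["Symbol", "Price"],
    "rows": [["BTC", "64000"], ["ETH", "3100"]],
    "caption": "Example market table",
    "summary": "",
}])

a.tables.append({"headers": [], "rows": [], "caption": None, "summary": None})
print(len(a.tables), len(b.tables))  # 1 2 -- each instance owns its list
print(b.tables[0]["headers"])        # ['Symbol', 'Price']
```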

View File

@@ -193,7 +193,48 @@
<textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4"
spellcheck="false">https://example.com</textarea>
<details class="mb-4">
<!-- Specific options for /md endpoint -->
<details id="md-options" class="mb-4 hidden">
<summary class="text-sm text-secondary cursor-pointer">/md Options</summary>
<div class="mt-2 space-y-3 p-2 border border-border rounded">
<div>
<label for="md-filter" class="block text-xs text-secondary mb-1">Filter Type</label>
<select id="md-filter" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
<option value="fit">fit - Adaptive content filtering</option>
<option value="raw">raw - No filtering</option>
<option value="bm25">bm25 - BM25 keyword relevance</option>
<option value="llm">llm - LLM-based filtering</option>
</select>
</div>
<div>
<label for="md-query" class="block text-xs text-secondary mb-1">Query (for BM25/LLM filters)</label>
<input id="md-query" type="text" placeholder="Enter search terms or instructions"
class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
</div>
<div>
<label for="md-cache" class="block text-xs text-secondary mb-1">Cache Mode</label>
<select id="md-cache" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
<option value="0">Write-Only (0)</option>
<option value="1">Enabled (1)</option>
</select>
</div>
</div>
</details>
<!-- Specific options for /llm endpoint -->
<details id="llm-options" class="mb-4 hidden">
<summary class="text-sm text-secondary cursor-pointer">/llm Options</summary>
<div class="mt-2 space-y-3 p-2 border border-border rounded">
<div>
<label for="llm-question" class="block text-xs text-secondary mb-1">Question</label>
<input id="llm-question" type="text" value="What is this page about?"
class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
</div>
</div>
</details>
<!-- Advanced config for /crawl endpoints -->
<details id="adv-config" class="mb-4">
<summary class="text-sm text-secondary cursor-pointer">Advanced Config <span
class="text-xs text-primary">(Python → autoJSON)</span></summary>
@@ -437,6 +478,33 @@
cm.setValue(TEMPLATES[e.target.value]);
document.getElementById('cfg-status').textContent = '';
});
// Handle endpoint selection change to show appropriate options
document.getElementById('endpoint').addEventListener('change', function(e) {
const endpoint = e.target.value;
const mdOptions = document.getElementById('md-options');
const llmOptions = document.getElementById('llm-options');
const advConfig = document.getElementById('adv-config');
// Hide all option sections first
mdOptions.classList.add('hidden');
llmOptions.classList.add('hidden');
advConfig.classList.add('hidden');
// Show the appropriate section based on endpoint
if (endpoint === 'md') {
mdOptions.classList.remove('hidden');
// Auto-open the /md options
mdOptions.setAttribute('open', '');
} else if (endpoint === 'llm') {
llmOptions.classList.remove('hidden');
// Auto-open the /llm options
llmOptions.setAttribute('open', '');
} else {
// For /crawl endpoints, show the advanced config
advConfig.classList.remove('hidden');
}
});
async function pyConfigToJson() {
const code = cm.getValue().trim();
@@ -494,10 +562,18 @@
}
// Generate code snippets
function generateSnippets(api, payload) {
function generateSnippets(api, payload, method = 'POST') {
// Python snippet
const pyCodeEl = document.querySelector('#python-content code');
const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`;
let pySnippet;
if (method === 'GET') {
// GET request (for /llm endpoint)
pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.get(\n "${window.location.origin}${api}"\n )\n return response.json()`;
} else {
// POST request (for /crawl and /md endpoints)
pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`;
}
pyCodeEl.textContent = pySnippet;
pyCodeEl.className = 'python hljs'; // Reset classes
@@ -505,7 +581,15 @@
// cURL snippet
const curlCodeEl = document.querySelector('#curl-content code');
const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`;
let curlSnippet;
if (method === 'GET') {
// GET request (for /llm endpoint)
curlSnippet = `curl -X GET "${window.location.origin}${api}"`;
} else {
// POST request (for /crawl and /md endpoints)
curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`;
}
curlCodeEl.textContent = curlSnippet;
curlCodeEl.className = 'bash hljs'; // Reset classes
@@ -536,20 +620,39 @@
const endpointMap = {
crawl: '/crawl',
};
/*const endpointMap = {
crawl: '/crawl',
crawl_stream: '/crawl/stream',
// crawl_stream: '/crawl/stream',
md: '/md',
llm: '/llm'
};*/
};
const api = endpointMap[endpoint];
const payload = {
urls,
...advConfig
};
let payload;
// Create appropriate payload based on endpoint type
if (endpoint === 'md') {
// Get values from the /md specific inputs
const filterType = document.getElementById('md-filter').value;
const query = document.getElementById('md-query').value.trim();
const cache = document.getElementById('md-cache').value;
// MD endpoint expects: { url, f, q, c }
payload = {
url: urls[0], // Take first URL
f: filterType, // Lowercase filter type as required by server
q: query || null, // Use the query if provided, otherwise null
c: cache
};
} else if (endpoint === 'llm') {
// LLM endpoint has a different URL pattern and uses query params
// This will be handled directly in the fetch below
payload = null;
} else {
// Default payload for /crawl and /crawl/stream
payload = {
urls,
...advConfig
};
}
updateStatus('processing');
@@ -557,7 +660,18 @@
const startTime = performance.now();
let response, responseData;
if (endpoint === 'crawl_stream') {
if (endpoint === 'llm') {
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
const url = urls[0];
const encodedUrl = encodeURIComponent(url);
// Get the question from the LLM-specific input
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, {
method: 'GET',
headers: { 'Accept': 'application/json' }
});
} else if (endpoint === 'crawl_stream') {
// Stream processing
response = await fetch(api, {
method: 'POST',
@@ -597,7 +711,7 @@
document.querySelector('#response-content code').className = 'json hljs'; // Reset classes
forceHighlightElement(document.querySelector('#response-content code'));
} else {
// Regular request
// Regular request (handles /crawl and /md)
response = await fetch(api, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
@@ -625,7 +739,16 @@
}
forceHighlightElement(document.querySelector('#response-content code'));
generateSnippets(api, payload);
// For generateSnippets, handle the LLM case specially
if (endpoint === 'llm') {
const url = urls[0];
const encodedUrl = encodeURIComponent(url);
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
} else {
generateSnippets(api, payload);
}
} catch (error) {
console.error('Error:', error);
updateStatus('error');
@@ -807,9 +930,24 @@
});
});
}
// Function to initialize UI based on selected endpoint
function initUI() {
// Trigger the endpoint change handler to set initial UI state
const endpointSelect = document.getElementById('endpoint');
const event = new Event('change');
endpointSelect.dispatchEvent(event);
// Initialize copy buttons
initCopyButtons();
}
// Call this in your DOMContentLoaded or initialization
initCopyButtons();
// Initialize on page load
document.addEventListener('DOMContentLoaded', initUI);
// Also call it immediately in case the script runs after DOM is already loaded
if (document.readyState !== 'loading') {
initUI();
}
</script>
</body>
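
For reference outside the browser, the three request shapes the playground builds above can be reproduced with `httpx`; a hedged sketch (the base URL and target page are placeholders, and the `/md` keys `f`, `q`, `c` plus the `GET /llm/{encoded_url}?q=...` pattern are taken from the JavaScript above and the demo script later in this commit):

```python
# Sketch of the three request shapes the playground issues, mirroring the
# JavaScript above. Base URL and target page are placeholders.
import asyncio
import urllib.parse

import httpx

BASE_URL = "http://localhost:11235"  # placeholder Docker API address
PAGE = "https://example.com"


async def main() -> None:
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=120.0) as client:
        # /crawl: POST {"urls": [...], ...advanced config}
        crawl = await client.post("/crawl", json={"urls": [PAGE]})

        # /md: POST {"url", "f", "q", "c"} (filter type, optional query, cache mode)
        md = await client.post(
            "/md", json={"url": PAGE, "f": "fit", "q": None, "c": "0"}
        )

        # /llm: GET /llm/{encoded_url}?q=... (question passed as a query param)
        enc = urllib.parse.quote_plus(PAGE, safe="")
        llm = await client.get(f"/llm/{enc}", params={"q": "What is this page about?"})

        for name, resp in [("crawl", crawl), ("md", md), ("llm", llm)]:
            print(name, resp.status_code)


if __name__ == "__main__":
    asyncio.run(main())
```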

View File

@@ -391,12 +391,14 @@ async def main():
# Process results
raw_df = pd.DataFrame()
for result in results:
if result.success and result.media["tables"]:
# Use the new tables field, falling back to media["tables"] for backward compatibility
tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
if result.success and tables:
# Extract primary market table
# DataFrame
raw_df = pd.DataFrame(
result.media["tables"][0]["rows"],
columns=result.media["tables"][0]["headers"],
tables[0]["rows"],
columns=tables[0]["headers"],
)
break
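
For readers unfamiliar with the table payload, a tiny sketch of the shape the DataFrame construction above assumes (keys per the `models.py` comment in this commit; the values are made-up sample data):

```python
# Made-up sample matching the assumed shape [{headers, rows, caption, summary}],
# so the DataFrame construction above is easy to follow.
import pandas as pd

tables = [{
    "headers": ["Symbol", "Price", "24h Change"],
    "rows": [
        ["BTC", "64,000", "+2.1%"],
        ["ETH", "3,100", "-0.4%"],
    ],
    "caption": "Example market overview",
    "summary": "",
}]

raw_df = pd.DataFrame(tables[0]["rows"], columns=tables[0]["headers"])
print(raw_df.head())
```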

View File

@@ -4,6 +4,8 @@ import json
import os
import time
from typing import List, Dict, Any, AsyncGenerator, Optional
import textwrap # ← new: for pretty code literals
import urllib.parse # ← needed for URL-safe /llm calls
from dotenv import load_dotenv
from rich.console import Console
from rich.syntax import Syntax
@@ -969,13 +971,111 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
else:
console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 7. Markdown helper endpoint
async def demo_markdown_endpoint(client: httpx.AsyncClient):
"""
One-shot helper around /md.
Fetches PYTHON_URL with FIT filter and prints the first 500 chars of Markdown.
"""
target_url = PYTHON_URL
payload = {"url": target_url, "f": "fit", "q": None, "c": "0"}
console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue")
print_payload(payload)
try:
t0 = time.time()
resp = await client.post("/md", json=payload)
dt = time.time() - t0
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
resp.raise_for_status()
md = resp.json().get("markdown", "")
snippet = (md[:500] + "...") if len(md) > 500 else md
console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
except Exception as e:
console.print(f"[bold red]Error hitting /md:[/] {e}")
# 8. LLM QA helper endpoint
async def demo_llm_endpoint(client: httpx.AsyncClient):
"""
Quick QA round-trip with /llm.
Asks a trivial question against SIMPLE_URL just to show wiring.
"""
page_url = SIMPLE_URL
question = "What is the title of this page?"
console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")
enc = urllib.parse.quote_plus(page_url, safe="")
console.print(f"GET /llm/{enc}?q={question}")
try:
t0 = time.time()
resp = await client.get(f"/llm/{enc}", params={"q": question})
dt = time.time() - t0
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
resp.raise_for_status()
answer = resp.json().get("answer", "")
console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
except Exception as e:
console.print(f"[bold red]Error hitting /llm:[/] {e}")
# 9. /config/dump helpers --------------------------------------------------
async def demo_config_dump_valid(client: httpx.AsyncClient):
"""
Send a single top-level CrawlerRunConfig(...) expression and show the dump.
"""
code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"
payload = {"code": code_snippet}
console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
print_payload(payload)
try:
t0 = time.time()
resp = await client.post("/config/dump", json=payload)
dt = time.time() - t0
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
resp.raise_for_status()
dump_json = resp.json()
console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
except Exception as e:
console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
async def demo_config_dump_invalid(client: httpx.AsyncClient):
"""
Purposely break the single-expression rule (two top-level statements) to show the 400 parse error.
"""
bad_code = textwrap.dedent("""
BrowserConfig(headless=True); CrawlerRunConfig()
""").strip()
payload = {"code": bad_code}
console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
print_payload(payload)
try:
resp = await client.post("/config/dump", json=payload)
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
resp.raise_for_status() # should throw -> except
except httpx.HTTPStatusError as e:
console.print("[cyan]Expected parse/validation failure captured:[/]")
try:
console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
except Exception:
console.print(e.response.text)
except Exception as e:
console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
# --- Update Main Runner to include new demo ---
async def main_demo():
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
if not await check_server_health(client):
return
# --- Run Demos ---
await demo_basic_single_url(client)
await demo_basic_multi_url(client)
@@ -1001,7 +1101,15 @@ async def main_demo():
await demo_deep_with_css_extraction(client)
await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var
await demo_deep_with_proxy(client) # Skips if no PROXIES env var
await demo_deep_with_ssl(client) # Added the new demo
# --- Helper endpoints ---
await demo_markdown_endpoint(client)
await demo_llm_endpoint(client)
# --- /config/dump sanity checks ---
await demo_config_dump_valid(client)
await demo_config_dump_invalid(client)
console.rule("[bold green]Demo Complete[/]", style="green")