feat(models): add dedicated tables field to CrawlResult
- Add tables field to CrawlResult model while maintaining backward compatibility - Update async_webcrawler.py to extract tables from media and pass to tables field - Update crypto_analysis_example.py to use the new tables field - Add /config/dump examples to demo_docker_api.py - Bump version to 0.6.1
This commit is contained in:
parent
ad4dfb21e1
commit
ccec40ed17
@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.6.1] - 2025-04-24
|
||||
|
||||
### Added
|
||||
- New dedicated `tables` field in `CrawlResult` model for better table extraction handling
|
||||
- Updated crypto_analysis_example.py to use the new tables field with backward compatibility
|
||||
|
||||
### Changed
|
||||
- Improved playground UI in Docker deployment with better endpoint handling and UI feedback
|
||||
|
||||
## [0.6.0] ‑ 2025‑04‑22
|
||||
|
||||
### Added
|
||||
|
@ -1,3 +1,3 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.6.0"
|
||||
__version__ = "0.6.1"
|
||||
|
||||
|
@ -496,11 +496,13 @@ class AsyncWebCrawler:
|
||||
cleaned_html = sanitize_input_encode(
|
||||
result.get("cleaned_html", ""))
|
||||
media = result.get("media", {})
|
||||
tables = media.pop("tables", []) if isinstance(media, dict) else []
|
||||
links = result.get("links", {})
|
||||
metadata = result.get("metadata", {})
|
||||
else:
|
||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||
media = result.media.model_dump()
|
||||
tables = media.pop("tables", [])
|
||||
links = result.links.model_dump()
|
||||
metadata = result.metadata
|
||||
|
||||
@ -627,6 +629,7 @@ class AsyncWebCrawler:
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown_result,
|
||||
media=media,
|
||||
tables=tables, # NEW
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot_data,
|
||||
|
@ -1,4 +1,4 @@
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from typing import AsyncGenerator
|
||||
from typing import Generic, TypeVar
|
||||
@ -150,6 +150,7 @@ class CrawlResult(BaseModel):
|
||||
redirected_url: Optional[str] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}]
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
@ -193,7 +193,48 @@
|
||||
<textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4"
|
||||
spellcheck="false">https://example.com</textarea>
|
||||
|
||||
<details class="mb-4">
|
||||
<!-- Specific options for /md endpoint -->
|
||||
<details id="md-options" class="mb-4 hidden">
|
||||
<summary class="text-sm text-secondary cursor-pointer">/md Options</summary>
|
||||
<div class="mt-2 space-y-3 p-2 border border-border rounded">
|
||||
<div>
|
||||
<label for="md-filter" class="block text-xs text-secondary mb-1">Filter Type</label>
|
||||
<select id="md-filter" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
|
||||
<option value="fit">fit - Adaptive content filtering</option>
|
||||
<option value="raw">raw - No filtering</option>
|
||||
<option value="bm25">bm25 - BM25 keyword relevance</option>
|
||||
<option value="llm">llm - LLM-based filtering</option>
|
||||
</select>
|
||||
</div>
|
||||
<div>
|
||||
<label for="md-query" class="block text-xs text-secondary mb-1">Query (for BM25/LLM filters)</label>
|
||||
<input id="md-query" type="text" placeholder="Enter search terms or instructions"
|
||||
class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
|
||||
</div>
|
||||
<div>
|
||||
<label for="md-cache" class="block text-xs text-secondary mb-1">Cache Mode</label>
|
||||
<select id="md-cache" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
|
||||
<option value="0">Write-Only (0)</option>
|
||||
<option value="1">Enabled (1)</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Specific options for /llm endpoint -->
|
||||
<details id="llm-options" class="mb-4 hidden">
|
||||
<summary class="text-sm text-secondary cursor-pointer">/llm Options</summary>
|
||||
<div class="mt-2 space-y-3 p-2 border border-border rounded">
|
||||
<div>
|
||||
<label for="llm-question" class="block text-xs text-secondary mb-1">Question</label>
|
||||
<input id="llm-question" type="text" value="What is this page about?"
|
||||
class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Advanced config for /crawl endpoints -->
|
||||
<details id="adv-config" class="mb-4">
|
||||
<summary class="text-sm text-secondary cursor-pointer">Advanced Config <span
|
||||
class="text-xs text-primary">(Python → auto‑JSON)</span></summary>
|
||||
|
||||
@ -437,6 +478,33 @@
|
||||
cm.setValue(TEMPLATES[e.target.value]);
|
||||
document.getElementById('cfg-status').textContent = '';
|
||||
});
|
||||
|
||||
// Handle endpoint selection change to show appropriate options
|
||||
document.getElementById('endpoint').addEventListener('change', function(e) {
|
||||
const endpoint = e.target.value;
|
||||
const mdOptions = document.getElementById('md-options');
|
||||
const llmOptions = document.getElementById('llm-options');
|
||||
const advConfig = document.getElementById('adv-config');
|
||||
|
||||
// Hide all option sections first
|
||||
mdOptions.classList.add('hidden');
|
||||
llmOptions.classList.add('hidden');
|
||||
advConfig.classList.add('hidden');
|
||||
|
||||
// Show the appropriate section based on endpoint
|
||||
if (endpoint === 'md') {
|
||||
mdOptions.classList.remove('hidden');
|
||||
// Auto-open the /md options
|
||||
mdOptions.setAttribute('open', '');
|
||||
} else if (endpoint === 'llm') {
|
||||
llmOptions.classList.remove('hidden');
|
||||
// Auto-open the /llm options
|
||||
llmOptions.setAttribute('open', '');
|
||||
} else {
|
||||
// For /crawl endpoints, show the advanced config
|
||||
advConfig.classList.remove('hidden');
|
||||
}
|
||||
});
|
||||
|
||||
async function pyConfigToJson() {
|
||||
const code = cm.getValue().trim();
|
||||
@ -494,10 +562,18 @@
|
||||
}
|
||||
|
||||
// Generate code snippets
|
||||
function generateSnippets(api, payload) {
|
||||
function generateSnippets(api, payload, method = 'POST') {
|
||||
// Python snippet
|
||||
const pyCodeEl = document.querySelector('#python-content code');
|
||||
const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`;
|
||||
let pySnippet;
|
||||
|
||||
if (method === 'GET') {
|
||||
// GET request (for /llm endpoint)
|
||||
pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.get(\n "${window.location.origin}${api}"\n )\n return response.json()`;
|
||||
} else {
|
||||
// POST request (for /crawl and /md endpoints)
|
||||
pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`;
|
||||
}
|
||||
|
||||
pyCodeEl.textContent = pySnippet;
|
||||
pyCodeEl.className = 'python hljs'; // Reset classes
|
||||
@ -505,7 +581,15 @@
|
||||
|
||||
// cURL snippet
|
||||
const curlCodeEl = document.querySelector('#curl-content code');
|
||||
const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`;
|
||||
let curlSnippet;
|
||||
|
||||
if (method === 'GET') {
|
||||
// GET request (for /llm endpoint)
|
||||
curlSnippet = `curl -X GET "${window.location.origin}${api}"`;
|
||||
} else {
|
||||
// POST request (for /crawl and /md endpoints)
|
||||
curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`;
|
||||
}
|
||||
|
||||
curlCodeEl.textContent = curlSnippet;
|
||||
curlCodeEl.className = 'bash hljs'; // Reset classes
|
||||
@ -536,20 +620,39 @@
|
||||
|
||||
const endpointMap = {
|
||||
crawl: '/crawl',
|
||||
};
|
||||
|
||||
/*const endpointMap = {
|
||||
crawl: '/crawl',
|
||||
crawl_stream: '/crawl/stream',
|
||||
// crawl_stream: '/crawl/stream',
|
||||
md: '/md',
|
||||
llm: '/llm'
|
||||
};*/
|
||||
};
|
||||
|
||||
const api = endpointMap[endpoint];
|
||||
const payload = {
|
||||
urls,
|
||||
...advConfig
|
||||
};
|
||||
let payload;
|
||||
|
||||
// Create appropriate payload based on endpoint type
|
||||
if (endpoint === 'md') {
|
||||
// Get values from the /md specific inputs
|
||||
const filterType = document.getElementById('md-filter').value;
|
||||
const query = document.getElementById('md-query').value.trim();
|
||||
const cache = document.getElementById('md-cache').value;
|
||||
|
||||
// MD endpoint expects: { url, f, q, c }
|
||||
payload = {
|
||||
url: urls[0], // Take first URL
|
||||
f: filterType, // Lowercase filter type as required by server
|
||||
q: query || null, // Use the query if provided, otherwise null
|
||||
c: cache
|
||||
};
|
||||
} else if (endpoint === 'llm') {
|
||||
// LLM endpoint has a different URL pattern and uses query params
|
||||
// This will be handled directly in the fetch below
|
||||
payload = null;
|
||||
} else {
|
||||
// Default payload for /crawl and /crawl/stream
|
||||
payload = {
|
||||
urls,
|
||||
...advConfig
|
||||
};
|
||||
}
|
||||
|
||||
updateStatus('processing');
|
||||
|
||||
@ -557,7 +660,18 @@
|
||||
const startTime = performance.now();
|
||||
let response, responseData;
|
||||
|
||||
if (endpoint === 'crawl_stream') {
|
||||
if (endpoint === 'llm') {
|
||||
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
||||
const url = urls[0];
|
||||
const encodedUrl = encodeURIComponent(url);
|
||||
// Get the question from the LLM-specific input
|
||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||
|
||||
response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, {
|
||||
method: 'GET',
|
||||
headers: { 'Accept': 'application/json' }
|
||||
});
|
||||
} else if (endpoint === 'crawl_stream') {
|
||||
// Stream processing
|
||||
response = await fetch(api, {
|
||||
method: 'POST',
|
||||
@ -597,7 +711,7 @@
|
||||
document.querySelector('#response-content code').className = 'json hljs'; // Reset classes
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
} else {
|
||||
// Regular request
|
||||
// Regular request (handles /crawl and /md)
|
||||
response = await fetch(api, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
@ -625,7 +739,16 @@
|
||||
}
|
||||
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
generateSnippets(api, payload);
|
||||
|
||||
// For generateSnippets, handle the LLM case specially
|
||||
if (endpoint === 'llm') {
|
||||
const url = urls[0];
|
||||
const encodedUrl = encodeURIComponent(url);
|
||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
||||
} else {
|
||||
generateSnippets(api, payload);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error:', error);
|
||||
updateStatus('error');
|
||||
@ -807,9 +930,24 @@
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Function to initialize UI based on selected endpoint
|
||||
function initUI() {
|
||||
// Trigger the endpoint change handler to set initial UI state
|
||||
const endpointSelect = document.getElementById('endpoint');
|
||||
const event = new Event('change');
|
||||
endpointSelect.dispatchEvent(event);
|
||||
|
||||
// Initialize copy buttons
|
||||
initCopyButtons();
|
||||
}
|
||||
|
||||
// Call this in your DOMContentLoaded or initialization
|
||||
initCopyButtons();
|
||||
// Initialize on page load
|
||||
document.addEventListener('DOMContentLoaded', initUI);
|
||||
// Also call it immediately in case the script runs after DOM is already loaded
|
||||
if (document.readyState !== 'loading') {
|
||||
initUI();
|
||||
}
|
||||
|
||||
</script>
|
||||
</body>
|
||||
|
@ -391,12 +391,14 @@ async def main():
|
||||
# Process results
|
||||
raw_df = pd.DataFrame()
|
||||
for result in results:
|
||||
if result.success and result.media["tables"]:
|
||||
# Use the new tables field, falling back to media["tables"] for backward compatibility
|
||||
tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
|
||||
if result.success and tables:
|
||||
# Extract primary market table
|
||||
# DataFrame
|
||||
raw_df = pd.DataFrame(
|
||||
result.media["tables"][0]["rows"],
|
||||
columns=result.media["tables"][0]["headers"],
|
||||
tables[0]["rows"],
|
||||
columns=tables[0]["headers"],
|
||||
)
|
||||
break
|
||||
|
||||
|
@ -4,6 +4,8 @@ import json
|
||||
import os
|
||||
import time
|
||||
from typing import List, Dict, Any, AsyncGenerator, Optional
|
||||
import textwrap # ← new: for pretty code literals
|
||||
import urllib.parse # ← needed for URL-safe /llm calls
|
||||
from dotenv import load_dotenv
|
||||
from rich.console import Console
|
||||
from rich.syntax import Syntax
|
||||
@ -969,13 +971,111 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
|
||||
else:
|
||||
console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
|
||||
|
||||
# 7. Markdown helper endpoint
|
||||
async def demo_markdown_endpoint(client: httpx.AsyncClient):
|
||||
"""
|
||||
One-shot helper around /md.
|
||||
Fetches PYTHON_URL with FIT filter and prints the first 500 chars of Markdown.
|
||||
"""
|
||||
target_url = PYTHON_URL
|
||||
payload = {"url": target_url, "f": "fit", "q": None, "c": "0"}
|
||||
|
||||
console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue")
|
||||
print_payload(payload)
|
||||
|
||||
try:
|
||||
t0 = time.time()
|
||||
resp = await client.post("/md", json=payload)
|
||||
dt = time.time() - t0
|
||||
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
|
||||
resp.raise_for_status()
|
||||
md = resp.json().get("markdown", "")
|
||||
snippet = (md[:500] + "...") if len(md) > 500 else md
|
||||
console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Error hitting /md:[/] {e}")
|
||||
|
||||
# 8. LLM QA helper endpoint
|
||||
async def demo_llm_endpoint(client: httpx.AsyncClient):
|
||||
"""
|
||||
Quick QA round-trip with /llm.
|
||||
Asks a trivial question against SIMPLE_URL just to show wiring.
|
||||
"""
|
||||
page_url = SIMPLE_URL
|
||||
question = "What is the title of this page?"
|
||||
|
||||
console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")
|
||||
enc = urllib.parse.quote_plus(page_url, safe="")
|
||||
console.print(f"GET /llm/{enc}?q={question}")
|
||||
|
||||
try:
|
||||
t0 = time.time()
|
||||
resp = await client.get(f"/llm/{enc}", params={"q": question})
|
||||
dt = time.time() - t0
|
||||
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
|
||||
resp.raise_for_status()
|
||||
answer = resp.json().get("answer", "")
|
||||
console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Error hitting /llm:[/] {e}")
|
||||
|
||||
|
||||
# 9. /config/dump helpers --------------------------------------------------
|
||||
|
||||
async def demo_config_dump_valid(client: httpx.AsyncClient):
|
||||
"""
|
||||
Send a single top-level CrawlerRunConfig(...) expression and show the dump.
|
||||
"""
|
||||
code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"
|
||||
payload = {"code": code_snippet}
|
||||
|
||||
console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
|
||||
print_payload(payload)
|
||||
|
||||
try:
|
||||
t0 = time.time()
|
||||
resp = await client.post("/config/dump", json=payload)
|
||||
dt = time.time() - t0
|
||||
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
|
||||
resp.raise_for_status()
|
||||
dump_json = resp.json()
|
||||
console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
|
||||
|
||||
|
||||
async def demo_config_dump_invalid(client: httpx.AsyncClient):
|
||||
"""
|
||||
Purposely break the rule (nested call) to show the 400 parse error.
|
||||
"""
|
||||
bad_code = textwrap.dedent("""
|
||||
BrowserConfig(headless=True); CrawlerRunConfig()
|
||||
""").strip()
|
||||
payload = {"code": bad_code}
|
||||
|
||||
console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
|
||||
print_payload(payload)
|
||||
|
||||
try:
|
||||
resp = await client.post("/config/dump", json=payload)
|
||||
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
|
||||
resp.raise_for_status() # should throw -> except
|
||||
except httpx.HTTPStatusError as e:
|
||||
console.print("[cyan]Expected parse/validation failure captured:[/]")
|
||||
try:
|
||||
console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
|
||||
except Exception:
|
||||
console.print(e.response.text)
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
|
||||
|
||||
|
||||
# --- Update Main Runner to include new demo ---
|
||||
async def main_demo():
|
||||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
|
||||
if not await check_server_health(client):
|
||||
return
|
||||
|
||||
|
||||
# --- Run Demos ---
|
||||
await demo_basic_single_url(client)
|
||||
await demo_basic_multi_url(client)
|
||||
@ -1001,7 +1101,15 @@ async def main_demo():
|
||||
await demo_deep_with_css_extraction(client)
|
||||
await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var
|
||||
await demo_deep_with_proxy(client) # Skips if no PROXIES env var
|
||||
await demo_deep_with_ssl(client) # Added the new demo
|
||||
await demo_deep_with_ssl(client) # Added the new demo
|
||||
|
||||
# --- Helper endpoints ---
|
||||
await demo_markdown_endpoint(client)
|
||||
await demo_llm_endpoint(client)
|
||||
|
||||
# --- /config/dump sanity checks ---
|
||||
await demo_config_dump_valid(client)
|
||||
await demo_config_dump_invalid(client)
|
||||
|
||||
console.rule("[bold green]Demo Complete[/]", style="green")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user