
- Add ignore_default_value option to to_serializable_dict
- Add viewport dict support in BrowserConfig
- Replace FastFilterChain with FilterChain
- Add deprecation warnings for unwanted properties
- Clean up unused imports
- Rename example files for consistency
- Add comprehensive Docker configuration tutorial

BREAKING CHANGE: FastFilterChain has been replaced with FilterChain
214 lines
7.2 KiB
Python
214 lines
7.2 KiB
Python
import asyncio
import json
from typing import Optional
from urllib.parse import quote, urlencode
|
|
|
|
async def get_token(session, email: str = "test@example.com") -> str:
    """Request a JWT from the local ``/token`` endpoint and return it.

    Posts ``{"email": email}`` as JSON, echoes the response status and the
    decoded JSON body to stdout, and returns the body's ``access_token``
    field when the status is 200.

    Args:
        session: Client session exposing ``post(url, json=...)`` as an async
            context manager (``run_tests`` uses an ``aiohttp.ClientSession``).
        email: Address sent in the request body.

    Returns:
        The value of ``access_token`` from the JSON response.

    Raises:
        Exception: On any non-200 status (message carries the body's
            ``detail`` field, or "Unknown error"), and for any error raised
            while sending the request or decoding the body. Every failure is
            printed before being re-raised unchanged.
    """
    token_url = "http://localhost:8000/token"
    print(f"\nFetching token from {token_url} with email: {email}")
    try:
        async with session.post(token_url, json={"email": email}) as resp:
            code = resp.status
            body = await resp.json()
            print(f"Token Response Status: {code}")
            print(f"Token Response: {json.dumps(body, indent=2)}")
            # Guard clause: anything other than 200 is treated as a failure.
            if code != 200:
                raise Exception(f"Failed to get token: {body.get('detail', 'Unknown error')}")
            return body["access_token"]
    except Exception as exc:
        # Report the failure, then let the caller decide how to react.
        print(f"Error fetching token: {exc!s}")
        raise
|
|
|
|
async def test_endpoint(
    session,
    endpoint: str,
    url: str,
    token: str,
    params: Optional[dict] = None,
    expected_status: int = 200
) -> Optional[dict]:
    """GET ``/{endpoint}/{url}`` on the local server and print the outcome.

    The target ``url`` is percent-encoded with ``quote`` (slashes are left
    intact) and appended to the path. ``params`` is percent-encoded into the
    query string, so values containing spaces, ``&``, ``#`` or JSON text
    reach the server intact.

    Args:
        session: Client session exposing ``get(url, headers=...)`` as an async
            context manager (``run_tests`` uses an ``aiohttp.ClientSession``).
        endpoint: First path segment, e.g. ``"md"`` or ``"llm"``.
        url: Target URL embedded in the request path.
        token: Bearer token sent in the ``Authorization`` header.
        params: Optional query parameters; ``None`` or empty means no query.
        expected_status: HTTP status the response must have.

    Returns:
        The decoded body when the status matches ``expected_status`` (parsed
        JSON, or the raw text when the body cannot be parsed as JSON).
        ``None`` when the status does not match or any error occurs; the
        problem is printed rather than raised.
    """
    # quote_via=quote encodes spaces as %20 (not "+") and escapes "&", "=", "#".
    query = urlencode(params or {}, quote_via=quote)
    full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
    if query:
        full_url += f"?{query}"

    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting: {full_url}")

    try:
        async with session.get(full_url, headers=headers) as response:
            status = response.status
            try:
                data = await response.json()
            except Exception:
                # Not JSON (or wrong content type): fall back to the raw body.
                # Deliberately not a bare ``except`` so task cancellation and
                # KeyboardInterrupt still propagate.
                data = await response.text()

            print(f"Status: {status} (Expected: {expected_status})")
            if isinstance(data, dict):
                print(f"Response: {json.dumps(data, indent=2)}")
            else:
                # str() keeps this safe for non-sliceable JSON scalars.
                print(f"Response: {str(data)[:500]}...")  # First 500 chars

            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if status != expected_status:
                print(f"Error: Expected {expected_status}, got {status}")
                return None
            return data
    except Exception as e:
        # Best-effort smoke test: report the failure and keep the suite going.
        print(f"Error: {e}")
        return None
|
|
|
|
|
|
async def test_stream_crawl(session, token: str):
    """POST several URLs to ``/crawl/stream`` and print each streamed result.

    The response body is read line by line as NDJSON; every non-blank line is
    parsed as JSON and pretty-printed. Blank or whitespace-only lines are
    skipped instead of aborting the stream with a decode error.

    Args:
        session: Client session exposing ``post(url, json=..., headers=...)``
            as an async context manager (``run_tests`` uses an
            ``aiohttp.ClientSession``).
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        None. Failures (unexpected status, connection or parse errors) are
        printed rather than raised.
    """
    url = "http://localhost:8000/crawl/stream"
    payload = {
        "urls": [
            "https://example.com",
            "https://example.com/page1",  # Replicated example.com with variation
            "https://example.com/page2",  # Replicated example.com with variation
            "https://example.com/page3",  # Replicated example.com with variation
            # "https://www.python.org",
            # "https://news.ycombinator.com/news"
        ],
        "browser_config": {"headless": True, "viewport": {"width": 1200}},
        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
    }
    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting Streaming Crawl: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    try:
        async with session.post(url, json=payload, headers=headers) as response:
            status = response.status
            print(f"Status: {status} (Expected: 200)")
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if status != 200:
                print(f"Error in streaming crawl test: Expected 200, got {status}")
                return

            # Read streaming response line-by-line (NDJSON)
            async for line in response.content:
                text = line.decode('utf-8').strip()
                if not text:
                    # A bare newline is truthy as bytes but is not a JSON
                    # document; skip it.
                    continue
                data = json.loads(text)
                print(f"Streamed Result: {json.dumps(data, indent=2)}")
    except Exception as e:
        # Best-effort smoke test: report the failure and keep the suite going.
        print(f"Error in streaming crawl test: {e}")
|
|
|
|
async def run_tests():
    """Run the API smoke tests against the server on ``localhost:8000``.

    Opens one ``aiohttp.ClientSession`` and, in order: posts a non-streaming
    crawl, runs the streaming crawl test, walks the (currently disabled)
    Markdown endpoint matrix, queries the LLM endpoint for each URL with and
    without a JSON schema, and finally probes three error cases. Results are
    printed by this function and by the helpers it calls.
    """
    # Imported lazily so the module can be imported without aiohttp installed.
    import aiohttp

    print("Starting API Tests...")

    # Test URLs
    urls = [
        "example.com",
        "https://www.python.org",
        "https://news.ycombinator.com/news",
        "https://github.com/trending"
    ]

    # JSON schema sent with the second LLM request; identical for every URL.
    schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "content": {"type": "string"},
            "links": {"type": "array", "items": {"type": "string"}}
        }
    }
    schema_params = {
        "q": "Extract content with links",
        "s": json.dumps(schema),
        "c": "1"  # Test with cache
    }

    async with aiohttp.ClientSession() as session:
        # Placeholder token. If JWT is enabled on the server, fetch a real
        # token once and reuse it instead:
        # token = await get_token(session)
        # if not token:
        #     print("Aborting tests due to token failure!")
        #     return
        token = "test_token"
        auth_headers = {"Authorization": f"Bearer {token}"}

        print("\n=== Testing Crawl Endpoint ===")
        crawl_payload = {
            "urls": ["https://example.com"],
            "browser_config": {"headless": True},
            "crawler_config": {"stream": False}
        }
        async with session.post(
            "http://localhost:8000/crawl",
            json=crawl_payload,
            headers=auth_headers
        ) as response:
            crawl_status = response.status
            crawl_body = await response.json()
            print(f"\nCrawl Endpoint Status: {crawl_status}")
            print(f"Crawl Response: {json.dumps(crawl_body, indent=2)}")

        print("\n=== Testing Crawl Stream Endpoint ===")
        await test_stream_crawl(session, token)

        print("\n=== Testing Markdown Endpoint ===")
        # Intentionally empty: the Markdown matrix is switched off. Iterate
        # over `urls` instead to exercise it.
        markdown_urls = []
        for url in markdown_urls:
            for filter_type in ("raw", "fit", "bm25", "llm"):
                base_params = {"f": filter_type}
                if filter_type in ("bm25", "llm"):
                    base_params["q"] = "extract main content"

                for cache in ("0", "1"):
                    await test_endpoint(
                        session, "md", url, token, {**base_params, "c": cache}
                    )
                    await asyncio.sleep(1)  # Be nice to the server

        print("\n=== Testing LLM Endpoint ===")
        for url in urls:
            # Test basic extraction (direct response now)
            await test_endpoint(
                session,
                "llm",
                url,
                token,
                {"q": "Extract title and main content"}
            )

            # Test with schema (direct response)
            await test_endpoint(session, "llm", url, token, dict(schema_params))
            await asyncio.sleep(2)  # Be nice to the server

        print("\n=== Testing Error Cases ===")
        # Test invalid URL
        await test_endpoint(
            session,
            "md",
            "not_a_real_url",
            token,
            expected_status=500
        )

        # Test invalid filter type
        await test_endpoint(
            session,
            "md",
            "example.com",
            token,
            {"f": "invalid"},
            expected_status=422
        )

        # Test LLM without query (should fail per your server logic)
        await test_endpoint(
            session,
            "llm",
            "example.com",
            token,
            expected_status=400
        )

        print("\nAll tests completed!")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the whole smoke-test suite on a fresh event loop.
    asyncio.run(run_tests())