mirror of
https://github.com/unclecode/crawl4ai.git
synced 2025-09-29 04:38:57 +00:00

- This commit also introduces the concept of a Relevance Content Filter, an improvement over Fit Markdown. This class of strategies aims to extract the main content of a page: the part that really matters and is worth processing. The first strategy uses the BM25 algorithm to find chunks of the page that are relevant to its title, description, and keywords, or that match a user-supplied query; the matching chunks are then returned to the main engine for conversion to Markdown. Approaches based on language models are planned as well.
- The cache database was updated to hold information about response headers and downloaded files.
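A minimal sketch of what such a BM25 pass could look like, assuming the `rank_bm25` package; the `RelevantContentFilter` class, its `filter_content` method, the whitespace tokenization, and the positive-score cutoff below are illustrative assumptions, not crawl4ai's actual API:

```python
# Illustrative BM25 relevance filter (assumes `pip install rank_bm25`).
# Class/method names and the scoring cutoff are hypothetical, not crawl4ai's API.
from rank_bm25 import BM25Okapi


class RelevantContentFilter:
    def __init__(self, user_query: str | None = None):
        # When no query is given, fall back to page metadata (the title here)
        self.user_query = user_query

    def filter_content(self, title: str, chunks: list[str]) -> list[str]:
        query = (self.user_query or title).lower().split()
        corpus = [chunk.lower().split() for chunk in chunks]
        bm25 = BM25Okapi(corpus)
        scores = bm25.get_scores(query)
        # Keep only chunks with a positive BM25 score, i.e. chunks that
        # share at least one term with the query
        return [chunk for chunk, score in zip(chunks, scores) if score > 0.0]


# Usage: chunks would come from splitting the page HTML into text blocks
kept = RelevantContentFilter().filter_content(
    "Python Downloads",
    ["Download Python 3.13 for Windows", "Follow us on social media"],
)
```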
229 lines
9.0 KiB
Python
import os
import sys
import asyncio
import shutil
from typing import List
import tempfile
import time

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

class TestDownloads:
    def __init__(self):
        # Use an isolated temp workspace so tests never touch real user dirs
        self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_")
        self.download_dir = os.path.join(self.temp_dir, "downloads")
        os.makedirs(self.download_dir, exist_ok=True)
        self.results: List[str] = []

    def cleanup(self):
        shutil.rmtree(self.temp_dir)

    def log_result(self, test_name: str, success: bool, message: str = ""):
        result = f"{'✅' if success else '❌'} {test_name}: {message}"
        self.results.append(result)
        print(result)

    async def test_basic_download(self):
        """Test basic file download functionality"""
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                verbose=True
            ) as crawler:
                # Python.org downloads page typically has stable download links
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                    // Click first download link
                    const downloadLink = document.querySelector('a[href$=".exe"]');
                    if (downloadLink) downloadLink.click();
                    """
                )

                success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                self.log_result(
                    "Basic Download",
                    success,
                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
                )
        except Exception as e:
            self.log_result("Basic Download", False, str(e))

    async def test_persistent_context_download(self):
        """Test downloads with persistent context"""
        try:
            user_data_dir = os.path.join(self.temp_dir, "user_data")
            os.makedirs(user_data_dir, exist_ok=True)

            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                use_persistent_context=True,
                user_data_dir=user_data_dir,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                    const downloadLink = document.querySelector('a[href$=".exe"]');
                    if (downloadLink) downloadLink.click();
                    """
                )

                success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                self.log_result(
                    "Persistent Context Download",
                    success,
                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
                )
        except Exception as e:
            self.log_result("Persistent Context Download", False, str(e))

    async def test_multiple_downloads(self):
        """Test multiple simultaneous downloads"""
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                    // Click multiple download links
                    const downloadLinks = document.querySelectorAll('a[href$=".exe"]');
                    downloadLinks.forEach(link => link.click());
                    """
                )

                success = result.downloaded_files is not None and len(result.downloaded_files) > 1
                self.log_result(
                    "Multiple Downloads",
                    success,
                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded"
                )
        except Exception as e:
            self.log_result("Multiple Downloads", False, str(e))

    async def test_different_browsers(self):
        """Test downloads across different browser types"""
        browsers = ["chromium", "firefox", "webkit"]

        for browser_type in browsers:
            try:
                async with AsyncWebCrawler(
                    accept_downloads=True,
                    downloads_path=self.download_dir,
                    browser_type=browser_type,
                    verbose=True
                ) as crawler:
                    result = await crawler.arun(
                        url="https://www.python.org/downloads/",
                        js_code="""
                        const downloadLink = document.querySelector('a[href$=".exe"]');
                        if (downloadLink) downloadLink.click();
                        """
                    )

                    success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                    self.log_result(
                        f"{browser_type.title()} Download",
                        success,
                        f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
                    )
            except Exception as e:
                self.log_result(f"{browser_type.title()} Download", False, str(e))

    async def test_edge_cases(self):
        """Test various edge cases"""

        # Test 1: Downloads without specifying download path
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
                )
                self.log_result(
                    "Default Download Path",
                    True,
                    f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}"
                )
        except Exception as e:
            self.log_result("Default Download Path", False, str(e))

        # Test 2: Downloads with invalid path
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path="/invalid/path/that/doesnt/exist",
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
                )
                self.log_result("Invalid Download Path", False, "Should have raised an error")
        except Exception as e:
            self.log_result("Invalid Download Path", True, "Correctly handled invalid path")

        # Test 3: Download with accept_downloads=False
        try:
            async with AsyncWebCrawler(
                accept_downloads=False,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
                )
                success = result.downloaded_files is None
                self.log_result(
                    "Disabled Downloads",
                    success,
                    "Correctly ignored downloads" if success else "Unexpectedly downloaded files"
                )
        except Exception as e:
            self.log_result("Disabled Downloads", False, str(e))

    async def run_all_tests(self):
        """Run all test cases"""
        print("\n🧪 Running Download Tests...\n")

        test_methods = [
            self.test_basic_download,
            self.test_persistent_context_download,
            self.test_multiple_downloads,
            self.test_different_browsers,
            self.test_edge_cases
        ]

        for test in test_methods:
            print(f"\n📝 Running {test.__doc__}...")
            await test()
            await asyncio.sleep(2)  # Brief pause between tests

        print("\n📊 Test Results Summary:")
        for result in self.results:
            print(result)

        successes = len([r for r in self.results if '✅' in r])
        total = len(self.results)
        print(f"\nTotal: {successes}/{total} tests passed")

        self.cleanup()

async def main():
    tester = TestDownloads()
    await tester.run_all_tests()


if __name__ == "__main__":
    asyncio.run(main())