import os, sys

# Append the parent directories to sys.path so the example can be run
# directly from the repository checkout
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
__data__ = os.path.join(__location__, "__data")

import asyncio
import json
from pathlib import Path

import aiohttp

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter

# 1. File Download Processing Example
async def download_example():
    """Example of downloading files from Python.org"""
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True, downloads_path=downloads_path, verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code="""
            // Find and click the first Windows installer link
            const downloadLink = document.querySelector('a[href$=".exe"]');
            if (downloadLink) {
                console.log('Found download link:', downloadLink.href);
                downloadLink.click();
            } else {
                console.log('No .exe download link found');
            }
            """,
            delay_before_return_html=1,  # Wait 1 second to ensure the download starts
            cache_mode=CacheMode.BYPASS,
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path}")
                print(f"  File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB")
        else:
            print("\nNo files were downloaded")

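
# A small stdlib-only helper for inspecting what actually landed in the
# downloads folder after a run. Purely illustrative; it does not depend on
# any crawl4ai API beyond the downloads_path chosen above.
def list_downloads(downloads_path: str) -> None:
    """Print every file in downloads_path with its size in MB."""
    for name in sorted(os.listdir(downloads_path)):
        full_path = os.path.join(downloads_path, name)
        if os.path.isfile(full_path):
            size_mb = os.path.getsize(full_path) / (1024 * 1024)
            print(f"- {name}: {size_mb:.2f} MB")
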
# 2. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file (make sure the data directory exists first)
    os.makedirs(__data__, exist_ok=True)
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
        f.write(
            """
            <html><body>
                <h1>Test Content</h1>
                <p>This is a test paragraph.</p>
            </body></html>
            """
        )

    async with AsyncWebCrawler(verbose=True) as crawler:
        # Process a local file via the file:// scheme
        local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")

        # Process raw HTML via the raw: scheme
        raw_html = """
        <html><body>
            <h1>Raw HTML Test</h1>
            <p>This is a test of raw HTML processing.</p>
        </body></html>
        """
        raw_result = await crawler.arun(url=f"raw:{raw_html}")

        # Clean up
        os.remove(sample_file)

        print("Local file content:", local_result.markdown)
        print("\nRaw HTML content:", raw_result.markdown)

# 3. Enhanced Markdown Generation Example
async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features"""
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create a content filter (optional) to produce the filtered "fit" variant
        content_filter = BM25ContentFilter(
            # user_query="History and cultivation",
            bm25_threshold=1.0
        )

        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS,
        )

        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
        print("2. New markdown versions (markdown_v2):")
        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
        print(
            f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
        )
        print(
            f"   - References section length: {len(result.markdown_v2.references_markdown)}"
        )
        if result.markdown_v2.fit_markdown:
            print(
                f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
            )

        # Save the different versions to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)

        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
            f.write(result.markdown_v2.raw_markdown)

        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
            f.write(result.markdown_v2.markdown_with_citations)

        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
            f.write(result.markdown_v2.references_markdown)

        if result.markdown_v2.fit_markdown:
            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
                f.write(result.markdown_v2.fit_markdown)

        print(f"\nMarkdown examples saved to: {output_dir}")

        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
        print(
            "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
        )

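
# The content filter can also be attached through a markdown generator rather
# than the content_filter argument. A minimal sketch, assuming the installed
# crawl4ai version exposes DefaultMarkdownGenerator under
# crawl4ai.markdown_generation_strategy (verify before relying on it):
async def markdown_generator_sketch():
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            # The generator wraps the filter, producing fit_markdown alongside
            # the raw and citation variants
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=BM25ContentFilter(bm25_threshold=1.0)
            ),
            cache_mode=CacheMode.BYPASS,
        )
        if result.success and result.markdown_v2.fit_markdown:
            print(result.markdown_v2.fit_markdown[:300])
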
# 4. Browser Management Example
async def browser_management_example():
    """Example of using enhanced browser management features"""
    # Use the specified user directory path
    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(user_data_dir, exist_ok=True)

    print(f"Browser profile will be saved to: {user_data_dir}")

    async with AsyncWebCrawler(
        use_managed_browser=True,
        user_data_dir=user_data_dir,
        headless=False,
        verbose=True,
    ) as crawler:
        result = await crawler.arun(
            url="https://crawl4ai.com",
            # session_id="persistent_session_1",
            cache_mode=CacheMode.BYPASS,
        )
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
            # session_id="persistent_session_1",
            cache_mode=CacheMode.BYPASS,
        )

        print("\nBrowser session result:", result.success)
        if result.success:
            print("Page title:", result.metadata.get("title", "No title found"))

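
# The commented-out session_id above hints at session reuse: passing the same
# session_id to successive arun() calls keeps one browser tab alive between
# them, preserving cookies and page state. A minimal sketch; kill_session on
# the crawler strategy is assumed available in this version of crawl4ai:
async def session_reuse_sketch():
    async with AsyncWebCrawler(verbose=True) as crawler:
        session_id = "persistent_session_1"
        # First call creates the session; the second reuses the same tab
        await crawler.arun(url="https://github.com/trending", session_id=session_id)
        result = await crawler.arun(
            url="https://github.com/trending/python", session_id=session_id
        )
        print("Reused session result:", result.success)
        # Explicitly release the session when done
        await crawler.crawler_strategy.kill_session(session_id)
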
# 5. API Usage Example
async def api_example():
    """Example of using the new API endpoints"""
    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
    headers = {"Authorization": f"Bearer {api_token}"}
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
            "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
            "extraction_config": {
                "type": "json_css",
                "params": {
                    "schema": {
                        "name": "Hacker News Articles",
                        "baseSelector": ".athing",
                        "fields": [
                            {"name": "title", "selector": ".title a", "type": "text"},
                            {"name": "score", "selector": ".score", "type": "text"},
                            {
                                "name": "url",
                                "selector": ".title a",
                                "type": "attribute",
                                "attribute": "href",
                            },
                        ],
                    }
                },
            },
            "crawler_params": {
                "headless": True,
                # "use_managed_browser": True
            },
            "cache_mode": "bypass",
            # "screenshot": True,
            # "magic": True
        }

        async with session.post(
            "http://localhost:11235/crawl", json=crawl_request, headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]

        # Poll the task status endpoint until the job completes
        while True:
            async with session.get(
                f"http://localhost:11235/task/{task_id}", headers=headers
            ) as status_response:
                result = await status_response.json()
                print(f"Task status: {result['status']}")

                if result["status"] == "completed":
                    print("Task completed!")
                    print("Results:")
                    news = json.loads(result["results"][0]["extracted_content"])
                    print(json.dumps(news[:4], indent=2))
                    break
                else:
                    await asyncio.sleep(1)

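
# For comparison, the same schema can drive extraction directly through the
# Python SDK, with no server round-trip. A minimal sketch, assuming
# JsonCssExtractionStrategy is importable from crawl4ai.extraction_strategy
# in the installed version:
async def sdk_extraction_sketch():
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    schema = {
        "name": "Hacker News Articles",
        "baseSelector": ".athing",
        "fields": [
            {"name": "title", "selector": ".title a", "type": "text"},
        ],
    }
    async with AsyncWebCrawler(headless=True) as crawler:
        result = await crawler.arun(
            url="https://news.ycombinator.com",
            extraction_strategy=JsonCssExtractionStrategy(schema),
            cache_mode=CacheMode.BYPASS,
        )
        if result.success:
            # extracted_content is a JSON string of the matched items
            print(json.loads(result.extracted_content)[:4])
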
# Main execution
async def main():
    # print("Running Crawl4AI feature examples...")

    # print("\n1. Running Download Example:")
    # await download_example()

    # print("\n2. Running Markdown Generation Example:")
    # await markdown_generation_example()

    # print("\n3. Running Local and Raw HTML Example:")
    # await local_and_raw_html_example()

    # print("\n4. Running Browser Management Example:")
    await browser_management_example()

    # print("\n5. Running API Example:")
    await api_example()


if __name__ == "__main__":
    asyncio.run(main())