
# Prefix-Based Input Handling in Crawl4AI
This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.
## Crawling a Web URL
To crawl a live web page, provide a URL that starts with `http://` or `https://` and pass a `CrawlerRunConfig` object:
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_web():
    config = CrawlerRunConfig(bypass_cache=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config)
        if result.success:
            print("Markdown Content:")
            print(result.markdown)
        else:
            print(f"Failed to crawl: {result.error_message}")

asyncio.run(crawl_web())
```
## Crawling a Local HTML File
To crawl a local HTML file, prefix the file path with `file://`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_local_file():
    local_file_path = "/path/to/apple.html"  # Replace with your file path
    file_url = f"file://{local_file_path}"
    config = CrawlerRunConfig(bypass_cache=True)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=file_url, config=config)
        if result.success:
            print("Markdown Content from Local File:")
            print(result.markdown)
        else:
            print(f"Failed to crawl local file: {result.error_message}")

asyncio.run(crawl_local_file())
```
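If you already have the path as a `pathlib.Path`, Python's standard library can build the `file://` URL for you. This is a general Python note rather than a Crawl4AI requirement, and the path below is a placeholder:

```python
from pathlib import Path

# as_uri() requires an absolute path and yields a properly
# escaped file:// URL (including on Windows paths).
file_url = Path("/path/to/apple.html").resolve().as_uri()
print(file_url)  # e.g. file:///path/to/apple.html
```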
## Crawling Raw HTML Content
To crawl raw HTML content, prefix the HTML string with `raw:`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_raw_html():
    raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
    raw_html_url = f"raw:{raw_html}"
    config = CrawlerRunConfig(bypass_cache=True)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=raw_html_url, config=config)
        if result.success:
            print("Markdown Content from Raw HTML:")
            print(result.markdown)
        else:
            print(f"Failed to crawl raw HTML: {result.error_message}")

asyncio.run(crawl_raw_html())
```
---
## Complete Example
Below is a comprehensive script that:
1. Crawls the Wikipedia page for "Apple."
2. Saves the HTML content to a local file (`apple.html`).
3. Crawls the local HTML file and verifies the markdown length matches the original crawl.
4. Crawls the raw HTML content from the saved file and verifies consistency.
```python
import os
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def main():
    wikipedia_url = "https://en.wikipedia.org/wiki/apple"
    script_dir = Path(__file__).parent
    html_file_path = script_dir / "apple.html"

    async with AsyncWebCrawler() as crawler:
        # Step 1: Crawl the Web URL
        print("\n=== Step 1: Crawling the Wikipedia URL ===")
        web_config = CrawlerRunConfig(bypass_cache=True)
        result = await crawler.arun(url=wikipedia_url, config=web_config)

        if not result.success:
            print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
            return

        # Save the fetched HTML so the next steps can reuse it
        with open(html_file_path, 'w', encoding='utf-8') as f:
            f.write(result.html)
        web_crawl_length = len(result.markdown)
        print(f"Length of markdown from web crawl: {web_crawl_length}\n")

        # Step 2: Crawl from the Local HTML File
        print("=== Step 2: Crawling from the Local HTML File ===")
        file_url = f"file://{html_file_path.resolve()}"
        file_config = CrawlerRunConfig(bypass_cache=True)
        local_result = await crawler.arun(url=file_url, config=file_config)

        if not local_result.success:
            print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
            return

        local_crawl_length = len(local_result.markdown)
        assert web_crawl_length == local_crawl_length, "Markdown length mismatch"
        print("✅ Markdown length matches between web and local file crawl.\n")

        # Step 3: Crawl Using Raw HTML Content
        print("=== Step 3: Crawling Using Raw HTML Content ===")
        with open(html_file_path, 'r', encoding='utf-8') as f:
            raw_html_content = f.read()
        raw_html_url = f"raw:{raw_html_content}"
        raw_config = CrawlerRunConfig(bypass_cache=True)
        raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

        if not raw_result.success:
            print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
            return

        raw_crawl_length = len(raw_result.markdown)
        assert web_crawl_length == raw_crawl_length, "Markdown length mismatch"
        print("✅ Markdown length matches between web and raw HTML crawl.\n")

    print("All tests passed successfully!")

    # Clean up the temporary HTML file
    if html_file_path.exists():
        os.remove(html_file_path)

if __name__ == "__main__":
    asyncio.run(main())
```
---
## Conclusion
With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can crawl web URLs, local HTML files, and raw HTML strings through the same interface. Use `CrawlerRunConfig` to keep configuration flexible and consistent across all three scenarios.
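
As a minimal sketch of how you might take advantage of the unified parameter in your own code, the hypothetical helper below (not part of the Crawl4AI API) normalizes a web URL, a `pathlib.Path`, or a raw HTML string into the prefixed form that `crawler.arun()` expects:

```python
from pathlib import Path

def to_crawl_url(source) -> str:
    """Normalize a web URL, a local Path, or a raw HTML string
    into a prefixed url suitable for crawler.arun()."""
    if isinstance(source, Path):
        return source.resolve().as_uri()                    # file://...
    if source.startswith(("http://", "https://", "file://", "raw:")):
        return source                                       # already prefixed
    return f"raw:{source}"                                  # treat anything else as raw HTML

# Hypothetical usage inside an async crawl function:
# result = await crawler.arun(url=to_crawl_url(Path("apple.html")), config=config)
```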