crawl4ai/docs/examples/quickstart_async.py

676 lines
25 KiB
Python
Raw Normal View History

import os, sys
2025-01-13 19:19:58 +08:00
from crawl4ai.async_configs import LlmConfig
# append parent directory to system path
2025-01-13 19:19:58 +08:00
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"
2024-09-04 15:33:24 +08:00
import asyncio
# import nest_asyncio
# nest_asyncio.apply()
2024-09-04 15:33:24 +08:00
import time
import json
import os
import re
from typing import Dict, List
2024-09-04 15:33:24 +08:00
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
2025-01-13 19:19:58 +08:00
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
2024-09-04 15:33:24 +08:00
2024-10-13 14:37:45 +08:00
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
2024-09-04 15:33:24 +08:00
print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
2024-09-04 15:33:24 +08:00
async def simple_crawl():
print("\n--- Basic Usage ---")
async with AsyncWebCrawler(verbose=True) as crawler:
2025-01-13 19:19:58 +08:00
result = await crawler.arun(
url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
)
2024-09-04 15:33:24 +08:00
print(result.markdown[:500]) # Print first 500 characters
2025-01-13 19:19:58 +08:00
2024-10-13 14:37:45 +08:00
async def simple_example_with_running_js_code():
2024-09-04 15:33:24 +08:00
print("\n--- Executing JavaScript and Using CSS Selectors ---")
# New code to handle the wait_for parameter
wait_for = """() => {
return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
}"""
# wait_for can be also just a css selector
# wait_for = "article.tease-card:nth-child(10)"
2024-09-04 15:33:24 +08:00
async with AsyncWebCrawler(verbose=True) as crawler:
js_code = [
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
]
2024-09-04 15:33:24 +08:00
result = await crawler.arun(
url="https://www.nbcnews.com/business",
js_code=js_code,
# wait_for=wait_for,
cache_mode=CacheMode.BYPASS,
2024-09-04 15:33:24 +08:00
)
print(result.markdown[:500]) # Print first 500 characters
2024-09-04 15:33:24 +08:00
2025-01-13 19:19:58 +08:00
2024-10-13 14:37:45 +08:00
async def simple_example_with_css_selector():
print("\n--- Using CSS Selectors ---")
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
css_selector=".wide-tease-item__description",
cache_mode=CacheMode.BYPASS,
2024-10-13 14:37:45 +08:00
)
print(result.markdown[:500]) # Print first 500 characters
2025-01-13 19:19:58 +08:00
2024-09-04 15:33:24 +08:00
async def use_proxy():
print("\n--- Using a Proxy ---")
print(
"Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
)
2024-09-04 15:33:24 +08:00
# Uncomment and modify the following lines to use a proxy
2025-01-13 19:19:58 +08:00
async with AsyncWebCrawler(
verbose=True, proxy="http://your-proxy-url:port"
) as crawler:
result = await crawler.arun(
2025-01-13 19:19:58 +08:00
url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
)
if result.success:
print(result.markdown[:500]) # Print first 500 characters
2024-09-04 15:33:24 +08:00
2025-01-13 19:19:58 +08:00
async def capture_and_save_screenshot(url: str, output_path: str):
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
2025-01-13 19:19:58 +08:00
url=url, screenshot=True, cache_mode=CacheMode.BYPASS
)
2025-01-13 19:19:58 +08:00
if result.success and result.screenshot:
import base64
2025-01-13 19:19:58 +08:00
# Decode the base64 screenshot data
screenshot_data = base64.b64decode(result.screenshot)
2025-01-13 19:19:58 +08:00
# Save the screenshot as a JPEG file
2025-01-13 19:19:58 +08:00
with open(output_path, "wb") as f:
f.write(screenshot_data)
2025-01-13 19:19:58 +08:00
print(f"Screenshot saved successfully to {output_path}")
else:
print("Failed to capture screenshot")
2025-01-13 19:19:58 +08:00
2024-09-04 15:33:24 +08:00
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(
..., description="Fee for output token for the OpenAI model."
)
2024-09-04 15:33:24 +08:00
2025-01-13 19:19:58 +08:00
async def extract_structured_data_using_llm(
provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
print(f"\n--- Extracting Structured Data with {provider} ---")
2025-01-13 19:19:58 +08:00
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
2024-09-04 15:33:24 +08:00
return
# extra_args = {}
2025-01-13 19:19:58 +08:00
extra_args = {
"temperature": 0,
"top_p": 0.9,
"max_tokens": 2000,
# any other supported parameters for litellm
}
if extra_headers:
extra_args["extra_headers"] = extra_headers
2024-09-04 15:33:24 +08:00
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://openai.com/api/pricing/",
2024-09-04 15:33:24 +08:00
word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider=provider,api_token=api_token),
feat: Enhance AsyncPlaywrightCrawlerStrategy with text-only and light modes, dynamic viewport adjustment, and session management ### New Features: - **Text-Only Mode**: Added support for text-only crawling by disabling images, JavaScript, GPU, and other non-essential features. - **Light Mode**: Optimized browser settings to reduce resource usage and improve efficiency during crawling. - **Dynamic Viewport Adjustment**: Automatically adjusts viewport dimensions based on content size, ensuring accurate rendering and scaling. - **Full Page Scanning**: Introduced a feature to scroll and capture dynamic content for pages with infinite scroll or lazy-loading elements. - **Session Management**: Added `create_session` method for creating and managing browser sessions with unique IDs. ### Improvements: - Unified viewport handling across contexts by dynamically setting dimensions using `self.viewport_width` and `self.viewport_height`. - Enhanced logging and error handling for viewport adjustments, page scanning, and content evaluation. - Reduced resource usage with additional browser flags for both `light_mode` and `text_only` configurations. - Improved handling of cookies, headers, and proxies in session creation. ### Refactoring: - Removed hardcoded viewport dimensions and replaced them with dynamic configurations. - Cleaned up unused and commented-out code for better readability and maintainability. - Introduced defaults for frequently used parameters like `delay_before_return_html`. ### Fixes: - Resolved potential inconsistencies in viewport handling. - Improved robustness of content loading and dynamic adjustments to avoid failures and timeouts. ### Docs Update: - Updated schema usage in `quickstart_async.py` example: - Changed `OpenAIModelFee.schema()` to `OpenAIModelFee.model_json_schema()` for compatibility. - Enhanced LLM extraction instruction documentation. This commit introduces significant enhancements to improve efficiency, flexibility, and reliability of the crawler strategy.
2024-12-08 20:04:44 +08:00
schema=OpenAIModelFee.model_json_schema(),
2024-09-04 15:33:24 +08:00
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content. One extracted model JSON format should look like this:
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
2025-01-13 19:19:58 +08:00
extra_args=extra_args,
),
cache_mode=CacheMode.BYPASS,
2024-09-04 15:33:24 +08:00
)
print(result.extracted_content)
2025-01-13 19:19:58 +08:00
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
schema = {
2025-01-13 19:19:58 +08:00
"name": "KidoCode Courses",
"baseSelector": "section.charge-methodology .w-tab-content > div",
"fields": [
{
"name": "section_title",
"selector": "h3.heading-50",
"type": "text",
},
{
"name": "section_description",
"selector": ".charge-content",
"type": "text",
},
{
"name": "course_name",
"selector": ".text-block-93",
"type": "text",
},
{
"name": "course_description",
"selector": ".course-content-text",
"type": "text",
},
{
"name": "course_icon",
"selector": ".image-92",
"type": "attribute",
"attribute": "src",
},
],
}
2025-01-13 19:19:58 +08:00
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
# Create the JavaScript that handles clicking multiple times
js_click_tabs = """
(async () => {
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
for(let tab of tabs) {
// scroll to the tab
tab.scrollIntoView();
tab.click();
// Wait for content to load and animations to complete
await new Promise(r => setTimeout(r, 500));
}
})();
2025-01-13 19:19:58 +08:00
"""
result = await crawler.arun(
url="https://www.kidocode.com/degrees/technology",
extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
js_code=[js_click_tabs],
2025-01-13 19:19:58 +08:00
cache_mode=CacheMode.BYPASS,
)
companies = json.loads(result.extracted_content)
print(f"Successfully extracted {len(companies)} companies")
print(json.dumps(companies[0], indent=2))
2025-01-13 19:19:58 +08:00
# Advanced Session-Based Crawling with Dynamic Content 🔄
async def crawl_dynamic_content_pages_method_1():
2024-09-04 15:33:24 +08:00
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
first_commit = ""
2024-09-04 15:33:24 +08:00
async def on_execution_started(page):
nonlocal first_commit
2024-09-04 15:33:24 +08:00
try:
while True:
await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
commit = await commit.evaluate("(element) => element.textContent")
commit = re.sub(r"\s+", "", commit)
2024-09-04 15:33:24 +08:00
if commit and commit != first_commit:
first_commit = commit
break
await asyncio.sleep(0.5)
except Exception as e:
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
async with AsyncWebCrawler(verbose=True) as crawler:
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
2024-09-04 15:33:24 +08:00
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
all_commits = []
js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
})();
2024-09-04 15:33:24 +08:00
"""
for page in range(3): # Crawl 3 pages
result = await crawler.arun(
url=url,
session_id=session_id,
css_selector="li.Box-sc-g0xbh4-0",
js=js_next_page if page > 0 else None,
cache_mode=CacheMode.BYPASS,
js_only=page > 0,
headless=False,
2024-09-04 15:33:24 +08:00
)
assert result.success, f"Failed to crawl page {page + 1}"
soup = BeautifulSoup(result.cleaned_html, "html.parser")
2024-09-04 15:33:24 +08:00
commits = soup.select("li")
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
2025-01-13 19:19:58 +08:00
async def crawl_dynamic_content_pages_method_2():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
2024-09-04 15:33:24 +08:00
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
all_commits = []
last_commit = ""
js_next_page_and_wait = """
(async () => {
const getCurrentCommit = () => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
return commits.length > 0 ? commits[0].textContent.trim() : null;
};
const initialCommit = getCurrentCommit();
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
// Poll for changes
while (true) {
await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
const newCommit = getCurrentCommit();
if (newCommit && newCommit !== initialCommit) {
break;
}
}
})();
"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
"fields": [
{
"name": "title",
"selector": "h4.markdown-title",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
for page in range(3): # Crawl 3 pages
result = await crawler.arun(
url=url,
session_id=session_id,
css_selector="li.Box-sc-g0xbh4-0",
extraction_strategy=extraction_strategy,
js_code=js_next_page_and_wait if page > 0 else None,
js_only=page > 0,
cache_mode=CacheMode.BYPASS,
headless=False,
)
assert result.success, f"Failed to crawl page {page + 1}"
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
2025-01-13 19:19:58 +08:00
async def crawl_dynamic_content_pages_method_3():
2025-01-13 19:19:58 +08:00
print(
"\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
)
2024-09-04 15:33:24 +08:00
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
all_commits = []
2024-09-04 15:33:24 +08:00
js_next_page = """
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
if (commits.length > 0) {
window.firstCommit = commits[0].textContent.trim();
}
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
2024-09-04 15:33:24 +08:00
wait_for = """() => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.firstCommit;
}"""
2025-01-13 19:19:58 +08:00
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
"fields": [
{
"name": "title",
"selector": "h4.markdown-title",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
for page in range(3): # Crawl 3 pages
result = await crawler.arun(
url=url,
session_id=session_id,
css_selector="li.Box-sc-g0xbh4-0",
extraction_strategy=extraction_strategy,
js_code=js_next_page if page > 0 else None,
wait_for=wait_for if page > 0 else None,
js_only=page > 0,
cache_mode=CacheMode.BYPASS,
headless=False,
)
assert result.success, f"Failed to crawl page {page + 1}"
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
2024-09-04 15:33:24 +08:00
2025-01-13 19:19:58 +08:00
async def crawl_custom_browser_type():
# Use Firefox
start = time.time()
2025-01-13 19:19:58 +08:00
async with AsyncWebCrawler(
browser_type="firefox", verbose=True, headless=True
) as crawler:
result = await crawler.arun(
url="https://www.example.com", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
# Use WebKit
start = time.time()
2025-01-13 19:19:58 +08:00
async with AsyncWebCrawler(
browser_type="webkit", verbose=True, headless=True
) as crawler:
result = await crawler.arun(
url="https://www.example.com", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
# Use Chromium (default)
start = time.time()
2025-01-13 19:19:58 +08:00
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
result = await crawler.arun(
url="https://www.example.com", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
2025-01-13 19:19:58 +08:00
async def crawl_with_user_simultion():
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
url = "YOUR-URL-HERE"
result = await crawler.arun(
2025-01-13 19:19:58 +08:00
url=url,
cache_mode=CacheMode.BYPASS,
2025-01-13 19:19:58 +08:00
magic=True, # Automatically detects and removes overlays, popups, and other elements that block content
# simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
# override_navigator = True # Overrides the navigator object to make it look like a real user
)
2025-01-13 19:19:58 +08:00
print(result.markdown)
2024-09-04 15:33:24 +08:00
async def speed_comparison():
# print("\n--- Speed Comparison ---")
# print("Firecrawl (simulated):")
# print("Time taken: 7.02 seconds")
# print("Content length: 42074 characters")
# print("Images found: 49")
# print()
# Simulated Firecrawl performance
from firecrawl import FirecrawlApp
2025-01-13 19:19:58 +08:00
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
start = time.time()
scrape_status = app.scrape_url(
2025-01-13 19:19:58 +08:00
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print("Firecrawl:")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(scrape_status['markdown'])} characters")
print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
2025-01-13 19:19:58 +08:00
print()
2024-09-04 15:33:24 +08:00
async with AsyncWebCrawler() as crawler:
# Crawl4AI simple crawl
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
word_count_threshold=0,
cache_mode=CacheMode.BYPASS,
verbose=False,
2024-09-04 15:33:24 +08:00
)
end = time.time()
print("Crawl4AI (simple crawl):")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(result.markdown)} characters")
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print()
# Crawl4AI with advanced content filtering
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
2025-01-13 19:19:58 +08:00
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
cache_mode=CacheMode.BYPASS,
verbose=False,
)
end = time.time()
print("Crawl4AI (Markdown Plus):")
print(f"Time taken: {end - start:.2f} seconds")
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
print()
2024-09-04 15:33:24 +08:00
# Crawl4AI with JavaScript execution
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
js_code=[
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
],
2024-09-04 15:33:24 +08:00
word_count_threshold=0,
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
2025-01-13 19:19:58 +08:00
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
verbose=False,
2024-09-04 15:33:24 +08:00
)
end = time.time()
print("Crawl4AI (with JavaScript execution):")
print(f"Time taken: {end - start:.2f} seconds")
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
2024-09-04 15:33:24 +08:00
print("\nNote on Speed Comparison:")
print("The speed test conducted here may not reflect optimal conditions.")
print("When we call Firecrawl's API, we're seeing its best performance,")
print("while Crawl4AI's performance is limited by the local network speed.")
print("For a more accurate comparison, it's recommended to run these tests")
print("on servers with a stable and fast internet connection.")
print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
print("If you run these tests in an environment with better network conditions,")
print("you may observe an even more significant speed advantage for Crawl4AI.")
2025-01-13 19:19:58 +08:00
async def generate_knowledge_graph():
class Entity(BaseModel):
name: str
description: str
2025-01-13 19:19:58 +08:00
class Relationship(BaseModel):
entity1: Entity
entity2: Entity
description: str
relation_type: str
class KnowledgeGraph(BaseModel):
entities: List[Entity]
relationships: List[Relationship]
extraction_strategy = LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
2025-01-13 19:19:58 +08:00
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="""Extract entities and relationships from the given text.""",
)
async with AsyncWebCrawler() as crawler:
url = "https://paulgraham.com/love.html"
result = await crawler.arun(
url=url,
cache_mode=CacheMode.BYPASS,
extraction_strategy=extraction_strategy,
# magic=True
)
# print(result.extracted_content)
with open(os.path.join(__location__, "kb.json"), "w") as f:
f.write(result.extracted_content)
2025-01-13 19:19:58 +08:00
2024-10-27 19:24:46 +08:00
async def fit_markdown_remove_overlay():
async with AsyncWebCrawler(
2025-01-13 19:19:58 +08:00
headless=True, # Set to False to see what is happening
verbose=True,
user_agent_mode="random",
user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
) as crawler:
2024-10-27 19:24:46 +08:00
result = await crawler.arun(
2025-01-13 19:19:58 +08:00
url="https://www.kidocode.com/degrees/technology",
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
),
2025-01-13 19:19:58 +08:00
options={"ignore_links": True},
),
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
# options={
# "ignore_links": True
# }
# ),
2024-10-27 19:24:46 +08:00
)
2025-01-13 19:19:58 +08:00
if result.success:
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
print(len(result.markdown.raw_markdown))
print(len(result.markdown.markdown_with_citations))
print(len(result.markdown.fit_markdown))
2025-01-13 19:19:58 +08:00
# Save clean html
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
f.write(result.cleaned_html)
2025-01-13 19:19:58 +08:00
with open(
os.path.join(__location__, "output/output_raw_markdown.md"), "w"
) as f:
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
f.write(result.markdown.raw_markdown)
2025-01-13 19:19:58 +08:00
with open(
os.path.join(__location__, "output/output_markdown_with_citations.md"),
"w",
) as f:
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
f.write(result.markdown.markdown_with_citations)
2025-01-13 19:19:58 +08:00
with open(
os.path.join(__location__, "output/output_fit_markdown.md"), "w"
) as f:
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
f.write(result.markdown.fit_markdown)
2025-01-13 19:19:58 +08:00
2024-10-27 19:24:46 +08:00
print("Done")
2024-09-04 15:33:24 +08:00
async def main():
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
2025-01-13 19:19:58 +08:00
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
# # await use_proxy()
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
# await extract_structured_data_using_css_extractor()
# LLM extraction examples
# await extract_structured_data_using_llm()
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
2025-01-13 19:19:58 +08:00
# await extract_structured_data_using_llm("ollama/llama3.2")
# You always can pass custom headers to the extraction strategy
# custom_headers = {
# "Authorization": "Bearer your-custom-token",
# "X-Custom-Header": "Some-Value"
# }
# await extract_structured_data_using_llm(extra_headers=custom_headers)
2025-01-13 19:19:58 +08:00
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()
2025-01-13 19:19:58 +08:00
# await crawl_custom_browser_type()
2025-01-13 19:19:58 +08:00
# await speed_comparison()
2024-09-04 15:33:24 +08:00
2024-09-04 15:33:24 +08:00
if __name__ == "__main__":
asyncio.run(main())