import os, sys

sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)

import asyncio
import time
import json
import re
from typing import Dict

from bs4 import BeautifulSoup
from pydantic import BaseModel, Field

from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
)

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")


# Basic Example - Simple Crawl
async def simple_crawl():
    print("\n--- Basic Usage ---")
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])


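# Content Cleaning Example
# Strip navigation/footer/aside blocks and overlays, then compare the raw markdown
# length against the pruned "fit" markdown produced by PruningContentFilter.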
async def clean_content():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48, threshold_type="fixed", min_word_threshold=0
            ),
            options={"ignore_links": True},
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=crawler_config,
        )
        full_markdown_length = len(result.markdown.raw_markdown)
        fit_markdown_length = len(result.markdown.fit_markdown)
        print(f"Full Markdown Length: {full_markdown_length}")
        print(f"Fit Markdown Length: {fit_markdown_length}")


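# Link Analysis Example
# Collect internal links only (external and social-media links are excluded) and
# print the first few href/text pairs.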
async def link_analysis():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        exclude_external_links=True,
        exclude_social_media_links=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=crawler_config,
        )
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")

        for link in result.links["internal"][:5]:
            print(f"Href: {link['href']}\nText: {link['text']}\n")


# JavaScript Execution Example
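# Clicks the "Load More" button (when present) before the page HTML is captured.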
async def simple_example_with_running_js_code():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")

    browser_config = BrowserConfig(headless=True, java_script_enabled=True)

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])


# CSS Selector Example
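# Restricts the crawl output to elements matching a single CSS selector.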
async def simple_example_with_css_selector():
    print("\n--- Using CSS Selectors ---")
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])


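# Media Handling Example
# Exclude external images and print the src, alt text, and score of the first few
# images found on the page.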
async def media_handling():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        for img in result.media["images"][:5]:
            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")


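# Custom Hook Example
# Hooks let you run custom code at points in the crawl lifecycle (here: 'before_goto').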
async def custom_hook_workflow(verbose=True):
    async with AsyncWebCrawler() as crawler:
        # Set a 'before_goto' hook to run custom code just before navigation
        crawler.crawler_strategy.set_hook(
            "before_goto",
            lambda page, context: print("[Hook] Preparing to navigate..."),
        )

        # Perform the crawl operation
        result = await crawler.arun(url="https://crawl4ai.com")
        print(result.markdown.raw_markdown[:500].replace("\n", " -- "))


# Proxy Example
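# Routes browser traffic through an authenticated proxy; the server and credentials
# below are placeholders and must be replaced before running.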
async def use_proxy():
    print("\n--- Using a Proxy ---")
    browser_config = BrowserConfig(
        headless=True,
        proxy_config={
            "server": "http://proxy.example.com:8080",
            "username": "username",
            "password": "password",
        },
    )
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        if result.success:
            print(result.markdown[:500])


# Screenshot Example
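# The screenshot comes back base64-encoded on the result, so it is decoded before
# being written to disk.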
async def capture_and_save_screenshot(url: str, output_path: str):
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        if result.success and result.screenshot:
            import base64

            screenshot_data = base64.b64decode(result.screenshot)
            with open(output_path, "wb") as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")


# LLM Extraction Example
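# Uses an LLM provider string such as "openai/gpt-4o" to pull data matching the
# OpenAIModelFee schema out of the OpenAI pricing page.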
class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(
        ..., description="Fee for output token for the OpenAI model."
    )


async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    print(f"\n--- Extracting Structured Data with {provider} ---")

    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    browser_config = BrowserConfig(headless=True)

    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
            Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/", config=crawler_config
        )
        print(result.extracted_content)


# CSS Extraction Example
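# Schema-driven CSS extraction: fast, deterministic structured output with no LLM.
# The injected JS clicks through the course tabs so every tab's content is in the
# DOM before extraction.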
async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
        "fields": [
            {
                "name": "section_title",
                "selector": "h3.heading-50",
                "type": "text",
            },
            {
                "name": "section_description",
                "selector": ".charge-content",
                "type": "text",
            },
            {
                "name": "course_name",
                "selector": ".text-block-93",
                "type": "text",
            },
            {
                "name": "course_description",
                "selector": ".course-content-text",
                "type": "text",
            },
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }

    browser_config = BrowserConfig(headless=True, java_script_enabled=True)

    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
        for(let tab of tabs) {
            tab.scrollIntoView();
            tab.click();
            await new Promise(r => setTimeout(r, 500));
        }
    })();
    """

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        js_code=[js_click_tabs],
        delay_before_return_html=1,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology", config=crawler_config
        )

        courses = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(courses)} courses")
        print(json.dumps(courses[0], indent=2))


# Dynamic Content Examples - Method 1
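# Paginates through the TypeScript commit history inside a single browser session:
# a hook waits for fresh commits after each "Next" click, and BeautifulSoup parses
# the cleaned HTML of every page.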
async def crawl_dynamic_content_pages_method_1():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    async def on_execution_started(page, **kwargs):
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await commit.evaluate("(element) => element.textContent")
                commit = re.sub(r"\s+", "", commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if page > 0 else None,
                js_only=page > 0,
                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=crawler_config)
            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


# Dynamic Content Examples - Method 2
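# Same pagination flow, but the injected JavaScript itself waits for new commits to
# appear, and JsonCssExtractionStrategy replaces manual BeautifulSoup parsing.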
async def crawl_dynamic_content_pages_method_2():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    js_next_page_and_wait = """
    (async () => {
        const getCurrentCommit = () => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            return commits.length > 0 ? commits[0].textContent.trim() : null;
        };

        const initialCommit = getCurrentCommit();
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();

        while (true) {
            await new Promise(resolve => setTimeout(resolve, 100));
            const newCommit = getCurrentCommit();
            if (newCommit && newCommit !== initialCommit) {
                break;
            }
        }
    })();
    """

    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [
            {
                "name": "title",
                "selector": "h4.markdown-title",
                "type": "text",
                "transform": "strip",
            },
        ],
    }

    async with AsyncWebCrawler(config=browser_config) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        extraction_strategy = JsonCssExtractionStrategy(schema)

        for page in range(3):
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=crawler_config)
            assert result.success, f"Failed to crawl page {page + 1}"

            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


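# Cosine Similarity Extraction Example
# Clusters page content and keeps the chunks most semantically similar to the given
# filter phrases.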
async def cosine_similarity_extraction():
    from crawl4ai.extraction_strategy import CosineStrategy

    crawl_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,  # Maximum distance between two words
            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
            top_k=3,  # Number of top keywords to extract
            sim_threshold=0.3,  # Similarity threshold for clustering
            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
            verbose=True,
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
            config=crawl_config,
        )
        print(json.loads(result.extracted_content)[:5])


# Browser Comparison
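# Times the same crawl on Firefox, WebKit, and Chromium.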
async def crawl_custom_browser_type():
    print("\n--- Browser Comparison ---")

    # Firefox
    browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Firefox:", time.time() - start)
        print(result.markdown[:500])

    # WebKit
    browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("WebKit:", time.time() - start)
        print(result.markdown[:500])

    # Chromium (default)
    browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Chromium:", time.time() - start)
        print(result.markdown[:500])


# Anti-Bot and User Simulation
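# Combines a randomized mobile user agent with user-interaction simulation and
# navigator overrides to reduce bot detection. Replace the placeholder URL before
# running.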
async def crawl_with_user_simulation():
    browser_config = BrowserConfig(
        headless=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    )

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
        override_navigator=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
        print(result.markdown)


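# SSL Certificate Example
# Fetch the site's SSL certificate, print its key properties, and export it as
# JSON, PEM, and DER files under ./tmp.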
async def ssl_certification():
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)

        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate

            tmp_dir = os.path.join(__location__, "tmp")
            os.makedirs(tmp_dir, exist_ok=True)

            # 1. Access certificate properties directly
            print("\nCertificate Information:")
            print(f"Issuer: {cert.issuer.get('CN', '')}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")

            # 2. Export certificate in different formats
            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
            print("\nCertificate exported to:")
            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")

            pem_data = cert.to_pem(
                os.path.join(tmp_dir, "certificate.pem")
            )  # For web servers
            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")

            der_data = cert.to_der(
                os.path.join(tmp_dir, "certificate.der")
            )  # For Java apps
            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")


# Main execution
async def main():
    # Basic examples
    await simple_crawl()
    await simple_example_with_running_js_code()
    await simple_example_with_css_selector()

    # Advanced examples
    await extract_structured_data_using_css_extractor()
    await extract_structured_data_using_llm(
        "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
    )
    await crawl_dynamic_content_pages_method_1()
    await crawl_dynamic_content_pages_method_2()
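
    # Additional examples defined above; enable them as needed (use_proxy and
    # crawl_with_user_simulation need real credentials/URLs first):
    # await clean_content()
    # await link_analysis()
    # await media_handling()
    # await custom_hook_workflow()
    # await use_proxy()
    # await cosine_similarity_extraction()
    # await crawl_with_user_simulation()
    # await ssl_certification()
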
    # Browser comparisons
    await crawl_custom_browser_type()

    # Screenshot example (ensure the output directory exists before saving)
    os.makedirs(os.path.join(__location__, "tmp"), exist_ok=True)
    await capture_and_save_screenshot(
        "https://www.example.com",
        os.path.join(__location__, "tmp/example_screenshot.jpg")
    )


if __name__ == "__main__":
    asyncio.run(main())