crawl4ai/docs/examples/quickstart_sync.py

import os
import time
import json
import base64
from crawl4ai.types import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console
from functools import lru_cache
console = Console()

@lru_cache()
def create_crawler():
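    """Create a WebCrawler once, warm it up, and memoize it via lru_cache for reuse across the demos."""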
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

def print_result(result):
    # Print each key on one line, showing only the first 20 characters of each string value followed by dots
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")

def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()

def basic_usage(crawler):
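    """Run a plain crawl of a single URL and print the truncated result fields."""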
    cprint(
        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def basic_usage_some_params(crawler):
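    """Same basic crawl, but with a couple of explicit parameters (word_count_threshold, only_text)."""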
    cprint(
        "🛠️ [bold cyan]Basic Usage with parameters: Provide a URL along with a few crawl options![/bold cyan]"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
    )
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def screenshot_usage(crawler):
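    """Crawl with screenshot=True and write the base64-encoded screenshot to screenshot.png."""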
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)

def understanding_parameters(crawler):
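    """Demonstrate result caching: the first crawl populates the cache, the second uses bypass_cache=True to force a fresh fetch."""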
    cprint(
        "\n🧠 [bold cyan]Understanding the 'bypass_cache' parameter:[/bold cyan]"
    )
    cprint(
        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
    )
    # First crawl (caches the result)
    cprint("1⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and the result is now cached:[/bold yellow]"
    )
    print_result(result)

    # Second crawl (force a fresh crawl with bypass_cache=True)
    cprint("2⃣ Second crawl (force to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
    )
    print_result(result)

def add_chunking_strategy(crawler):
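    """Show two chunking strategies: RegexChunking (split on a pattern) and NlpSentenceChunking (split into sentences)."""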
    # Adding a chunking strategy: RegexChunking
    cprint(
        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
        True,
    )
    cprint(
        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"]),
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint(
        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
        True,
    )
    cprint(
        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)

def add_extraction_strategy(crawler):
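    """Demonstrate CosineStrategy extraction, first with clustering parameters and then with a semantic_filter query."""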
    # Adding an extraction strategy: CosineStrategy
    cprint(
        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
        True,
    )
    cprint(
        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        ),
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint(
        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
    print_result(result)

def add_llm_extraction_strategy(crawler):
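    """Demonstrate LLMExtractionStrategy (OpenAI gpt-4o via LLMConfig), with and without an extraction instruction. Requires OPENAI_API_KEY."""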
    # Adding an LLM extraction strategy without instructions
    cprint(
        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
        True,
    )
    cprint(
        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint(
        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
        True,
    )
    cprint(
        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="I am interested in only financial news",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
    print_result(result)

    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="Extract only content related to technology",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
    print_result(result)

def targeted_extraction(crawler):
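    """Restrict extraction to elements matching a CSS selector (here, h2 tags)."""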
    # Using a CSS selector to extract only H2 tags
    cprint(
        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
        True,
    )
    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)

def interactive_extraction(crawler):
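    """Pass a JavaScript snippet to the crawl so the page's 'Load More' button is clicked before extraction."""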
    # Passing JavaScript code to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)

def multiple_scripts(crawler):
    # Passing a list of JavaScript snippets to run against the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = [
        """
        const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
        loadMoreButton && loadMoreButton.click();
        """
    ] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)

def using_crawler_hooks(crawler):
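    """Customize the Selenium-based crawler via hooks: log in on driver creation, set an extra header, and log activity around each fetch."""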
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get("https://example.com/login")
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        driver.find_element(By.NAME, "username").send_keys("testuser")
        driver.find_element(By.NAME, "password").send_keys("password123")
        driver.find_element(By.NAME, "login").click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "welcome"))
        )
        # Add a custom cookie
        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable Network domain for sending headers
        driver.execute_cdp_cmd("Network.enable", {})
        # Add a custom header
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
        )
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the HTML
        print(len(html))
        return driver

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
        True,
    )
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook("on_driver_created", on_driver_created)
    crawler_strategy.set_hook("before_get_url", before_get_url)
    crawler_strategy.set_hook("after_get_url", after_get_url)
    crawler_strategy.set_hook("before_return_html", before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()

    result = crawler.run(url="https://example.com")
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)

def using_crawler_hooks_delay_example(crawler):
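    """Attach an after_get_url hook that sleeps for 5 seconds, giving slow pages time to finish loading."""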
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook("after_get_url", delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]"
    )
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)

def main():
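    """Walk through the Crawl4ai sync quickstart demos in order."""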
    cprint(
        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
    )
    cprint(
        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
    )
    cprint(
        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
    )
    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)

    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)
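    # The crawler-hook demos (using_crawler_hooks, using_crawler_hooks_delay_example)
    # are defined above but not run by default; uncomment to try them.
    # using_crawler_hooks(crawler)
    # using_crawler_hooks_delay_example(crawler)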

    cprint(
        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
    )

if __name__ == "__main__":
    main()