
import os
import time
import json  # used by print_result (json.loads)
import base64  # used by screenshot_usage (base64.b64decode)
from functools import lru_cache

from crawl4ai.async_configs import LlmConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console

console = Console()


@lru_cache()
def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler
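
# Because create_crawler() is wrapped in @lru_cache, repeated calls return the
# same WebCrawler instance, so the warmup cost is paid only once per process:
#   assert create_crawler() is create_crawler()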


def print_result(result):
    # Print each key on one line with just the first 20 characters of its value and an ellipsis
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")


def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()


def basic_usage(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def basic_usage_some_params(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage with Parameters: Provide a URL and tune the crawl with a couple of arguments![/bold cyan]"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
    )
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the base64-encoded screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)


def understanding_parameters(crawler):
    cprint(
        "\n🧠 [bold cyan]Understanding the 'bypass_cache' parameter:[/bold cyan]"
    )
    cprint(
        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
    )

    # First crawl (caches the result)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time:.2f} seconds (result is now cached):[/bold yellow]"
    )
    print_result(result)

    # Force a fresh crawl, bypassing the cache
    cprint("2️⃣ Second crawl (forced to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time:.2f} seconds (cache bypassed):[/bold yellow]"
    )
    print_result(result)
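
    # Related knob: instead of passing bypass_cache on every call, main() below
    # sets crawler.always_by_pass_cache = True so every run() skips the cache.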


def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint(
        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
        True,
    )
    cprint(
        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"]),
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint(
        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
        True,
    )
    cprint(
        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)
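
# A quick standalone sketch (not part of the guided tour): chunking strategies
# can also be exercised directly on raw text, assuming the strategy interface
# exposes a chunk(text) method as the classes used above suggest:
#   chunks = RegexChunking(patterns=["\n\n"]).chunk("first para\n\nsecond para")
#   # expected result, under that assumption: ["first para", "second para"]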


def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint(
        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
        True,
    )
    cprint(
        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        ),
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint(
        "You can pass other parameters like 'semantic_filter' to CosineStrategy to pull out only the blocks related to a given topic. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
    print_result(result)


def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint(
        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
        True,
    )
    cprint(
        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint(
        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
        True,
    )
    cprint(
        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="I am interested in only financial news",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
    print_result(result)

    # The instruction can target any topic, e.g. technology
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="Extract only content related to technology",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
    print_result(result)
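

def llm_extraction_alternative_providers(crawler):
    # A hedged sketch, not part of the original tour: the release notes mention
    # that Gemini and DeepSeek providers were added alongside OpenAI. Assuming
    # LlmConfig takes the same provider/api_token fields for them, swapping
    # providers is a one-line change. The provider string and environment
    # variable name below are illustrative assumptions, not confirmed values.
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(
                provider="gemini/gemini-1.5-pro",  # assumed provider string
                api_token=os.getenv("GEMINI_API_KEY"),  # assumed env var name
            ),
            instruction="I am interested in only financial news",
        ),
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (Gemini) result:[/bold yellow]")
    print_result(result)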


def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint(
        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
        True,
    )
    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)
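
    # css_selector accepts any valid CSS selector, so narrower targets work too.
    # A sketch (the selector values here are illustrative, not from the page):
    #   result = crawler.run(url="https://www.nbcnews.com/business",
    #                        css_selector="h2, h3.article-title")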


def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def multiple_scripts(crawler):
    # Passing a list of JavaScript snippets to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Going further: Passing multiple JavaScript snippets to click 'Load More' twice![/bold cyan]",
        True,
    )
    cprint(
        "In this example we click the 'Load More' button twice by passing the same snippet as a list."
    )
    js_code = [
        """
        const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
        loadMoreButton && loadMoreButton.click();
        """
    ] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get("https://example.com/login")

        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        driver.find_element(By.NAME, "username").send_keys("testuser")
        driver.find_element(By.NAME, "password").send_keys("password123")
        driver.find_element(By.NAME, "login").click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "welcome"))
        )
        # Add a custom cookie
        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent
        driver.execute_cdp_cmd("Network.enable", {})
        # Add a custom header
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
        )
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the HTML length
        print(len(html))
        return driver

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
        True,
    )

    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook("on_driver_created", on_driver_created)
    crawler_strategy.set_hook("before_get_url", before_get_url)
    crawler_strategy.set_hook("after_get_url", after_get_url)
    crawler_strategy.set_hook("before_return_html", before_return_html)

    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)


def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook("after_get_url", delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]"
    )
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)


def main():
    cprint(
        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
    )
    cprint(
        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
    )
    cprint(
        "If this is the first time you're running Crawl4ai, this might take a few seconds to load the required model files."
    )

    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)

    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)

    cprint(
        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
    )


if __name__ == "__main__":
    main()