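"""Crawl4ai quickstart: a guided tour of the legacy WebCrawler API.

Walks through basic crawling, caching behavior, screenshots, chunking and
extraction strategies (including LLM-based extraction), CSS-targeted
extraction, JavaScript interaction, and crawler hooks.
"""
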
import base64
import json
import os
import time
from functools import lru_cache

from crawl4ai.types import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console

console = Console()


@lru_cache()
def create_crawler():
    # Cached so repeated calls reuse a single warmed-up crawler instance
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def print_result(result):
    # Print each key on one line, showing the first 20 characters of its value plus an ellipsis
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")


def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...")
        input()


def basic_usage(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def basic_usage_some_params(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage with parameters: Provide a URL and a word_count_threshold, and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
    )
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the base64-encoded screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)


def understanding_parameters(crawler):
    cprint(
        "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
    )
    cprint(
        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
    )

    # First crawl (caches the result)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time:.2f} seconds and result (now cached):[/bold yellow]"
    )
    print_result(result)

    # Force a fresh crawl, bypassing the cache
    cprint("2️⃣ Second crawl (forced to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time:.2f} seconds and result (forced to crawl):[/bold yellow]"
    )
    print_result(result)
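
# 'include_raw_html' is mentioned above but not demonstrated. A minimal sketch,
# assuming your installed crawl4ai version supports it on run():
#
#     result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=True)
#     # the result would then carry the unprocessed page HTML alongside the cleaned output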


def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint(
        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
        True,
    )
    cprint(
        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"]),
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint(
        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
        True,
    )
    cprint(
        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)
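
# What RegexChunking does, independent of any crawl. A hedged sketch, assuming the
# strategy exposes a chunk() method as in the legacy chunking API:
#
#     chunks = RegexChunking(patterns=["\n\n"]).chunk("first paragraph\n\nsecond paragraph")
#     # expected: ["first paragraph", "second paragraph"]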


def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint(
        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
        True,
    )
    cprint(
        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        ),
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint(
        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
    print_result(result)
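
# Rough reading of the CosineStrategy parameters used above (treat these as starting
# points, not tuned values): word_count_threshold drops short blocks, max_dist and
# linkage_method control the hierarchical clustering of text embeddings, top_k limits
# how many clusters are returned, and sim_threshold is the minimum similarity for a match.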


def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint(
        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
        True,
    )
    cprint(
        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint(
        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
        True,
    )
    cprint(
        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="I am interested in only financial news",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
    print_result(result)

    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="Extract only content related to technology",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
    print_result(result)
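
# LLMExtractionStrategy can also be pointed at structured output. A hedged sketch,
# assuming your crawl4ai version supports schema-based extraction (the Headline model
# here is hypothetical):
#
#     from pydantic import BaseModel
#
#     class Headline(BaseModel):
#         title: str
#         summary: str
#
#     strategy = LLMExtractionStrategy(
#         llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
#         schema=Headline.model_json_schema(),
#         extraction_type="schema",
#         instruction="From the page, extract a title and one-line summary for each story",
#     )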


def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint(
        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
        True,
    )
    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)
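
# css_selector accepts any valid CSS selector string, so narrower targets work the same
# way, e.g. crawler.run(url="https://www.nbcnews.com/business", css_selector="article h2")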


def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def multiple_scripts(crawler):
    # Passing a list of JavaScript snippets to run in sequence
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing multiple JavaScript snippets to click the 'Load More' button twice![/bold cyan]",
        True,
    )
    cprint(
        "In this example we click the 'Load More' button twice by passing the same JavaScript snippet as a list."
    )
    js_code = [
        """
        const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
        loadMoreButton && loadMoreButton.click();
        """
    ] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get("https://example.com/login")
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        driver.find_element(By.NAME, "username").send_keys("testuser")
        driver.find_element(By.NAME, "password").send_keys("password123")
        driver.find_element(By.NAME, "login").click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "welcome"))
        )
        # Add a custom cookie
        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent
        driver.execute_cdp_cmd("Network.enable", {})
        # Add a custom header
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
        )
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the HTML length
        print(len(html))
        return driver

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
        True,
    )
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook("on_driver_created", on_driver_created)
    crawler_strategy.set_hook("before_get_url", before_get_url)
    crawler_strategy.set_hook("after_get_url", after_get_url)
    crawler_strategy.set_hook("before_return_html", before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
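
# The hooks above fire in lifecycle order: on_driver_created (once, when the Selenium
# driver is built), before_get_url (before navigation), after_get_url (after
# navigation), and before_return_html (just before the HTML is handed back).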


def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    # Local helper; intentionally shadows the module-level create_crawler for this demo
    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook("after_get_url", delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]"
    )
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)


def main():
    cprint(
        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
    )
    cprint(
        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
    )
    cprint(
        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
    )
    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)

    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)

    cprint(
        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
    )


if __name__ == "__main__":
    main()