import os
import time
import json
from functools import lru_cache

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console

console = Console()


@lru_cache()
def create_crawler():
    # Build one shared WebCrawler and cache it, so repeated calls reuse the same warmed-up instance.
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def print_result(result):
    # Print each key on one line, showing just the first 20 characters of its value plus an ellipsis.
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")


def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()


def basic_usage(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business")
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)
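
    # A hedged peek at one concrete field (a sketch: it assumes the result model
    # exposes a 'markdown' attribute; getattr keeps it safe if it does not):
    if getattr(result, "markdown", None):
        console.print(f"\t[bold]Markdown preview:[/bold] {result.markdown[:80]}...")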


def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")

    # First crawl (served from cache: basic_usage() above already fetched this URL)
    cprint("1️⃣ First crawl (uses the cache):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time:.2f} seconds; result (from cache):[/bold yellow]")
    print_result(result)

    # Second crawl, forced to fetch the page again with bypass_cache=True
    cprint("2️⃣ Second crawl (forced to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time:.2f} seconds; result (cache bypassed):[/bold yellow]")
    print_result(result)
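
    # The intro above also names 'include_raw_html'. A minimal, hedged sketch,
    # assuming the flag simply asks the crawler to keep the raw page HTML on the result:
    cprint("3️⃣ Third crawl (include_raw_html=True):", True)
    result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=True)
    cprint("[LOG] 📦 [bold yellow]Crawl result with raw HTML included:[/bold yellow]")
    print_result(result)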


def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
    cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"])
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)
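
    # A hedged variation (not in the original tour): 'patterns' is a list, so you can
    # pass several regexes at once; this sketch assumes each pattern adds split points.
    cprint("You can also combine several patterns, e.g. paragraph breaks and horizontal rules:")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n", r"---+"])
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking (multiple patterns) result:[/bold yellow]")
    print_result(result)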


def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        )
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint("You can pass other parameters, like 'semantic_filter', to CosineStrategy to keep only blocks that are semantically close to a given query. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        )
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
    print_result(result)


def add_llm_extraction_strategy(crawler):
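    # A hedged convenience guard (not in the original quickstart): skip these LLM
    # demos gracefully when no OpenAI key is available, so the rest of the tour still runs.
    if not os.getenv("OPENAI_API_KEY"):
        cprint("[bold red]OPENAI_API_KEY is not set; skipping the LLM extraction examples.[/bold red]")
        return
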
    # Adding an LLM extraction strategy without instructions
    cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
    cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
    cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="I am interested in only financial news"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
    print_result(result)

    # One more run, this time asking the model to keep only technology-related content.
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="Extract only content related to technology"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
    print_result(result)
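
    # A hedged peek at the structured output (a sketch: it assumes extracted_content
    # is a JSON list of blocks, the same shape print_result() parses above):
    if result.extracted_content:
        blocks = json.loads(result.extracted_content)
        if blocks:
            console.print(f"\t[bold]First extracted block:[/bold] {str(blocks[0])[:80]}...")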


def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        css_selector="h2"
    )
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)


def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # Build a fresh crawler whose Selenium strategy injects the JavaScript on each page load.
    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business")
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)


def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
    cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load the required model files.")
    crawler = create_crawler()

    basic_usage(crawler)
    understanding_parameters(crawler)

    # From here on, always bypass the cache so each demo performs a fresh crawl.
    crawler.always_by_pass_cache = True
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)

    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")


if __name__ == "__main__":
    main()