import os
import time
import json
import base64

from crawl4ai.async_configs import LlmConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console
from functools import lru_cache

console = Console()


@lru_cache()
def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def print_result(result):
    # Print each key on one line with just the first 20 characters of its value and three dots
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")


def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()


def basic_usage(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def basic_usage_some_params(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
    )
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)


def understanding_parameters(crawler):
    cprint(
        "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
    )
    cprint(
        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
    )

    # First crawl (caches the result)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
    )
    print_result(result)

    # Second crawl (force a fresh crawl, bypassing the cache)
    cprint("2️⃣ Second crawl (Force to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
    )
    print_result(result)


def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint(
        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
        True,
    )
    cprint(
        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"]),
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint(
        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
        True,
    )
    cprint(
        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)


def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint(
        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
        True,
    )
    cprint(
        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        ),
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint(
        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
    print_result(result)


def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint(
        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
        True,
    )
    cprint(
        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(
                provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
            )
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint(
        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
        True,
    )
    cprint(
        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(
                provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
            ),
            instruction="I am interested in only financial news",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
    print_result(result)

    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llmConfig=LlmConfig(
                provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
            ),
            instruction="Extract only content related to technology",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
    print_result(result)


def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint(
        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
        True,
    )
    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)


def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def multiple_scripts(crawler):
    # Passing multiple JavaScript snippets to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = [
        """
        const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
        loadMoreButton && loadMoreButton.click();
        """
    ] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get("https://example.com/login")

        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        driver.find_element(By.NAME, "username").send_keys("testuser")
        driver.find_element(By.NAME, "password").send_keys("password123")
        driver.find_element(By.NAME, "login").click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "welcome"))
        )
        # Add a custom cookie
        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent
        driver.execute_cdp_cmd("Network.enable", {})
        # Add a custom header
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
        )
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the length of the HTML
        print(len(html))
        return driver

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
        True,
    )

    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook("on_driver_created", on_driver_created)
    crawler_strategy.set_hook("before_get_url", before_get_url)
    crawler_strategy.set_hook("after_get_url", after_get_url)
    crawler_strategy.set_hook("before_return_html", before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)


def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook("after_get_url", delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]"
    )
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)


def main():
    cprint(
        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
    )
    cprint(
        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
    )
    cprint(
        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
    )

    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)

    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)

    cprint(
        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
    )


if __name__ == "__main__":
    main()