# Explanation of bypass_cache and include_raw_html
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action. Becuase we already crawled this URL, the result will be fetched from the cache. Let's try it out!")
# Reads from cache
cprint("1️⃣ First crawl (caches the result):",True)
cprint("[LOG] 📦 [bold yellow]Craw result (without raw HTML content):[/bold yellow]")
print_result(result)
cprint("\n📄 The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response. By default is set to True. Let's move on to exploring different chunking strategies now!")
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.",True)
crawler.always_by_pass_cache=True
# Adding a chunking strategy: RegexChunking
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",True)
cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
result=crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=CosineStrategy(
semantic_filter="inflation rent prices",
)
)
cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
print_result(result)
# Adding an LLM extraction strategy without instructions
cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",True)
cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
# extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3), # Default is CosineStrategy
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY'),instruction="I am intrested in only financial news"),