# crawl4ai/docs/quickstart.py

import os
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *


def main():
    """Crawl a single page with explicit settings and print the result.

    Builds a ``WebCrawler``, calls ``warmup()`` on it, runs one crawl of the
    NBC News business page with the chunking and extraction strategies spelled
    out, then prints the dumped result to stdout.
    """
    web_crawler = WebCrawler()
    web_crawler.warmup()

    # Build the strategies up front so the run() call below reads as a flat
    # list of options.
    chunker = RegexChunking(patterns=["\n\n"])  # Default is RegexChunking
    extractor = CosineStrategy(
        word_count_threshold=20,
        max_dist=0.2,
        linkage_method="ward",
        top_k=3,
    )  # Default is CosineStrategy
    # Alternative extraction strategy:
    # extractor = LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))

    # Single page crawl
    crawl_result = web_crawler.run(
        url="https://www.nbcnews.com/business",
        # Minimum word count for a HTML tag to be considered as a worthy block
        word_count_threshold=5,
        chunking_strategy=chunker,
        extraction_strategy=extractor,
        bypass_cache=True,
        # Whether to extract semantical blocks of text from the HTML
        extract_blocks=True,
        # E.g. "div.article-body", or all H2 tags like "h2"
        css_selector="",
        verbose=True,
        # Whether to include the raw HTML content in the response
        include_raw_html=True,
    )

    print("[LOG] 📦 Crawl result:")
    print(crawl_result.model_dump())


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
- Test all methods - Update index.hml - Update Readme - Resolve some bugs 2024-05-14 21:27:41 +08:00			`import os`
			`from crawl4ai.web_crawler import WebCrawler`
			`from crawl4ai.chunking_strategy import *`
			`from crawl4ai.extraction_strategy import *`


			`def main():`
			`crawler = WebCrawler()`
			`crawler.warmup()`

			`# Single page crawl`
			`result = crawler.run(`
			`url="https://www.nbcnews.com/business",`
			`word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block`
			`chunking_strategy=RegexChunking(patterns=["\n\n"]), # Default is RegexChunking`
			`extraction_strategy=CosineStrategy(`
			`word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3`
			`), # Default is CosineStrategy`
			`# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),`
			`bypass_cache=True,`
			`extract_blocks=True, # Whether to extract semantical blocks of text from the HTML`
			`css_selector="", # Eg: "div.article-body" or all H2 tags liek "h2"`
			`verbose=True,`
			`include_raw_html=True, # Whether to include the raw HTML content in the response`
			`)`


			`print("[LOG] 📦 Crawl result:")`
			`print(result.model_dump())`


			`if __name__ == "__main__":`
			`main()`