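"""Crawl4ai quickstart: a guided tour of the legacy WebCrawler API.

Walks through basic crawling, caching behavior, screenshots, chunking and
extraction strategies (including LLM-based extraction), CSS-targeted
extraction, JavaScript interaction, and crawler hooks.
"""
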
import base64
import json
import os
import time
from functools import lru_cache

from crawl4ai.types import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console

console = Console()


@lru_cache()
def create_crawler():
    # Cached so repeated calls reuse a single warmed-up crawler instance
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def print_result(result):
    # Print each key on one line, showing the first 20 characters of its value plus an ellipsis
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")


def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...")
        input()


def basic_usage(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def basic_usage_some_params(crawler):
    cprint(
        "🛠️ [bold cyan]Basic Usage with parameters: Provide a URL and a word_count_threshold, and let Crawl4ai do the magic![/bold cyan]"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
    )
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)


def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the base64-encoded screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)


def understanding_parameters(crawler):
    cprint(
        "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
    )
    cprint(
        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
    )

    # First crawl (caches the result)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time:.2f} seconds and result (now cached):[/bold yellow]"
    )
    print_result(result)

    # Force a fresh crawl, bypassing the cache
    cprint("2️⃣ Second crawl (forced to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(
        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time:.2f} seconds and result (forced to crawl):[/bold yellow]"
    )
    print_result(result)
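
# 'include_raw_html' is mentioned above but not demonstrated. A minimal sketch,
# assuming your installed crawl4ai version supports it on run():
#
#     result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=True)
#     # the result would then carry the unprocessed page HTML alongside the cleaned output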


def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint(
        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
        True,
    )
    cprint(
        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"]),
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint(
        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
        True,
    )
    cprint(
        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)
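
# What RegexChunking does, independent of any crawl. A hedged sketch, assuming the
# strategy exposes a chunk() method as in the legacy chunking API:
#
#     chunks = RegexChunking(patterns=["\n\n"]).chunk("first paragraph\n\nsecond paragraph")
#     # expected: ["first paragraph", "second paragraph"]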


def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint(
        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
        True,
    )
    cprint(
        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        ),
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint(
        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
    print_result(result)
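
# Rough reading of the CosineStrategy parameters used above (treat these as starting
# points, not tuned values): word_count_threshold drops short blocks, max_dist and
# linkage_method control the hierarchical clustering of text embeddings, top_k limits
# how many clusters are returned, and sim_threshold is the minimum similarity for a match.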


def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint(
        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
        True,
    )
    cprint(
        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint(
        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
        True,
    )
    cprint(
        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="I am interested in only financial news",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
    print_result(result)

    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
            instruction="Extract only content related to technology",
        ),
    )
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
    print_result(result)
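
# LLMExtractionStrategy can also be pointed at structured output. A hedged sketch,
# assuming your crawl4ai version supports schema-based extraction (the Headline model
# here is hypothetical):
#
#     from pydantic import BaseModel
#
#     class Headline(BaseModel):
#         title: str
#         summary: str
#
#     strategy = LLMExtractionStrategy(
#         llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")),
#         schema=Headline.model_json_schema(),
#         extraction_type="schema",
#         instruction="From the page, extract a title and one-line summary for each story",
#     )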


def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint(
        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
        True,
    )
    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)
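
# css_selector accepts any valid CSS selector string, so narrower targets work the same
# way, e.g. crawler.run(url="https://www.nbcnews.com/business", css_selector="article h2")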


def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def multiple_scripts(crawler):
    # Passing a list of JavaScript snippets to run in sequence
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing multiple JavaScript snippets to click the 'Load More' button twice![/bold cyan]",
        True,
    )
    cprint(
        "In this example we click the 'Load More' button twice by passing the same JavaScript snippet as a list."
    )
    js_code = [
        """
        const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
        loadMoreButton && loadMoreButton.click();
        """
    ] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(result)


def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get("https://example.com/login")
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        driver.find_element(By.NAME, "username").send_keys("testuser")
        driver.find_element(By.NAME, "password").send_keys("password123")
        driver.find_element(By.NAME, "login").click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "welcome"))
        )
        # Add a custom cookie
        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent
        driver.execute_cdp_cmd("Network.enable", {})
        # Add a custom header
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
        )
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the HTML length
        print(len(html))
        return driver

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
        True,
    )
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook("on_driver_created", on_driver_created)
    crawler_strategy.set_hook("before_get_url", before_get_url)
    crawler_strategy.set_hook("after_get_url", after_get_url)
    crawler_strategy.set_hook("before_return_html", before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
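
# The hooks above fire in lifecycle order: on_driver_created (once, when the Selenium
# driver is built), before_get_url (before navigation), after_get_url (after
# navigation), and before_return_html (just before the HTML is handed back).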


def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    # Local helper; intentionally shadows the module-level create_crawler for this demo
    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook("after_get_url", delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]"
    )
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)


def main():
    cprint(
        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
    )
    cprint(
        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
    )
    cprint(
        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
    )
    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)

    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)

    cprint(
        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
    )


if __name__ == "__main__":
    main()