# crawl4ai/tests/async/test_chunking_and_extraction_strategies.py

import os
import sys
import pytest
import json

# Make the directory one level above this file's folder importable by
# appending it to the module search path.
_this_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(_this_dir)
sys.path.append(parent_dir)

from crawl4ai import LLMConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import LLMExtractionStrategy


@pytest.mark.asyncio
async def test_regex_chunking():
    """Crawl a live page with a blank-line regex chunker and check the output.

    The crawl must report success, produce non-empty ``extracted_content``,
    and that content must decode as JSON holding more than one chunk.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Split on double newlines; the cache is bypassed on every run.
        crawl_kwargs = {
            "url": "https://www.nbcnews.com/business",
            "chunking_strategy": RegexChunking(patterns=["\n\n"]),
            "bypass_cache": True,
        }
        outcome = await crawler.arun(**crawl_kwargs)

        assert outcome.success
        assert outcome.extracted_content

        decoded_chunks = json.loads(outcome.extracted_content)
        # A single chunk would mean the pattern never split the content.
        assert len(decoded_chunks) > 1


# @pytest.mark.asyncio
# async def test_cosine_strategy():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.nbcnews.com/business"
#         extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
#         result = await crawler.arun(
#             url=url,
#             extraction_strategy=extraction_strategy,
#             bypass_cache=True
#         )
#         assert result.success
#         assert result.extracted_content
#         extracted_data = json.loads(result.extracted_content)
#         assert len(extracted_data) > 0
#         assert all('tags' in item for item in extracted_data)


@pytest.mark.asyncio
async def test_llm_extraction_strategy():
    """Run an LLM-backed extraction against a live page and validate the result.

    The strategy is configured for the ``openai/gpt-4o-mini`` provider using
    the token read from the ``OPENAI_API_KEY`` environment variable. When that
    variable is unset or empty the test is skipped instead of being run with
    missing credentials, so environments without a key do not report a
    failure that has nothing to do with the extraction logic.

    On a successful crawl the extracted content must decode as JSON, be
    non-empty, and every decoded item must contain a ``"content"`` key.
    """
    api_token = os.getenv("OPENAI_API_KEY")
    if not api_token:
        # No credentials available: mark the test as skipped rather than
        # letting it run without a usable token.
        pytest.skip("OPENAI_API_KEY is not set; skipping LLM extraction test")

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=api_token),
            instruction="Extract only content related to technology",
        )
        # The cache is bypassed on every run.
        result = await crawler.arun(
            url=url, extraction_strategy=extraction_strategy, bypass_cache=True
        )
        assert result.success
        assert result.extracted_content
        extracted_data = json.loads(result.extracted_content)
        assert len(extracted_data) > 0
        assert all("content" in item for item in extracted_data)


# @pytest.mark.asyncio
# async def test_combined_chunking_and_extraction():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.nbcnews.com/business"
#         chunking_strategy = RegexChunking(patterns=["\n\n"])
#         extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
#         result = await crawler.arun(
#             url=url,
#             chunking_strategy=chunking_strategy,
#             extraction_strategy=extraction_strategy,
#             bypass_cache=True
#         )
#         assert result.success
#         assert result.extracted_content
#         extracted_data = json.loads(result.extracted_content)
#         assert len(extracted_data) > 0
#         assert all('tags' in item for item in extracted_data)
#         assert all('content' in item for item in extracted_data)

# Debugging convenience: running this file directly hands it to pytest
# in verbose mode.
if __name__ == "__main__":
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00			`import os`
			`import sys`
			`import pytest`
			`import json`

			`# Add the parent directory to the Python path`
			`parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))`
			`sys.path.append(parent_dir)`

feat(browser): add standalone CDP browser launch and lxml extraction strategy Add new features to enhance browser automation and HTML extraction: - Add CDP browser launch capability with customizable ports and profiles - Implement JsonLxmlExtractionStrategy for faster HTML parsing - Add CLI command 'crwl cdp' for launching standalone CDP browsers - Support connecting to external CDP browsers via URL - Optimize selector caching and context-sensitive queries BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai 2025-03-07 20:55:56 +08:00			`from crawl4ai import LLMConfig`
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00			`from crawl4ai.async_webcrawler import AsyncWebCrawler`
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00			`from crawl4ai.chunking_strategy import RegexChunking`
			`from crawl4ai.extraction_strategy import LLMExtractionStrategy`

Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00
			`@pytest.mark.asyncio`
			`async def test_regex_chunking():`
			`async with AsyncWebCrawler(verbose=True) as crawler:`
			`url = "https://www.nbcnews.com/business"`
			`chunking_strategy = RegexChunking(patterns=["\n\n"])`
			`result = await crawler.arun(`
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00			`url=url, chunking_strategy=chunking_strategy, bypass_cache=True`
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00			`)`
			`assert result.success`
			`assert result.extracted_content`
			`chunks = json.loads(result.extracted_content)`
			`assert len(chunks) > 1 # Ensure multiple chunks were created`

Apply Ruff Corrections 2025-01-13 19:19:58 +08:00
Update .gitignore to ignore temporary and test directories 2024-09-26 15:09:49 +08:00			`# @pytest.mark.asyncio`
			`# async def test_cosine_strategy():`
			`# async with AsyncWebCrawler(verbose=True) as crawler:`
			`# url = "https://www.nbcnews.com/business"`
			`# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)`
			`# result = await crawler.arun(`
			`# url=url,`
			`# extraction_strategy=extraction_strategy,`
			`# bypass_cache=True`
			`# )`
			`# assert result.success`
			`# assert result.extracted_content`
			`# extracted_data = json.loads(result.extracted_content)`
			`# assert len(extracted_data) > 0`
			`# assert all('tags' in item for item in extracted_data)`
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00			`@pytest.mark.asyncio`
			`async def test_llm_extraction_strategy():`
			`async with AsyncWebCrawler(verbose=True) as crawler:`
			`url = "https://www.nbcnews.com/business"`
			`extraction_strategy = LLMExtractionStrategy(`
refactor(llm): rename LlmConfig to LLMConfig for consistency Rename LlmConfig to LLMConfig across the codebase to follow consistent naming conventions. Update all imports and usages to use the new name. Update documentation and examples to reflect the change. BREAKING CHANGE: LlmConfig has been renamed to LLMConfig. Users need to update their imports and usage. 2025-03-05 14:17:04 +08:00			`llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),`
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00			`instruction="Extract only content related to technology",`
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00			`)`
			`result = await crawler.arun(`
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00			`url=url, extraction_strategy=extraction_strategy, bypass_cache=True`
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00			`)`
			`assert result.success`
			`assert result.extracted_content`
			`extracted_data = json.loads(result.extracted_content)`
			`assert len(extracted_data) > 0`
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00			`assert all("content" in item for item in extracted_data)`

Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00
Update .gitignore to ignore temporary and test directories 2024-09-26 15:09:49 +08:00			`# @pytest.mark.asyncio`
			`# async def test_combined_chunking_and_extraction():`
			`# async with AsyncWebCrawler(verbose=True) as crawler:`
			`# url = "https://www.nbcnews.com/business"`
			`# chunking_strategy = RegexChunking(patterns=["\n\n"])`
			`# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)`
			`# result = await crawler.arun(`
			`# url=url,`
			`# chunking_strategy=chunking_strategy,`
			`# extraction_strategy=extraction_strategy,`
			`# bypass_cache=True`
			`# )`
			`# assert result.success`
			`# assert result.extracted_content`
			`# extracted_data = json.loads(result.extracted_content)`
			`# assert len(extracted_data) > 0`
			`# assert all('tags' in item for item in extracted_data)`
			`# assert all('content' in item for item in extracted_data)`
Add Async Version, JsonCss Extrator 2024-09-03 01:27:00 +08:00
			`# Entry point for debugging`
			`if __name__ == "__main__":`
Apply Ruff Corrections 2025-01-13 19:19:58 +08:00			`pytest.main([__file__, "-v"])`