feat(schema): improve HTML preprocessing for schema generation

Add a new preprocess_html_for_schema utility function that cleans HTML more effectively
before schema generation. It replaces the previous optimize_html call in the
GoogleSearchCrawler and adds smarter attribute handling and repeated-pattern detection.

Other changes:
- Update default provider from openai/gpt-4o-mini to openai/gpt-4o
- Add DEFAULT_PROVIDER_API_KEY constant
- Make LLMConfig creation more flexible with a create_llm_config helper
- Add new dependencies: zstandard and msgpack

This change improves schema generation reliability while reducing noise in the
processed HTML.
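
For orientation, a minimal usage sketch of the new pieces, assuming the import paths and
signatures shown in the diffs below; raw_html is a stand-in for page HTML you have already fetched:

from crawl4ai.utils import preprocess_html_for_schema
from crawl4ai.types import create_llm_config

raw_html = "<html><head><script>var x=1;</script></head><body><div class='result'>Hello</div></body></html>"

# Shrink raw HTML before asking the LLM to generate an extraction schema;
# the thresholds mirror the defaults added in utils.py.
cleaned_html = preprocess_html_for_schema(
    raw_html,
    text_threshold=100,          # truncate long text nodes
    attr_value_threshold=200,    # truncate long attribute values
    max_size=100000,             # hard cap on the returned HTML
)

# create_llm_config forwards *args/**kwargs straight to LLMConfig, so keyword
# arguments such as provider keep working at existing call sites.
llm_config = create_llm_config(provider="openai/gpt-4o")
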
UncleCode 2025-03-12 22:40:46 +08:00
parent 1630fbdafe
commit dc36997a08
8 changed files with 134 additions and 12 deletions

View File

@@ -1,6 +1,7 @@
import os
from .config import (
    DEFAULT_PROVIDER,
    DEFAULT_PROVIDER_API_KEY,
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    PROVIDER_MODELS,
@@ -1080,7 +1081,7 @@ class LLMConfig:
            self.api_token = os.getenv(api_token[4:])
        else:
            self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
                "OPENAI_API_KEY"
                DEFAULT_PROVIDER_API_KEY
            )
        self.base_url = base_url

View File

@@ -4,7 +4,8 @@ from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
DEFAULT_PROVIDER = "openai/gpt-4o"
DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {

View File

@@ -1,6 +1,6 @@
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.hub import BaseCrawler
from crawl4ai.utils import optimize_html, get_home_folder
from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from pathlib import Path
import json
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
        home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
        os.makedirs(f"{home_dir}/schema", exist_ok=True)
        cleaned_html = optimize_html(html, threshold=100)
        # cleaned_html = optimize_html(html, threshold=100)
        cleaned_html = preprocess_html_for_schema(html)
        organic_schema = None
        if os.path.exists(f"{home_dir}/schema/organic_schema.json"):

View File

@@ -34,7 +34,7 @@ from .model_loader import (
    calculate_batch_size
)
from .types import LLMConfig
from .types import LLMConfig, create_llm_config
from functools import partial
import numpy as np
@@ -757,8 +757,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
#######################################################
# New extraction strategies for JSON-based extraction #
#######################################################
class JsonElementExtractionStrategy(ExtractionStrategy):
    """
    Abstract base class for extracting structured JSON from HTML content.
@@ -1049,7 +1047,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        schema_type: str = "CSS", # or XPATH
        query: str = None,
        target_json_example: str = None,
        llm_config: 'LLMConfig' = None,
        llm_config: 'LLMConfig' = create_llm_config(),
        provider: str = None,
        api_token: str = None,
        **kwargs
@@ -1140,7 +1138,6 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
        except Exception as e:
            raise Exception(f"Failed to generate schema: {str(e)}")
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

View File

@@ -178,4 +178,10 @@ if TYPE_CHECKING:
        BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
        DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
        DeepCrawlDecorator as DeepCrawlDecoratorType,
    )
    )
def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
    from .async_configs import LLMConfig
    return LLMConfig(*args, **kwargs)

View File

@@ -26,7 +26,7 @@ import cProfile
import pstats
from functools import wraps
import asyncio
from lxml import etree, html as lhtml
import sqlite3
import hashlib
@@ -2617,3 +2617,116 @@ class HeadPeekr:
    def get_title(head_content: str):
        title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
        return title_match.group(1) if title_match else None
def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_threshold=200, max_size=100000):
    """
    Preprocess HTML to reduce size while preserving structure for schema generation.

    Args:
        html_content (str): Raw HTML content
        text_threshold (int): Maximum length for text nodes before truncation
        attr_value_threshold (int): Maximum length for attribute values before truncation
        max_size (int): Target maximum size for output HTML

    Returns:
        str: Preprocessed HTML content
    """
    try:
        # Parse HTML with error recovery
        parser = etree.HTMLParser(remove_comments=True, remove_blank_text=True)
        tree = lhtml.fromstring(html_content, parser=parser)

        # 1. Remove HEAD section (keep only BODY)
        head_elements = tree.xpath('//head')
        for head in head_elements:
            if head.getparent() is not None:
                head.getparent().remove(head)

        # 2. Define tags to remove completely
        tags_to_remove = [
            'script', 'style', 'noscript', 'iframe', 'canvas', 'svg',
            'video', 'audio', 'source', 'track', 'map', 'area'
        ]

        # Remove unwanted elements
        for tag in tags_to_remove:
            elements = tree.xpath(f'//{tag}')
            for element in elements:
                if element.getparent() is not None:
                    element.getparent().remove(element)

        # 3. Process remaining elements to clean attributes and truncate text
        for element in tree.iter():
            # Skip if we're at the root level
            if element.getparent() is None:
                continue

            # Clean non-essential attributes but preserve structural ones
            # attribs_to_keep = {'id', 'class', 'name', 'href', 'src', 'type', 'value', 'data-'}
            # This is more aggressive than the previous version
            attribs_to_keep = {'id', 'class', 'name', 'type', 'value'}
            # attributes_hates_truncate = ['id', 'class', "data-"]
            # Left empty on purpose: any over-long attribute value gets truncated;
            # pick a better CSS selector when building the schema instead
            attributes_hates_truncate = []

            # Process each attribute
            for attrib in list(element.attrib.keys()):
                # Keep if it's essential or starts with data-
                if not (attrib in attribs_to_keep or attrib.startswith('data-')):
                    element.attrib.pop(attrib)
                # Truncate long attribute values except for selectors
                elif attrib not in attributes_hates_truncate and len(element.attrib[attrib]) > attr_value_threshold:
                    element.attrib[attrib] = element.attrib[attrib][:attr_value_threshold] + '...'

            # Truncate text content if it's too long
            if element.text and len(element.text.strip()) > text_threshold:
                element.text = element.text.strip()[:text_threshold] + '...'

            # Also truncate tail text if present
            if element.tail and len(element.tail.strip()) > text_threshold:
                element.tail = element.tail.strip()[:text_threshold] + '...'

        # 4. Find repeated patterns and keep only a few examples
        # This is a simplistic approach - more sophisticated pattern detection could be implemented
        pattern_elements = {}
        for element in tree.xpath('//*[contains(@class, "")]'):
            parent = element.getparent()
            if parent is None:
                continue

            # Create a signature based on tag and classes
            classes = element.get('class', '')
            if not classes:
                continue

            signature = f"{element.tag}.{classes}"
            if signature in pattern_elements:
                pattern_elements[signature].append(element)
            else:
                pattern_elements[signature] = [element]

        # Keep only 3 examples of each repeating pattern
        for signature, elements in pattern_elements.items():
            if len(elements) > 3:
                # Keep the first two and the last element
                for element in elements[2:-1]:
                    if element.getparent() is not None:
                        element.getparent().remove(element)

        # 5. Convert back to string
        result = etree.tostring(tree, encoding='unicode', method='html')

        # If still over the size limit, apply more aggressive truncation
        if len(result) > max_size:
            return result[:max_size] + "..."

        return result

    except Exception as e:
        # Fallback for parsing errors
        return html_content[:max_size] if len(html_content) > max_size else html_content

View File

@@ -42,7 +42,9 @@ dependencies = [
    "pyperclip>=1.8.2",
    "faust-cchardet>=2.1.19",
    "aiohttp>=3.11.11",
    "humanize>=4.10.0"
    "humanize>=4.10.0",
    "zstandard>=0.23.0",
    "msgpack>=1.1.0"
]
classifiers = [
    "Development Status :: 4 - Beta",

View File

@@ -10,6 +10,7 @@ import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
import json
# Test HTML - A complex job board with companies, departments, and positions