Mirror of https://github.com/unclecode/crawl4ai.git, synced 2025-12-29 11:27:52 +00:00

fix(extraction): JsonCss selector and crawler improvements

- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one (see the sketch after the commit metadata below)
- Add robust error handling to page_need_scroll with a default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247

parent 0857c7b448
commit 72fbdac467

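Before the diff, a small self-contained sketch of what the `_get_elements` change means in practice. This is not part of the commit; the HTML snippet and selector are invented, and it only assumes BeautifulSoup, which is what `JsonCssExtractionStrategy` selects against (see the `_get_elements` hunk below).

```python
# Illustrative only: contrasts the old select_one-based lookup with the new
# select-based one on a BeautifulSoup tree. HTML and selector are made up.
from bs4 import BeautifulSoup

html = """
<ul id="items">
  <li class="item">first</li>
  <li class="item">second</li>
  <li class="item">third</li>
</ul>
"""
root = BeautifulSoup(html, "html.parser")

# Old behavior: select_one returns at most one match, so only "first" survives.
selected = root.select_one("li.item")
old_elements = [selected] if selected else []

# New behavior: select returns every match, so all three items are extracted.
new_elements = root.select("li.item")

print([el.get_text(strip=True) for el in old_elements])   # ['first']
print([el.get_text(strip=True) for el in new_elements])   # ['first', 'second', 'third']
```

With `select_one`, list-style schemas silently dropped every match after the first; `select` returns them all, which is what the commit message describes as the intended behavior.
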
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.246"
+__version__ = "0.4.247"

@@ -2163,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         }
         """)
 
-    async def page_need_scroll(self, page: Page):
+    async def page_need_scroll(self, page: Page) -> bool:
         """
         Determine whether the page need to scroll
 
@@ -2171,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             page: Playwright page object
 
         Returns:
-            page should scroll or not
+            bool: True if page needs scrolling
         """
-        return await page.evaluate("""
+        try:
+            need_scroll = await page.evaluate("""
             () => {
                 const scrollHeight = document.documentElement.scrollHeight;
                 const viewportHeight = window.innerHeight;
                 return scrollHeight > viewportHeight;
             }
-        """)
+            """)
+            return need_scroll
+        except Exception as e:
+            self.logger.warning(
+                message="Failed to check scroll need: {error}. Defaulting to True for safety.",
+                tag="SCROLL",
+                params={"error": str(e)}
+            )
+            return True  # Default to scrolling if check fails

@@ -122,92 +122,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         """
         return await asyncio.to_thread(self._scrap, url, html, **kwargs)
 
-    def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
-        """
-        Generate markdown content from cleaned HTML.
-
-        Args:
-            cleaned_html (str): The cleaned HTML content.
-            html (str): The original HTML content.
-            url (str): The URL of the page.
-            success (bool): Whether the content was successfully cleaned.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            Dict[str, Any]: A dictionary containing the generated markdown content.
-        """
-        markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
-
-        if markdown_generator:
-            try:
-                if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
-                    markdown_generator.content_filter = BM25ContentFilter(
-                        user_query=kwargs.get('fit_markdown_user_query', None),
-                        bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
-                    )
-
-                markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
-                    cleaned_html=cleaned_html,
-                    base_url=url,
-                    html2text_options=kwargs.get('html2text', {})
-                )
-
-                return {
-                    'markdown': markdown_result.raw_markdown,
-                    'fit_markdown': markdown_result.fit_markdown,
-                    'fit_html': markdown_result.fit_html,
-                    'markdown_v2': markdown_result
-                }
-            except Exception as e:
-                self._log('error',
-                    message="Error using new markdown generation strategy: {error}",
-                    tag="SCRAPE",
-                    params={"error": str(e)}
-                )
-                markdown_generator = None
-                return {
-                    'markdown': f"Error using new markdown generation strategy: {str(e)}",
-                    'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
-                    'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
-                    'markdown_v2': None
-                }
-
-        # Legacy method
-        """
-        # h = CustomHTML2Text()
-        # h.update_params(**kwargs.get('html2text', {}))
-        # markdown = h.handle(cleaned_html)
-        # markdown = markdown.replace(' ```', '```')
-
-        # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
-        # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
-
-        # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
-        #     content_filter = kwargs.get('content_filter', None)
-        #     if not content_filter:
-        #         content_filter = BM25ContentFilter(
-        #             user_query=kwargs.get('fit_markdown_user_query', None),
-        #             bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
-        #         )
-        #     fit_html = content_filter.filter_content(html)
-        #     fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
-        #     fit_markdown = h.handle(fit_html)
-
-        # markdown_v2 = MarkdownGenerationResult(
-        #     raw_markdown=markdown,
-        #     markdown_with_citations=markdown,
-        #     references_markdown=markdown,
-        #     fit_markdown=fit_markdown
-        # )
-
-        # return {
-        #     'markdown': markdown,
-        #     'fit_markdown': fit_markdown,
-        #     'fit_html': fit_html,
-        #     'markdown_v2' : markdown_v2
-        # }
-        """
-
     def flatten_nested_elements(self, node):
         """
         Flatten nested elements in a HTML tree.

@@ -798,13 +712,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
         cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
 
-        # markdown_content = self._generate_markdown_content(
-        #     cleaned_html=cleaned_html,
-        #     html=html,
-        #     url=url,
-        #     success=success,
-        #     **kwargs
-        # )
 
         return {
             # **markdown_content,

@@ -974,8 +974,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
         return parsed_html.select(selector)
 
     def _get_elements(self, element, selector: str):
-        selected = element.select_one(selector)
-        return [selected] if selected else []
+        return element.select(selector)
 
     def _get_element_text(self, element) -> str:
         return element.get_text(strip=True)

@@ -1050,3 +1049,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
     def _get_element_attribute(self, element, attribute: str):
         return element.get(attribute)
 
+

@@ -21,6 +21,8 @@ import textwrap
 import cProfile
 import pstats
 from functools import wraps
+import asyncio
+
 
 class InvalidCSSSelectorError(Exception):
     pass

@@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]:
 
     return content_paths
 
+def configure_windows_event_loop():
+    """
+    Configure the Windows event loop to use ProactorEventLoop.
+    This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
+
+    This function should only be called on Windows systems and before any async operations.
+    On non-Windows systems, this function does nothing.
+
+    Example:
+        ```python
+        from crawl4ai.async_configs import configure_windows_event_loop
+
+        # Call this before any async operations if you're on Windows
+        configure_windows_event_loop()
+        ```
+    """
+    if platform.system() == 'Windows':
+        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
 def get_error_context(exc_info, context_lines: int = 5):
     """
     Extract error context with more reliable line number tracking.

@@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might twea
 
 ---
 
-## 5. Putting It All Together
+## 5. Windows-Specific Configuration
+
+When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations.
+
+To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations:
+
+```python
+from crawl4ai.utils import configure_windows_event_loop
+
+# Call this before any async operations if you're on Windows
+configure_windows_event_loop()
+
+# Your AsyncWebCrawler code here
+```
+
+---
+
+## 6. Putting It All Together
 
 Here’s a slightly more in-depth example that shows off a few key config parameters at once:
 

@@ -193,7 +210,7 @@ if __name__ == "__main__":
 
 ---
 
-## 6. Next Steps
+## 7. Next Steps
 
 - **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md).
 - **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md).
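
The tutorial hunk above ends its code block with a stub (`# Your AsyncWebCrawler code here`). As a rough sketch only, assuming the standard `AsyncWebCrawler`/`arun` usage from the crawl4ai README rather than anything introduced by this commit, the Windows setup might be combined with a crawl like this:

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop

# Configure the Proactor event loop before any async work (a no-op off Windows).
configure_windows_event_loop()

async def main():
    # Standard crawl4ai usage; the URL is only a placeholder.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print((result.markdown or "")[:300])

if __name__ == "__main__":
    asyncio.run(main())
```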