Mirror of https://github.com/unclecode/crawl4ai.git, synced 2025-12-29 11:27:52 +00:00

fix(extraction): JsonCss selector and crawler improvements

- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one (see the sketch after the commit metadata below)
- Add robust error handling to page_need_scroll with a default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247

parent 0857c7b448
commit 72fbdac467

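Before the diff, a small self-contained sketch of what the `_get_elements` change means in practice. This is not part of the commit; the HTML snippet and selector are invented, and it only assumes BeautifulSoup, which is what `JsonCssExtractionStrategy` selects against (see the `_get_elements` hunk below).

```python
# Illustrative only: contrasts the old select_one-based lookup with the new
# select-based one on a BeautifulSoup tree. HTML and selector are made up.
from bs4 import BeautifulSoup

html = """
<ul id="items">
  <li class="item">first</li>
  <li class="item">second</li>
  <li class="item">third</li>
</ul>
"""
root = BeautifulSoup(html, "html.parser")

# Old behavior: select_one returns at most one match, so only "first" survives.
selected = root.select_one("li.item")
old_elements = [selected] if selected else []

# New behavior: select returns every match, so all three items are extracted.
new_elements = root.select("li.item")

print([el.get_text(strip=True) for el in old_elements])   # ['first']
print([el.get_text(strip=True) for el in new_elements])   # ['first', 'second', 'third']
```

With `select_one`, list-style schemas silently dropped every match after the first; `select` returns them all, which is what the commit message describes as the intended behavior.
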
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.246"
+__version__ = "0.4.247"

@@ -2163,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         }
         """)
 
-    async def page_need_scroll(self, page: Page):
+    async def page_need_scroll(self, page: Page) -> bool:
         """
         Determine whether the page need to scroll
 
@@ -2171,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             page: Playwright page object
 
         Returns:
-            page should scroll or not
+            bool: True if page needs scrolling
         """
-        return await page.evaluate("""
+        try:
+            need_scroll = await page.evaluate("""
             () => {
                 const scrollHeight = document.documentElement.scrollHeight;
                 const viewportHeight = window.innerHeight;
                 return scrollHeight > viewportHeight;
             }
-        """)
+            """)
+            return need_scroll
+        except Exception as e:
+            self.logger.warning(
+                message="Failed to check scroll need: {error}. Defaulting to True for safety.",
+                tag="SCROLL",
+                params={"error": str(e)}
+            )
+            return True  # Default to scrolling if check fails

@@ -122,92 +122,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         """
         return await asyncio.to_thread(self._scrap, url, html, **kwargs)
 
-    def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
-        """
-        Generate markdown content from cleaned HTML.
-
-        Args:
-            cleaned_html (str): The cleaned HTML content.
-            html (str): The original HTML content.
-            url (str): The URL of the page.
-            success (bool): Whether the content was successfully cleaned.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            Dict[str, Any]: A dictionary containing the generated markdown content.
-        """
-        markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
-
-        if markdown_generator:
-            try:
-                if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
-                    markdown_generator.content_filter = BM25ContentFilter(
-                        user_query=kwargs.get('fit_markdown_user_query', None),
-                        bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
-                    )
-
-                markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
-                    cleaned_html=cleaned_html,
-                    base_url=url,
-                    html2text_options=kwargs.get('html2text', {})
-                )
-
-                return {
-                    'markdown': markdown_result.raw_markdown,
-                    'fit_markdown': markdown_result.fit_markdown,
-                    'fit_html': markdown_result.fit_html,
-                    'markdown_v2': markdown_result
-                }
-            except Exception as e:
-                self._log('error',
-                    message="Error using new markdown generation strategy: {error}",
-                    tag="SCRAPE",
-                    params={"error": str(e)}
-                )
-                markdown_generator = None
-                return {
-                    'markdown': f"Error using new markdown generation strategy: {str(e)}",
-                    'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
-                    'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
-                    'markdown_v2': None
-                }
-
-        # Legacy method
-        """
-        # h = CustomHTML2Text()
-        # h.update_params(**kwargs.get('html2text', {}))
-        # markdown = h.handle(cleaned_html)
-        # markdown = markdown.replace(' ```', '```')
-
-        # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
-        # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
-
-        # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
-        #     content_filter = kwargs.get('content_filter', None)
-        #     if not content_filter:
-        #         content_filter = BM25ContentFilter(
-        #             user_query=kwargs.get('fit_markdown_user_query', None),
-        #             bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
-        #         )
-        #     fit_html = content_filter.filter_content(html)
-        #     fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
-        #     fit_markdown = h.handle(fit_html)
-
-        # markdown_v2 = MarkdownGenerationResult(
-        #     raw_markdown=markdown,
-        #     markdown_with_citations=markdown,
-        #     references_markdown=markdown,
-        #     fit_markdown=fit_markdown
-        # )
-
-        # return {
-        #     'markdown': markdown,
-        #     'fit_markdown': fit_markdown,
-        #     'fit_html': fit_html,
-        #     'markdown_v2' : markdown_v2
-        # }
-        """
-
     def flatten_nested_elements(self, node):
         """
         Flatten nested elements in a HTML tree.

@@ -798,13 +712,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
         cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
 
-        # markdown_content = self._generate_markdown_content(
-        #     cleaned_html=cleaned_html,
-        #     html=html,
-        #     url=url,
-        #     success=success,
-        #     **kwargs
-        # )
 
         return {
             # **markdown_content,

@@ -974,8 +974,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
         return parsed_html.select(selector)
 
     def _get_elements(self, element, selector: str):
-        selected = element.select_one(selector)
-        return [selected] if selected else []
+        return element.select(selector)
 
     def _get_element_text(self, element) -> str:
         return element.get_text(strip=True)

@@ -1050,3 +1049,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
     def _get_element_attribute(self, element, attribute: str):
         return element.get(attribute)
 
+

@@ -21,6 +21,8 @@ import textwrap
 import cProfile
 import pstats
 from functools import wraps
+import asyncio
+
 
 class InvalidCSSSelectorError(Exception):
     pass

@@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]:
 
     return content_paths
 
+def configure_windows_event_loop():
+    """
+    Configure the Windows event loop to use ProactorEventLoop.
+    This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
+
+    This function should only be called on Windows systems and before any async operations.
+    On non-Windows systems, this function does nothing.
+
+    Example:
+        ```python
+        from crawl4ai.async_configs import configure_windows_event_loop
+
+        # Call this before any async operations if you're on Windows
+        configure_windows_event_loop()
+        ```
+    """
+    if platform.system() == 'Windows':
+        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
 def get_error_context(exc_info, context_lines: int = 5):
     """
     Extract error context with more reliable line number tracking.

@@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might twea
 
 ---
 
-## 5. Putting It All Together
+## 5. Windows-Specific Configuration
+
+When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations.
+
+To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations:
+
+```python
+from crawl4ai.utils import configure_windows_event_loop
+
+# Call this before any async operations if you're on Windows
+configure_windows_event_loop()
+
+# Your AsyncWebCrawler code here
+```
+
+---
+
+## 6. Putting It All Together
 
 Here’s a slightly more in-depth example that shows off a few key config parameters at once:
 

@@ -193,7 +210,7 @@ if __name__ == "__main__":
 
 ---
 
-## 6. Next Steps
+## 7. Next Steps
 
 - **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md).
 - **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md).
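
The tutorial hunk above ends its code block with a stub (`# Your AsyncWebCrawler code here`). As a rough sketch only, assuming the standard `AsyncWebCrawler`/`arun` usage from the crawl4ai README rather than anything introduced by this commit, the Windows setup might be combined with a crawl like this:

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop

# Configure the Proactor event loop before any async work (a no-op off Windows).
configure_windows_event_loop()

async def main():
    # Standard crawl4ai usage; the URL is only a placeholder.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print((result.markdown or "")[:300])

if __name__ == "__main__":
    asyncio.run(main())
```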