diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index ee26a70..38b432b 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.2"
+__version__ = "0.4.21"
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 41574fe..aa0b849 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -7,6 +7,7 @@ from .config import (
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
+from .markdown_generation_strategy import MarkdownGenerationStrategy

 class BrowserConfig:
     """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
         word_count_threshold: int = MIN_WORD_THRESHOLD,
         extraction_strategy: ExtractionStrategy = None,  # Will default to NoExtractionStrategy if None
         chunking_strategy: ChunkingStrategy = None,  # Will default to RegexChunking if None
+        markdown_generator: MarkdownGenerationStrategy = None,
         content_filter=None,
         cache_mode=None,
         session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
         self.word_count_threshold = word_count_threshold
         self.extraction_strategy = extraction_strategy
         self.chunking_strategy = chunking_strategy
+        self.markdown_generator = markdown_generator
         self.content_filter = content_filter
         self.cache_mode = cache_mode
         self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
             word_count_threshold=kwargs.get("word_count_threshold", 200),
             extraction_strategy=kwargs.get("extraction_strategy"),
             chunking_strategy=kwargs.get("chunking_strategy"),
+            markdown_generator=kwargs.get("markdown_generator"),
             content_filter=kwargs.get("content_filter"),
             cache_mode=kwargs.get("cache_mode"),
             session_id=kwargs.get("session_id"),
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 8515a38..9b96815 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -7,7 +7,8 @@ from pathlib import Path
 from typing import Optional, List, Union
 import json
 import asyncio
-from contextlib import nullcontext, asynccontextmanager
+# from contextlib import nullcontext, asynccontextmanager
+from contextlib import asynccontextmanager
 from .models import CrawlResult, MarkdownGenerationResult
 from .async_database import async_db_manager
 from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
 from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
+from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
 from .content_scraping_strategy import WebScrapingStrategy
 from .async_logger import AsyncLogger
 from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -132,17 +134,12 @@ class AsyncWebCrawler:
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
-
-    @asynccontextmanager
-    async def nullcontext(self):
-        yield

     async def awarmup(self):
         """Initialize the crawler with warm-up sequence."""
         self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
         self.ready = True
-
     @asynccontextmanager
     async def nullcontext(self):
         """Async null context manager."""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
                     config=config,  # Pass the config object instead of individual parameters
                     screenshot=screenshot_data,
                     pdf_data=pdf_data,
-                    verbose=config.verbose
+                    verbose=config.verbose,
+                    **kwargs
                 )

                 # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
                 css_selector=config.css_selector,
                 only_text=config.only_text,
                 image_description_min_word_threshold=config.image_description_min_word_threshold,
-                content_filter=config.content_filter
+                content_filter=config.content_filter,
+                **kwargs
             )

             if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
         except Exception as e:
             raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
+
+

         # Extract results
-        markdown_v2 = result.get("markdown_v2", None)
         cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
-        markdown = sanitize_input_encode(result.get("markdown", ""))
         fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
         fit_html = sanitize_input_encode(result.get("fit_html", ""))
         media = result.get("media", [])
         links = result.get("links", [])
         metadata = result.get("metadata", {})

+        # Markdown Generation
+        markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
+        if not config.content_filter and not markdown_generator.content_filter:
+            markdown_generator.content_filter = PruningContentFilter()
+
+        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
+            cleaned_html=cleaned_html,
+            base_url=url,
+            # html2text_options=kwargs.get('html2text', {})
+        )
+        markdown_v2 = markdown_result
+        markdown = sanitize_input_encode(markdown_result.raw_markdown)
+
         # Log processing completion
         self.logger.info(
             message="Processed {url:.50}... | Time: {timing}ms",
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index f58e1ea..4ba9a60 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):

         cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

-        markdown_content = self._generate_markdown_content(
-            cleaned_html=cleaned_html,
-            html=html,
-            url=url,
-            success=success,
-            **kwargs
-        )
+        # markdown_content = self._generate_markdown_content(
+        #     cleaned_html=cleaned_html,
+        #     html=html,
+        #     url=url,
+        #     success=success,
+        #     **kwargs
+        # )

         return {
-            **markdown_content,
+            # **markdown_content,
             'cleaned_html': cleaned_html,
             'success': success,
             'media': media,
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py
index 1c76bf1..bd4c425 100644
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
     all_commits = []

     js_next_page = """
-    const button = document.querySelector('a[data-testid="pagination-next-button"]');
-    if (button) button.click();
+    (() => {
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+    })();
     """

     for page in range(3):  # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():

 async def main():
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))

-    await simple_crawl()
-    await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
-    # await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    # # await use_proxy()
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await extract_structured_data_using_css_extractor()

     # LLM extraction examples
     # await extract_structured_data_using_llm()
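
Usage sketch (not part of the patch): a minimal example of how the new markdown_generator option on CrawlerRunConfig is expected to be used, based only on the imports and assignments in the hunks above. The URL is a placeholder, and passing the config object to arun() is assumed from the surrounding context of this diff rather than shown in it.

import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter


async def demo():
    # Explicit markdown generator; when omitted, aprocess_html() now falls back
    # to DefaultMarkdownGenerator() and attaches a PruningContentFilter if no
    # content filter is configured (see the async_webcrawler.py hunk above).
    md_generator = DefaultMarkdownGenerator()
    md_generator.content_filter = PruningContentFilter()

    config = CrawlerRunConfig(markdown_generator=md_generator)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://www.example.com", config=config)
        # markdown_v2 now carries a MarkdownGenerationResult; raw_markdown is the
        # unfiltered markdown string, which also populates result.markdown.
        print(result.markdown_v2.raw_markdown[:300])


asyncio.run(demo())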