Feature: Add Markdown generation to CrawlerRunConfig

- Added a markdown generator parameter to CrawlerRunConfig in `async_configs.py`.
- Implemented the Markdown generation step for scraped content in `async_webcrawler.py`.
- Updated the version number to 0.4.21 in `_version.py`.
Author: UncleCode
Date:   2024-12-13 21:51:38 +08:00
Parent: 7af1d32ef6
Commit: 7524aa7b5e
5 changed files with 46 additions and 28 deletions
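In effect, the commit lets a caller choose the Markdown generation strategy per crawl run instead of relying on the scraper's built-in conversion. A minimal usage sketch, under assumptions not shown in this diff (that `AsyncWebCrawler.arun` accepts a `config=` object and that the returned `CrawlResult` exposes the generated `markdown_v2`):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def main():
    # Attach an explicit Markdown generator (and content filter) to the run config.
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(),
        content_filter=PruningContentFilter(),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # markdown_v2 is the MarkdownGenerationResult produced by the generator.
        print(result.markdown_v2.raw_markdown[:300])


asyncio.run(main())
```

If `markdown_generator` is left unset, the crawler falls back to `DefaultMarkdownGenerator`, as the `async_webcrawler.py` hunks below show.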

File: crawl4ai/_version.py

@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.2"
+__version__ = "0.4.21"

File: crawl4ai/async_configs.py

@@ -7,6 +7,7 @@ from .config import (
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
+from .markdown_generation_strategy import MarkdownGenerationStrategy

 class BrowserConfig:
     """

@@ -269,6 +270,7 @@ class CrawlerRunConfig:
         word_count_threshold: int = MIN_WORD_THRESHOLD ,
         extraction_strategy : ExtractionStrategy=None,  # Will default to NoExtractionStrategy if None
         chunking_strategy : ChunkingStrategy= None,  # Will default to RegexChunking if None
+        markdown_generator : MarkdownGenerationStrategy = None,
         content_filter=None,
         cache_mode=None,
         session_id: str = None,

@@ -309,6 +311,7 @@ class CrawlerRunConfig:
         self.word_count_threshold = word_count_threshold
         self.extraction_strategy = extraction_strategy
         self.chunking_strategy = chunking_strategy
+        self.markdown_generator = markdown_generator
         self.content_filter = content_filter
         self.cache_mode = cache_mode
         self.session_id = session_id

@@ -364,6 +367,7 @@ class CrawlerRunConfig:
             word_count_threshold=kwargs.get("word_count_threshold", 200),
             extraction_strategy=kwargs.get("extraction_strategy"),
             chunking_strategy=kwargs.get("chunking_strategy"),
+            markdown_generator=kwargs.get("markdown_generator"),
             content_filter=kwargs.get("content_filter"),
             cache_mode=kwargs.get("cache_mode"),
             session_id=kwargs.get("session_id"),
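Since the new parameter is typed as `MarkdownGenerationStrategy`, anything that implements `generate_markdown` can be plugged in. A rough sketch of a custom generator follows, with the caveat that only the call signature and the `raw_markdown` field are visible in this commit; the exact fields expected by `MarkdownGenerationResult` may differ:

```python
import html2text  # assumption: an HTML-to-text backend; crawl4ai itself builds on html2text

from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
from crawl4ai.models import MarkdownGenerationResult


class PlainMarkdownGenerator(MarkdownGenerationStrategy):
    """Hypothetical generator: plain html2text conversion, no citations, no fit markdown."""

    def generate_markdown(self, cleaned_html: str, base_url: str = "", **kwargs) -> MarkdownGenerationResult:
        raw = html2text.html2text(cleaned_html)
        return MarkdownGenerationResult(
            raw_markdown=raw,
            markdown_with_citations=raw,  # assumed field names; only raw_markdown is read in this commit
            references_markdown="",
            fit_markdown="",
            fit_html="",
        )
```

Such a class would then be supplied as `CrawlerRunConfig(markdown_generator=PlainMarkdownGenerator())`.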

File: crawl4ai/async_webcrawler.py

@@ -7,7 +7,8 @@ from pathlib import Path
 from typing import Optional, List, Union
 import json
 import asyncio
-from contextlib import nullcontext, asynccontextmanager
+# from contextlib import nullcontext, asynccontextmanager
+from contextlib import asynccontextmanager
 from .models import CrawlResult, MarkdownGenerationResult
 from .async_database import async_db_manager
 from .chunking_strategy import *

@@ -15,6 +16,7 @@ from .content_filter_strategy import *
 from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
+from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
 from .content_scraping_strategy import WebScrapingStrategy
 from .async_logger import AsyncLogger
 from .async_configs import BrowserConfig, CrawlerRunConfig

@@ -132,17 +134,12 @@ class AsyncWebCrawler:
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)

-    @asynccontextmanager
-    async def nullcontext(self):
-        yield
-
     async def awarmup(self):
         """Initialize the crawler with warm-up sequence."""
         self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
         self.ready = True

     @asynccontextmanager
     async def nullcontext(self):
         """Async no-op context manager"""

@@ -323,7 +320,8 @@ class AsyncWebCrawler:
                 config=config,  # Pass the config object instead of individual parameters
                 screenshot=screenshot_data,
                 pdf_data=pdf_data,
-                verbose=config.verbose
+                verbose=config.verbose,
+                **kwargs
             )

             # Set response data

@@ -424,7 +422,8 @@
                 css_selector=config.css_selector,
                 only_text=config.only_text,
                 image_description_min_word_threshold=config.image_description_min_word_threshold,
-                content_filter=config.content_filter
+                content_filter=config.content_filter,
+                **kwargs
             )

             if result is None:

@@ -435,16 +434,29 @@
         except Exception as e:
             raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

         # Extract results
-        markdown_v2 = result.get("markdown_v2", None)
         cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
-        markdown = sanitize_input_encode(result.get("markdown", ""))
         fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
         fit_html = sanitize_input_encode(result.get("fit_html", ""))
         media = result.get("media", [])
         links = result.get("links", [])
         metadata = result.get("metadata", {})

+        # Markdown Generation
+        markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
+        if not config.content_filter and not markdown_generator.content_filter:
+            markdown_generator.content_filter = PruningContentFilter()
+
+        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
+            cleaned_html=cleaned_html,
+            base_url=url,
+            # html2text_options=kwargs.get('html2text', {})
+        )
+        markdown_v2 = markdown_result
+        markdown = sanitize_input_encode(markdown_result.raw_markdown)
+
         # Log processing completion
         self.logger.info(
             message="Processed {url:.50}... | Time: {timing}ms",

File: crawl4ai/content_scraping_strategy.py

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

-        markdown_content = self._generate_markdown_content(
-            cleaned_html=cleaned_html,
-            html=html,
-            url=url,
-            success=success,
-            **kwargs
-        )
+        # markdown_content = self._generate_markdown_content(
+        #     cleaned_html=cleaned_html,
+        #     html=html,
+        #     url=url,
+        #     success=success,
+        #     **kwargs
+        # )

         return {
-            **markdown_content,
+            # **markdown_content,
             'cleaned_html': cleaned_html,
             'success': success,
             'media': media,

File: docs example script (path not shown in this view)

@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
     all_commits = []

     js_next_page = """
-    const button = document.querySelector('a[data-testid="pagination-next-button"]');
-    if (button) button.click();
+    (() => {
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+    })();
     """

     for page in range(3):  # Crawl 3 pages

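Wrapping the pagination snippet in an immediately invoked function expression keeps the `const` declaration scoped to a single injection, which avoids potential redeclaration errors when the same snippet is executed repeatedly against one live page, and turns the snippet into a single self-contained expression. A hypothetical sketch of the loop that drives it (the `js_code`/`session_id` keyword arguments and the URL are assumptions, following the pattern of the other example functions in this script):

```python
import asyncio

from crawl4ai import AsyncWebCrawler

js_next_page = """
(() => {
    const button = document.querySelector('a[data-testid="pagination-next-button"]');
    if (button) button.click();
})();
"""


async def crawl_three_pages():
    async with AsyncWebCrawler() as crawler:
        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url="https://example.com/commits",  # placeholder URL
                session_id="pagination_demo",       # reuse one browser page so the click persists
                js_code=js_next_page if page > 0 else None,
            )
            print(f"page {page}: success={result.success}")


asyncio.run(crawl_three_pages())
```
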
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
 async def main():
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-    await simple_crawl()
-    await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
-    # await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    # # await use_proxy()
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await extract_structured_data_using_css_extractor()

     # LLM extraction examples
     # await extract_structured_data_using_llm()