Feature: Add Markdown generation to CrawlerRunConfig
- Added a `markdown_generator` parameter to `CrawlerRunConfig` in `async_configs.py`.
- Implemented Markdown generation for scraped content in `async_webcrawler.py` (moved out of the content-scraping strategy).
- Bumped the version number to 0.4.21 in `_version.py`.
commit 7524aa7b5e
parent 7af1d32ef6
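For context, a minimal usage sketch of the new option. Assumptions not shown in this diff: the package-level `AsyncWebCrawler` export, the `arun(url=..., config=...)` entry point, and `markdown_v2` being surfaced on the crawl result; the generator and filter calls mirror the hunks below.

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter

async def demo():
    # Pass a generator explicitly; when omitted, the crawler now falls back to
    # DefaultMarkdownGenerator (see the async_webcrawler.py hunk below).
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(),
        content_filter=PruningContentFilter(),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # markdown_v2 holds the MarkdownGenerationResult produced by the generator;
        # raw_markdown is its plain HTML-to-Markdown conversion.
        print(result.markdown_v2.raw_markdown[:300])

asyncio.run(demo())
```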
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.2"
+__version__ = "0.4.21"
crawl4ai/async_configs.py

@@ -7,6 +7,7 @@ from .config import (
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
+from .markdown_generation_strategy import MarkdownGenerationStrategy
 
 class BrowserConfig:
     """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
         word_count_threshold: int = MIN_WORD_THRESHOLD,
         extraction_strategy: ExtractionStrategy = None,  # Will default to NoExtractionStrategy if None
         chunking_strategy: ChunkingStrategy = None,  # Will default to RegexChunking if None
+        markdown_generator: MarkdownGenerationStrategy = None,
         content_filter=None,
         cache_mode=None,
         session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
         self.word_count_threshold = word_count_threshold
         self.extraction_strategy = extraction_strategy
         self.chunking_strategy = chunking_strategy
+        self.markdown_generator = markdown_generator
         self.content_filter = content_filter
         self.cache_mode = cache_mode
         self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
             word_count_threshold=kwargs.get("word_count_threshold", 200),
             extraction_strategy=kwargs.get("extraction_strategy"),
             chunking_strategy=kwargs.get("chunking_strategy"),
+            markdown_generator=kwargs.get("markdown_generator"),
             content_filter=kwargs.get("content_filter"),
             cache_mode=kwargs.get("cache_mode"),
             session_id=kwargs.get("session_id"),
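The keyword-based construction path picks up the new key as well. A small sketch; the name `from_kwargs` for the enclosing helper is an assumption, since only its body is visible in this hunk:

```python
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Legacy-style keyword dict, e.g. collected from **kwargs by the crawler;
# the new "markdown_generator" key is now forwarded into the config.
legacy_kwargs = {
    "word_count_threshold": 50,
    "markdown_generator": DefaultMarkdownGenerator(),
}
config = CrawlerRunConfig.from_kwargs(legacy_kwargs)  # assumed helper name
assert config.markdown_generator is legacy_kwargs["markdown_generator"]
```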
crawl4ai/async_webcrawler.py

@@ -7,7 +7,8 @@ from pathlib import Path
 from typing import Optional, List, Union
 import json
 import asyncio
-from contextlib import nullcontext, asynccontextmanager
+# from contextlib import nullcontext, asynccontextmanager
+from contextlib import asynccontextmanager
 from .models import CrawlResult, MarkdownGenerationResult
 from .async_database import async_db_manager
 from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
 from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
+from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
 from .content_scraping_strategy import WebScrapingStrategy
 from .async_logger import AsyncLogger
 from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -132,17 +134,12 @@ class AsyncWebCrawler:
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
 
-    @asynccontextmanager
-    async def nullcontext(self):
-        yield
-
     async def awarmup(self):
         """Initialize the crawler with warm-up sequence."""
         self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
         self.ready = True
 
-
     @asynccontextmanager
     async def nullcontext(self):
         """Async no-op context manager."""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
                 config=config,  # Pass the config object instead of individual parameters
                 screenshot=screenshot_data,
                 pdf_data=pdf_data,
-                verbose=config.verbose
+                verbose=config.verbose,
+                **kwargs
             )
 
             # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
                 css_selector=config.css_selector,
                 only_text=config.only_text,
                 image_description_min_word_threshold=config.image_description_min_word_threshold,
-                content_filter=config.content_filter
+                content_filter=config.content_filter,
+                **kwargs
             )
 
             if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
         except Exception as e:
             raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
 
         # Extract results
-        markdown_v2 = result.get("markdown_v2", None)
         cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
-        markdown = sanitize_input_encode(result.get("markdown", ""))
         fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
         fit_html = sanitize_input_encode(result.get("fit_html", ""))
         media = result.get("media", [])
         links = result.get("links", [])
         metadata = result.get("metadata", {})
 
+        # Markdown Generation
+        markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
+        if not config.content_filter and not markdown_generator.content_filter:
+            markdown_generator.content_filter = PruningContentFilter()
+
+        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
+            cleaned_html=cleaned_html,
+            base_url=url,
+            # html2text_options=kwargs.get('html2text', {})
+        )
+        markdown_v2 = markdown_result
+        markdown = sanitize_input_encode(markdown_result.raw_markdown)
+
         # Log processing completion
         self.logger.info(
             message="Processed {url:.50}... | Time: {timing}ms",
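The fallback above means the run config's generator wins; otherwise `DefaultMarkdownGenerator` is used, and a `PruningContentFilter` is attached when neither the config nor the generator carries a filter. A standalone sketch of that flow; the `content_filter_strategy` import path for `PruningContentFilter` and the sample HTML are assumptions:

```python
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter  # assumed module path

generator = DefaultMarkdownGenerator()
# Mirror of the fallback in the hunk above: attach a pruning filter
# only when no content filter was configured anywhere.
if not generator.content_filter:
    generator.content_filter = PruningContentFilter()

markdown_result = generator.generate_markdown(
    cleaned_html="<article><h1>Title</h1><p>Some <a href='/more'>content</a>.</p></article>",
    base_url="https://example.com",
)
print(markdown_result.raw_markdown)
```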
crawl4ai/content_scraping_strategy.py

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
         cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
 
-        markdown_content = self._generate_markdown_content(
-            cleaned_html=cleaned_html,
-            html=html,
-            url=url,
-            success=success,
-            **kwargs
-        )
+        # markdown_content = self._generate_markdown_content(
+        #     cleaned_html=cleaned_html,
+        #     html=html,
+        #     url=url,
+        #     success=success,
+        #     **kwargs
+        # )
 
         return {
-            **markdown_content,
+            # **markdown_content,
             'cleaned_html': cleaned_html,
             'success': success,
             'media': media,
@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
     all_commits = []
 
     js_next_page = """
-    const button = document.querySelector('a[data-testid="pagination-next-button"]');
-    if (button) button.click();
+    (() => {
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+    })();
     """
 
     for page in range(3):  # Crawl 3 pages
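The IIFE wrapper matters because the same snippet is executed repeatedly against one live page: a second top-level `const button` declaration in the same context would throw, while the IIFE keeps it function-scoped. A sketch of the replay loop; the `session_id`, `js_code`, and `js_only` keyword arguments are assumptions based on the surrounding example, not part of this diff:

```python
from crawl4ai import AsyncWebCrawler

async def crawl_three_pages(js_next_page: str):
    async with AsyncWebCrawler() as crawler:
        for page in range(3):
            result = await crawler.arun(
                url="https://github.com/example/repo/commits/main",  # placeholder URL
                session_id="commit_pages",            # reuse one live page across iterations
                js_code=js_next_page if page > 0 else None,
                js_only=page > 0,                     # run JS in the existing page, no fresh navigation
            )
            # ...collect commits from result.cleaned_html here...
```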
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
 
 
 async def main():
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
 
-    await simple_crawl()
-    await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
-    # await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    # # await use_proxy()
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await extract_structured_data_using_css_extractor()
 
     # LLM extraction examples
     # await extract_structured_data_using_llm()