feat(logger): add abstract logger base class and file logger implementation

Add an AsyncLoggerBase abstract class to standardize the logger interface and introduce AsyncFileLogger for file-only logging. Remove the deprecated always_bypass_cache/always_by_pass_cache parameters and clean up AsyncWebCrawler initialization.

BREAKING CHANGE: Removed the deprecated 'always_by_pass_cache' parameter (and its 'always_bypass_cache' replacement). Control caching per run through CrawlerRunConfig's cache_mode instead.
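
A migration sketch for affected callers (hypothetical caller code; CacheMode.BYPASS mirrors the updated demos in this commit):

    # Before (parameter removed in this release):
    crawler = AsyncWebCrawler(always_by_pass_cache=True)

    # After: control caching per run
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

    crawler = AsyncWebCrawler()
    result = await crawler.arun(url="https://example.com",
                                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS))
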
Author: UncleCode
Date: 2025-02-23 21:23:41 +08:00
Parent: 46d2f12851
Commit: c6d48080a4
7 changed files with 198 additions and 63 deletions

View File

@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
     WebScrapingStrategy,
     LXMLWebScrapingStrategy,
 )
+from .async_logger import (
+    AsyncLoggerBase,
+    AsyncLogger,
+)
 from .proxy_strategy import (
     ProxyRotationStrategy,
     RoundRobinProxyStrategy,
@@ -59,6 +63,8 @@ from .deep_crawling import (
 )

 __all__ = [
+    "AsyncLoggerBase",
+    "AsyncLogger",
     "AsyncWebCrawler",
     "DeepCrawlStrategy",
     "BFSDeepCrawlStrategy",

View File

@@ -1,3 +1,2 @@
 # crawl4ai/_version.py
-# __version__ = "0.4.3b3"
-__version__ = "0.4.300"
+__version__ = "0.5.0"

View File

@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Optional, Dict, Any
 from colorama import Fore, Style, init
@@ -13,7 +14,37 @@ class LogLevel(Enum):
     ERROR = 5


-class AsyncLogger:
+class AsyncLoggerBase(ABC):
+    @abstractmethod
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        pass
+
+    @abstractmethod
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        pass
+
+    @abstractmethod
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        pass
+
+    @abstractmethod
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        pass
+
+    @abstractmethod
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        pass
+
+    @abstractmethod
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        pass
+
+    @abstractmethod
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        pass
+
+
+class AsyncLogger(AsyncLoggerBase):
     """
     Asynchronous logger with support for colored console output and file logging.
     Supports templated messages with colored components.
@@ -225,3 +256,55 @@ class AsyncLogger:
             tag=tag,
             params={"url": url, "url_length": url_length, "error": error},
         )
+
+
+class AsyncFileLogger(AsyncLoggerBase):
+    """
+    File-only asynchronous logger that writes logs to a specified file.
+    """
+
+    def __init__(self, log_file: str):
+        """
+        Initialize the file logger.
+
+        Args:
+            log_file: File path for logging
+        """
+        self.log_file = log_file
+        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _write_to_file(self, level: str, message: str, tag: str):
+        """Write a message to the log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message to file."""
+        self._write_to_file("DEBUG", message, tag)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message to file."""
+        self._write_to_file("INFO", message, tag)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message to file."""
+        self._write_to_file("SUCCESS", message, tag)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message to file."""
+        self._write_to_file("WARNING", message, tag)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message to file."""
+        self._write_to_file("ERROR", message, tag)
+
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        """Log URL fetch status to file."""
+        status = "SUCCESS" if success else "FAILED"
+        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+        self._write_to_file("URL_STATUS", message, tag)
+
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        """Log error status to file."""
+        message = f"{url[:url_length]}... | Error: {error}"
+        self._write_to_file("ERROR", message, tag)

View File

@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
 import os
 import sys
 import time
-import warnings
 from colorama import Fore
 from pathlib import Path
 from typing import Optional, List
@@ -30,7 +29,7 @@ from .markdown_generation_strategy import (
     MarkdownGenerationStrategy,
 )
 from .deep_crawling import DeepCrawlDecorator
-from .async_logger import AsyncLogger
+from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import *  # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -80,22 +79,21 @@ class AsyncWebCrawler:
             await crawler.close()
         ```

     Attributes:
         browser_config (BrowserConfig): Configuration object for browser settings.
         crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
         logger (AsyncLogger): Logger instance for recording events and errors.
-        always_bypass_cache (bool): Whether to always bypass cache.
-        crawl4ai_folder (str): Directory for storing cache.
+        base_directory (str): Base directory for storing cache.
         ready (bool): Whether the crawler is ready for use.

     Methods:
         start(): Start the crawler explicitly without using context manager.
         close(): Close the crawler explicitly without using context manager.
         arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
         awarmup(): Perform warmup sequence.
         arun_many(): Run the crawler for multiple sources.
         aprocess_html(): Process HTML content.

     Typical Usage:
         async with AsyncWebCrawler() as crawler:
@@ -116,50 +114,30 @@ class AsyncWebCrawler:
     def __init__(
         self,
-        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
-        config: Optional[BrowserConfig] = None,
-        always_bypass_cache: bool = False,
-        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
+        crawler_strategy: AsyncCrawlerStrategy = None,
+        config: BrowserConfig = None,
         base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         thread_safe: bool = False,
+        logger: AsyncLoggerBase = None,
         **kwargs,
     ):
         """
         Initialize the AsyncWebCrawler.

         Args:
-            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
-            config: Configuration object for browser settings. If None, will be created from kwargs
-            always_bypass_cache: Whether to always bypass cache (new parameter)
-            always_by_pass_cache: Deprecated, use always_bypass_cache instead
+            crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
+            config: Configuration object for browser settings. Default BrowserConfig()
             base_directory: Base directory for storing cache
             thread_safe: Whether to use thread-safe operations
             **kwargs: Additional arguments for backwards compatibility
         """
-        # Handle browser configuration
-        browser_config = config
-        if browser_config is not None:
-            if any(
-                k in kwargs
-                for k in [
-                    "browser_type",
-                    "headless",
-                    "viewport_width",
-                    "viewport_height",
-                ]
-            ):
-                self.logger.warning(
-                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
-                    tag="WARNING",
-                )
-        else:
-            # Create browser config from kwargs for backwards compatibility
-            browser_config = BrowserConfig.from_kwargs(kwargs)
+        browser_config = config or BrowserConfig()

         self.browser_config = browser_config

         # Initialize logger first since other components may need it
-        self.logger = AsyncLogger(
+        self.logger = logger or AsyncLogger(
             log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
             verbose=self.browser_config.verbose,
             tag_width=10,
@@ -173,24 +151,6 @@ class AsyncWebCrawler:
             **params,  # Pass remaining kwargs for backwards compatibility
         )

-        # If craweler strategy doesnt have logger, use crawler logger
-        if not self.crawler_strategy.logger:
-            self.crawler_strategy.logger = self.logger
-
-        # Handle deprecated cache parameter
-        if always_by_pass_cache is not None:
-            if kwargs.get("warning", True):
-                warnings.warn(
-                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
-                    "Use 'always_bypass_cache' instead. "
-                    "Pass warning=False to suppress this warning.",
-                    DeprecationWarning,
-                    stacklevel=2,
-                )
-            self.always_bypass_cache = always_by_pass_cache
-        else:
-            self.always_bypass_cache = always_bypass_cache
-
         # Thread safety setup
         self._lock = asyncio.Lock() if thread_safe else None
@@ -356,7 +316,7 @@ class AsyncWebCrawler:
             # Create cache context
             cache_context = CacheContext(
-                url, config.cache_mode, self.always_bypass_cache
+                url, config.cache_mode, False
             )

             # Initialize processing variables
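
Note that the kwargs fallback (BrowserConfig.from_kwargs) is removed along with the deprecation shim, so legacy browser keywords passed straight to the constructor no longer configure the browser; options should go through an explicit BrowserConfig. A rough before/after sketch (hypothetical caller code; field names taken from the removed legacy list):

    # Previously supported via the kwargs fallback:
    crawler = AsyncWebCrawler(headless=True, viewport_width=1280)

    # Now:
    from crawl4ai import AsyncWebCrawler, BrowserConfig

    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True, viewport_width=1280))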

View File

@@ -1,6 +1,6 @@
 # crawl4ai/hub.py
 from abc import ABC, abstractmethod
-from typing import Dict, Type
+from typing import Dict, Type, Union
 import logging
 import importlib
 from pathlib import Path
@@ -63,7 +63,7 @@ class CrawlerHub:
             cls._crawlers[name] = obj

     @classmethod
-    def get(cls, name: str) -> Type[BaseCrawler] | None:
+    def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
         if not cls._crawlers:
             cls._discover_crawlers()
         return cls._crawlers.get(name)
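
The annotation change is a compatibility fix rather than a behavior change: the PEP 604 pipe union in a live annotation is evaluated at import time and only exists on Python 3.10+ (unless annotations are deferred via `from __future__ import annotations`), while typing.Union works on older interpreters as well. Side by side (illustrative stubs, not part of the diff):

    def get(cls, name: str) -> Type[BaseCrawler] | None: ...        # fails at import on Python < 3.10
    def get(cls, name: str) -> Union[Type[BaseCrawler], None]: ...  # also fine on 3.8/3.9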

View File

@@ -1,5 +1,12 @@
 import asyncio
-from crawl4ai import *
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter,
+)


 async def main():

View File

@@ -0,0 +1,80 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
+import os
+from datetime import datetime
+
+
+class AsyncFileLogger(AsyncLoggerBase):
+    """
+    File-only asynchronous logger that writes logs to a specified file.
+    """
+
+    def __init__(self, log_file: str):
+        """
+        Initialize the file logger.
+
+        Args:
+            log_file: File path for logging
+        """
+        self.log_file = log_file
+        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _write_to_file(self, level: str, message: str, tag: str):
+        """Write a message to the log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message to file."""
+        self._write_to_file("DEBUG", message, tag)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message to file."""
+        self._write_to_file("INFO", message, tag)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message to file."""
+        self._write_to_file("SUCCESS", message, tag)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message to file."""
+        self._write_to_file("WARNING", message, tag)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message to file."""
+        self._write_to_file("ERROR", message, tag)
+
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        """Log URL fetch status to file."""
+        status = "SUCCESS" if success else "FAILED"
+        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+        self._write_to_file("URL_STATUS", message, tag)
+
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        """Log error status to file."""
+        message = f"{url[:url_length]}... | Error: {error}"
+        self._write_to_file("ERROR", message, tag)
+
+
+async def main():
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
+    await crawler.start()
+
+    try:
+        crawl_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+        )
+        # Use the crawler multiple times
+        result = await crawler.arun(
+            url='https://kidocode.com/',
+            config=crawl_config
+        )
+        if result.success:
+            print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))
+    finally:
+        # Always ensure we close the crawler
+        await crawler.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
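
For reference, each AsyncFileLogger entry comes out as a single line in the format _write_to_file produces; an illustrative (made-up) line from crawl.log:

    [2025-02-23 21:23:41.123] [URL_STATUS] [FETCH] https://kidocode.com/... | Status: SUCCESS | Time: 2.34s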