feat(logger): add abstract logger base class and file logger implementation
Add AsyncLoggerBase abstract class to standardize the logger interface and introduce AsyncFileLogger for file-only logging. Remove the deprecated always_by_pass_cache parameter and clean up AsyncWebCrawler initialization.

BREAKING CHANGE: Removed deprecated 'always_by_pass_cache' parameter. Use BrowserConfig cache settings instead.
parent 46d2f12851
commit c6d48080a4
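With the deprecated parameter gone, per-crawl cache behaviour is configured through the run config, as the new test file at the bottom of this diff does. A minimal migration sketch (the URL is a placeholder; CrawlerRunConfig, CacheMode.BYPASS, and the async context-manager usage are taken from the code below):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    # Old (removed): AsyncWebCrawler(always_by_pass_cache=True)
    # New: request a cache bypass on the individual run instead.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print(result.success)

asyncio.run(main())
```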
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
)
from .async_logger import (
    AsyncLoggerBase,
    AsyncLogger,
)
from .proxy_strategy import (
    ProxyRotationStrategy,
    RoundRobinProxyStrategy,
@@ -59,6 +63,8 @@ from .deep_crawling import (
)

__all__ = [
    "AsyncLoggerBase",
    "AsyncLogger",
    "AsyncWebCrawler",
    "DeepCrawlStrategy",
    "BFSDeepCrawlStrategy",
@@ -1,3 +1,2 @@
# crawl4ai/_version.py
# __version__ = "0.4.3b3"
__version__ = "0.4.300"
__version__ = "0.5.0"
@@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional, Dict, Any
from colorama import Fore, Style, init
@@ -13,7 +14,37 @@ class LogLevel(Enum):
    ERROR = 5


class AsyncLogger:

class AsyncLoggerBase(ABC):
    @abstractmethod
    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        pass

    @abstractmethod
    def info(self, message: str, tag: str = "INFO", **kwargs):
        pass

    @abstractmethod
    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        pass

    @abstractmethod
    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        pass

    @abstractmethod
    def error(self, message: str, tag: str = "ERROR", **kwargs):
        pass

    @abstractmethod
    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        pass

    @abstractmethod
    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        pass


class AsyncLogger(AsyncLoggerBase):
    """
    Asynchronous logger with support for colored console output and file logging.
    Supports templated messages with colored components.
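To illustrate the contract that AsyncLoggerBase now formalizes, here is a minimal sketch of a third-party implementation (the class name QuietLogger and its print-based behaviour are hypothetical, not part of this commit; it simply overrides every abstract method so it can be instantiated):

```python
from crawl4ai import AsyncLoggerBase

class QuietLogger(AsyncLoggerBase):
    """Hypothetical logger: prints warnings and errors, silently drops the rest."""

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        pass

    def info(self, message: str, tag: str = "INFO", **kwargs):
        pass

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        pass

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        print(f"[{tag}] {message}")

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        print(f"[{tag}] {message}")

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        if not success:
            print(f"[{tag}] {url[:url_length]} failed after {timing:.2f}s")

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        print(f"[{tag}] {url[:url_length]} | {error}")
```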
@@ -225,3 +256,55 @@ class AsyncLogger:
            tag=tag,
            params={"url": url, "url_length": url_length, "error": error},
        )

class AsyncFileLogger(AsyncLoggerBase):
    """
    File-only asynchronous logger that writes logs to a specified file.
    """

    def __init__(self, log_file: str):
        """
        Initialize the file logger.

        Args:
            log_file: File path for logging
        """
        self.log_file = log_file
        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    def _write_to_file(self, level: str, message: str, tag: str):
        """Write a message to the log file."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message to file."""
        self._write_to_file("DEBUG", message, tag)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message to file."""
        self._write_to_file("INFO", message, tag)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message to file."""
        self._write_to_file("SUCCESS", message, tag)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message to file."""
        self._write_to_file("WARNING", message, tag)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message to file."""
        self._write_to_file("ERROR", message, tag)

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """Log URL fetch status to file."""
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """Log error status to file."""
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)
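A short usage sketch for the new AsyncFileLogger on its own (the relative log path is illustrative; the constructor creates the parent directory if it is missing):

```python
from crawl4ai.async_logger import AsyncFileLogger

logger = AsyncFileLogger("./logs/crawl.log")
logger.info("Starting crawl", tag="INIT")
logger.url_status("https://example.com/very/long/path", success=True, timing=1.23)
logger.error_status("https://example.com/broken", error="timeout")
# Each call appends a line of the form:
# [2025-01-01 12:00:00.000] [INFO] [INIT] Starting crawl
```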
@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
import os
import sys
import time
import warnings
from colorama import Fore
from pathlib import Path
from typing import Optional, List
@@ -30,7 +29,7 @@ from .markdown_generation_strategy import (
    MarkdownGenerationStrategy,
)
from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger
from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import *  # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -80,22 +79,21 @@ class AsyncWebCrawler:
            await crawler.close()
        ```

    Attributes:
    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.

    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform warmup sequence.
        arun_many(): Run the crawler for multiple sources.
        aprocess_html(): Process HTML content.
    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform warmup sequence.
        arun_many(): Run the crawler for multiple sources.
        aprocess_html(): Process HTML content.

    Typical Usage:
        async with AsyncWebCrawler() as crawler:
@@ -116,50 +114,30 @@ class AsyncWebCrawler:

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
        crawler_strategy: AsyncCrawlerStrategy = None,
        config: BrowserConfig = None,
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        logger: AsyncLoggerBase = None,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. Default BrowserConfig()
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        """
        # Handle browser configuration
        browser_config = config
        if browser_config is not None:
            if any(
                k in kwargs
                for k in [
                    "browser_type",
                    "headless",
                    "viewport_width",
                    "viewport_height",
                ]
            ):
                self.logger.warning(
                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
                    tag="WARNING",
                )
        else:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)
        browser_config = config or BrowserConfig()

        self.browser_config = browser_config

        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
        self.logger = logger or AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,
            tag_width=10,
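With the simplified constructor above, the logger can now be injected; when it is omitted, an AsyncLogger writing to <base_directory>/.crawl4ai/crawler.log is created. A brief sketch (the BrowserConfig options and log path are illustrative):

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_logger import AsyncFileLogger

# Default logger: AsyncLogger writing to <base_directory>/.crawl4ai/crawler.log
crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))

# New in this commit: pass any AsyncLoggerBase implementation instead
quiet_crawler = AsyncWebCrawler(
    config=BrowserConfig(headless=True),
    logger=AsyncFileLogger("./logs/crawler.log"),
)
```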
@@ -173,24 +151,6 @@ class AsyncWebCrawler:
            **params,  # Pass remaining kwargs for backwards compatibility
        )

        # If the crawler strategy doesn't have a logger, use the crawler's logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger

        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2,
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None
@@ -356,7 +316,7 @@ class AsyncWebCrawler:

            # Create cache context
            cache_context = CacheContext(
                url, config.cache_mode, self.always_bypass_cache
                url, config.cache_mode, False
            )

            # Initialize processing variables
@@ -1,6 +1,6 @@
# crawl4ai/hub.py
from abc import ABC, abstractmethod
from typing import Dict, Type
from typing import Dict, Type, Union
import logging
import importlib
from pathlib import Path
@@ -63,7 +63,7 @@ class CrawlerHub:
        cls._crawlers[name] = obj

    @classmethod
    def get(cls, name: str) -> Type[BaseCrawler] | None:
    def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
        if not cls._crawlers:
            cls._discover_crawlers()
        return cls._crawlers.get(name)
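The annotation change in CrawlerHub.get is presumably for compatibility with Python versions older than 3.10, where the `X | None` union syntax cannot be evaluated in annotations. A standalone illustration:

```python
from typing import Type, Union

class BaseCrawler: ...

# `Type[BaseCrawler] | None` only evaluates on Python 3.10+;
# the Union spelling is equivalent and also works on earlier versions.
def get(name: str) -> Union[Type[BaseCrawler], None]:
    return None
```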
@@ -1,5 +1,12 @@
import asyncio
from crawl4ai import *
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    DefaultMarkdownGenerator,
    PruningContentFilter,
)


async def main():
tests/loggers/test_logger.py (new file, 80 lines)
@@ -0,0 +1,80 @@
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
import os
from datetime import datetime

class AsyncFileLogger(AsyncLoggerBase):
    """
    File-only asynchronous logger that writes logs to a specified file.
    """

    def __init__(self, log_file: str):
        """
        Initialize the file logger.

        Args:
            log_file: File path for logging
        """
        self.log_file = log_file
        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    def _write_to_file(self, level: str, message: str, tag: str):
        """Write a message to the log file."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message to file."""
        self._write_to_file("DEBUG", message, tag)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message to file."""
        self._write_to_file("INFO", message, tag)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message to file."""
        self._write_to_file("SUCCESS", message, tag)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message to file."""
        self._write_to_file("WARNING", message, tag)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message to file."""
        self._write_to_file("ERROR", message, tag)

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """Log URL fetch status to file."""
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """Log error status to file."""
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)

async def main():
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
    await crawler.start()

    try:
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
        )
        # Use the crawler multiple times
        result = await crawler.arun(
            url='https://kidocode.com/',
            config=crawl_config
        )
        if result.success:
            print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))

    finally:
        # Always ensure we close the crawler
        await crawler.close()

if __name__ == "__main__":
    asyncio.run(main())
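The test script above writes its log to a hardcoded absolute path. A sketch of the same flow with a temporary log location instead (the directory prefix and URL are illustrative; AsyncFileLogger is imported from the module added in this commit rather than redefined):

```python
import asyncio
import os
import tempfile

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_logger import AsyncFileLogger

async def main_with_tmp_log():
    # Log to a throwaway directory so the script runs on any machine.
    log_file = os.path.join(tempfile.mkdtemp(prefix="crawl4ai_"), "crawl.log")
    crawler = AsyncWebCrawler(
        config=BrowserConfig(headless=True, verbose=True),
        logger=AsyncFileLogger(log_file),
    )
    await crawler.start()
    try:
        result = await crawler.arun(
            url="https://example.com/",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Log written to:", log_file, "| success:", result.success)
    finally:
        await crawler.close()

if __name__ == "__main__":
    asyncio.run(main_with_tmp_log())
```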