feat(logger): add abstract logger base class and file logger implementation
Add AsyncLoggerBase abstract class to standardize the logger interface and introduce AsyncFileLogger for file-only logging. Remove the deprecated always_by_pass_cache parameter and clean up AsyncWebCrawler initialization.

BREAKING CHANGE: Removed deprecated 'always_by_pass_cache' parameter. Use BrowserConfig cache settings instead.
parent 46d2f12851
commit c6d48080a4
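With the deprecated parameter gone, per-crawl cache behaviour is configured through the run config, as the new test file at the bottom of this diff does. A minimal migration sketch (the URL is a placeholder; CrawlerRunConfig, CacheMode.BYPASS, and the async context-manager usage are taken from the code below):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    # Old (removed): AsyncWebCrawler(always_by_pass_cache=True)
    # New: request a cache bypass on the individual run instead.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print(result.success)

asyncio.run(main())
```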
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
)
from .async_logger import (
    AsyncLoggerBase,
    AsyncLogger,
)
from .proxy_strategy import (
    ProxyRotationStrategy,
    RoundRobinProxyStrategy,
@@ -59,6 +63,8 @@ from .deep_crawling import (
)

__all__ = [
    "AsyncLoggerBase",
    "AsyncLogger",
    "AsyncWebCrawler",
    "DeepCrawlStrategy",
    "BFSDeepCrawlStrategy",
@@ -1,3 +1,2 @@
# crawl4ai/_version.py
# __version__ = "0.4.3b3"
__version__ = "0.4.300"
__version__ = "0.5.0"
@@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional, Dict, Any
from colorama import Fore, Style, init
@@ -13,7 +14,37 @@ class LogLevel(Enum):
    ERROR = 5


class AsyncLogger:

class AsyncLoggerBase(ABC):
    @abstractmethod
    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        pass

    @abstractmethod
    def info(self, message: str, tag: str = "INFO", **kwargs):
        pass

    @abstractmethod
    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        pass

    @abstractmethod
    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        pass

    @abstractmethod
    def error(self, message: str, tag: str = "ERROR", **kwargs):
        pass

    @abstractmethod
    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        pass

    @abstractmethod
    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        pass


class AsyncLogger(AsyncLoggerBase):
    """
    Asynchronous logger with support for colored console output and file logging.
    Supports templated messages with colored components.
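To illustrate the contract that AsyncLoggerBase now formalizes, here is a minimal sketch of a third-party implementation (the class name QuietLogger and its print-based behaviour are hypothetical, not part of this commit; it simply overrides every abstract method so it can be instantiated):

```python
from crawl4ai import AsyncLoggerBase

class QuietLogger(AsyncLoggerBase):
    """Hypothetical logger: prints warnings and errors, silently drops the rest."""

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        pass

    def info(self, message: str, tag: str = "INFO", **kwargs):
        pass

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        pass

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        print(f"[{tag}] {message}")

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        print(f"[{tag}] {message}")

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        if not success:
            print(f"[{tag}] {url[:url_length]} failed after {timing:.2f}s")

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        print(f"[{tag}] {url[:url_length]} | {error}")
```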
@@ -225,3 +256,55 @@ class AsyncLogger:
            tag=tag,
            params={"url": url, "url_length": url_length, "error": error},
        )

class AsyncFileLogger(AsyncLoggerBase):
    """
    File-only asynchronous logger that writes logs to a specified file.
    """

    def __init__(self, log_file: str):
        """
        Initialize the file logger.

        Args:
            log_file: File path for logging
        """
        self.log_file = log_file
        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    def _write_to_file(self, level: str, message: str, tag: str):
        """Write a message to the log file."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message to file."""
        self._write_to_file("DEBUG", message, tag)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message to file."""
        self._write_to_file("INFO", message, tag)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message to file."""
        self._write_to_file("SUCCESS", message, tag)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message to file."""
        self._write_to_file("WARNING", message, tag)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message to file."""
        self._write_to_file("ERROR", message, tag)

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """Log URL fetch status to file."""
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """Log error status to file."""
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)
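A short usage sketch for the new AsyncFileLogger on its own (the relative log path is illustrative; the constructor creates the parent directory if it is missing):

```python
from crawl4ai.async_logger import AsyncFileLogger

logger = AsyncFileLogger("./logs/crawl.log")
logger.info("Starting crawl", tag="INIT")
logger.url_status("https://example.com/very/long/path", success=True, timing=1.23)
logger.error_status("https://example.com/broken", error="timeout")
# Each call appends a line of the form:
# [2025-01-01 12:00:00.000] [INFO] [INIT] Starting crawl
```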
@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
import os
import sys
import time
import warnings
from colorama import Fore
from pathlib import Path
from typing import Optional, List
@@ -30,7 +29,7 @@ from .markdown_generation_strategy import (
    MarkdownGenerationStrategy,
)
from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger
from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import *  # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -80,22 +79,21 @@ class AsyncWebCrawler:
            await crawler.close()
        ```

    Attributes:
    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.

    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform warmup sequence.
        arun_many(): Run the crawler for multiple sources.
        aprocess_html(): Process HTML content.
    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform warmup sequence.
        arun_many(): Run the crawler for multiple sources.
        aprocess_html(): Process HTML content.

    Typical Usage:
        async with AsyncWebCrawler() as crawler:
@@ -116,50 +114,30 @@ class AsyncWebCrawler:

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
        crawler_strategy: AsyncCrawlerStrategy = None,
        config: BrowserConfig = None,
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        logger: AsyncLoggerBase = None,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. Default BrowserConfig()
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        """
        # Handle browser configuration
        browser_config = config
        if browser_config is not None:
            if any(
                k in kwargs
                for k in [
                    "browser_type",
                    "headless",
                    "viewport_width",
                    "viewport_height",
                ]
            ):
                self.logger.warning(
                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
                    tag="WARNING",
                )
        else:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)
        browser_config = config or BrowserConfig()

        self.browser_config = browser_config

        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
        self.logger = logger or AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,
            tag_width=10,
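With the simplified constructor above, the logger can now be injected; when it is omitted, an AsyncLogger writing to <base_directory>/.crawl4ai/crawler.log is created. A brief sketch (the BrowserConfig options and log path are illustrative):

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_logger import AsyncFileLogger

# Default logger: AsyncLogger writing to <base_directory>/.crawl4ai/crawler.log
crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))

# New in this commit: pass any AsyncLoggerBase implementation instead
quiet_crawler = AsyncWebCrawler(
    config=BrowserConfig(headless=True),
    logger=AsyncFileLogger("./logs/crawler.log"),
)
```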
@@ -173,24 +151,6 @@ class AsyncWebCrawler:
            **params,  # Pass remaining kwargs for backwards compatibility
        )

        # If the crawler strategy doesn't have a logger, use the crawler's logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger

        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2,
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None
@@ -356,7 +316,7 @@ class AsyncWebCrawler:

            # Create cache context
            cache_context = CacheContext(
                url, config.cache_mode, self.always_bypass_cache
                url, config.cache_mode, False
            )

            # Initialize processing variables
@@ -1,6 +1,6 @@
# crawl4ai/hub.py
from abc import ABC, abstractmethod
from typing import Dict, Type
from typing import Dict, Type, Union
import logging
import importlib
from pathlib import Path
@@ -63,7 +63,7 @@ class CrawlerHub:
        cls._crawlers[name] = obj

    @classmethod
    def get(cls, name: str) -> Type[BaseCrawler] | None:
    def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
        if not cls._crawlers:
            cls._discover_crawlers()
        return cls._crawlers.get(name)
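The annotation change in CrawlerHub.get is presumably for compatibility with Python versions older than 3.10, where the `X | None` union syntax cannot be evaluated in annotations. A standalone illustration:

```python
from typing import Type, Union

class BaseCrawler: ...

# `Type[BaseCrawler] | None` only evaluates on Python 3.10+;
# the Union spelling is equivalent and also works on earlier versions.
def get(name: str) -> Union[Type[BaseCrawler], None]:
    return None
```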
@@ -1,5 +1,12 @@
import asyncio
from crawl4ai import *
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    DefaultMarkdownGenerator,
    PruningContentFilter,
)


async def main():
tests/loggers/test_logger.py (new file, 80 lines)
@@ -0,0 +1,80 @@
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
import os
from datetime import datetime

class AsyncFileLogger(AsyncLoggerBase):
    """
    File-only asynchronous logger that writes logs to a specified file.
    """

    def __init__(self, log_file: str):
        """
        Initialize the file logger.

        Args:
            log_file: File path for logging
        """
        self.log_file = log_file
        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    def _write_to_file(self, level: str, message: str, tag: str):
        """Write a message to the log file."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message to file."""
        self._write_to_file("DEBUG", message, tag)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message to file."""
        self._write_to_file("INFO", message, tag)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message to file."""
        self._write_to_file("SUCCESS", message, tag)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message to file."""
        self._write_to_file("WARNING", message, tag)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message to file."""
        self._write_to_file("ERROR", message, tag)

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """Log URL fetch status to file."""
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """Log error status to file."""
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)

async def main():
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
    await crawler.start()

    try:
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
        )
        # Use the crawler multiple times
        result = await crawler.arun(
            url='https://kidocode.com/',
            config=crawl_config
        )
        if result.success:
            print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))

    finally:
        # Always ensure we close the crawler
        await crawler.close()

if __name__ == "__main__":
    asyncio.run(main())
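The test script above writes its log to a hardcoded absolute path. A sketch of the same flow with a temporary log location instead (the directory prefix and URL are illustrative; AsyncFileLogger is imported from the module added in this commit rather than redefined):

```python
import asyncio
import os
import tempfile

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_logger import AsyncFileLogger

async def main_with_tmp_log():
    # Log to a throwaway directory so the script runs on any machine.
    log_file = os.path.join(tempfile.mkdtemp(prefix="crawl4ai_"), "crawl.log")
    crawler = AsyncWebCrawler(
        config=BrowserConfig(headless=True, verbose=True),
        logger=AsyncFileLogger(log_file),
    )
    await crawler.start()
    try:
        result = await crawler.arun(
            url="https://example.com/",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Log written to:", log_file, "| success:", result.success)
    finally:
        await crawler.close()

if __name__ == "__main__":
    asyncio.run(main_with_tmp_log())
```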