diff --git a/haystack/__init__.py b/haystack/__init__.py
index b20a367e3..dd4e910b8 100644
--- a/haystack/__init__.py
+++ b/haystack/__init__.py
@@ -3,6 +3,11 @@ from haystack.core.errors import ComponentError, DeserializationError
 from haystack.core.pipeline import Pipeline
 from haystack.core.serialization import default_from_dict, default_to_dict
 from haystack.dataclasses import Answer, Document, ExtractedAnswer, GeneratedAnswer
+import haystack.logging
+
+# Initialize the logging configuration
+# This is a no-op unless `structlog` is installed
+haystack.logging.configure_logging()
 
 __all__ = [
     "component",
diff --git a/haystack/logging.py b/haystack/logging.py
new file mode 100644
index 000000000..6f3ea1d6f
--- /dev/null
+++ b/haystack/logging.py
@@ -0,0 +1,97 @@
+import logging
+import os
+import typing
+from typing import List
+
+if typing.TYPE_CHECKING:
+    from structlog.typing import Processor
+
+HAYSTACK_LOGGING_USE_JSON_ENV_VAR = "HAYSTACK_LOGGING_USE_JSON"
+HAYSTACK_LOGGING_IGNORE_STRUCTLOG_ENV_VAR = "HAYSTACK_LOGGING_IGNORE_STRUCTLOG"
+
+
+def configure_logging(use_json: bool = False) -> None:
+    """Configure logging for Haystack.
+
+    - If `structlog` is not installed, we keep everything as it is. The user is responsible for configuring logging
+      themselves.
+    - If `structlog` is installed, we configure it to format log entries including its key-value data. To disable this
+      behavior, set the environment variable `HAYSTACK_LOGGING_IGNORE_STRUCTLOG` to `true`.
+    - If `structlog` is installed, you can JSON-format all logs. Enable this by
+        - setting the `use_json` parameter to `True` when calling this function
+        - setting the environment variable `HAYSTACK_LOGGING_USE_JSON` to `true`
+    """
+    try:
+        import structlog
+        from structlog.processors import ExceptionRenderer
+        from structlog.tracebacks import ExceptionDictTransformer
+
+    except ImportError:
+        # structlog is not installed - fall back to standard logging
+        return
+
+    if os.getenv(HAYSTACK_LOGGING_IGNORE_STRUCTLOG_ENV_VAR, "false").lower() == "true":
+        # If the user wants to ignore structlog, we don't configure it and fall back to standard logging
+        return
+
+    # We roughly follow the structlog documentation here:
+    # https://www.structlog.org/en/stable/standard-library.html#rendering-using-structlog-based-formatters-within-logging
+    # This means that we use structlog to format the log entries for entries emitted via `logging` and `structlog`.
+
+    shared_processors: List[Processor] = [
+        # Add the log level to the event_dict for structlog to use
+        structlog.stdlib.add_log_level,
+        # Adds the current timestamp in ISO format to logs
+        structlog.processors.TimeStamper(fmt="iso"),
+    ]
+
+    structlog.configure(
+        processors=shared_processors + [structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
+        logger_factory=structlog.stdlib.LoggerFactory(),
+        cache_logger_on_first_use=True,
+        # This filters out log entries that are below the log level of the root logger.
+        wrapper_class=structlog.make_filtering_bound_logger(min_level=logging.root.getEffectiveLevel()),
+    )
+
+    renderers: List[Processor]
+    if os.getenv(HAYSTACK_LOGGING_USE_JSON_ENV_VAR, "false").lower() == "true" or use_json:
+        renderers = [
+            ExceptionRenderer(
+                # don't show locals in production logs - this can be quite sensitive information
+                ExceptionDictTransformer(show_locals=False)
+            ),
+            structlog.processors.JSONRenderer(),
+        ]
+    else:
+        renderers = [structlog.dev.ConsoleRenderer()]
+
+    formatter = structlog.stdlib.ProcessorFormatter(
+        # These run ONLY on `logging` entries that do NOT originate within
+        # structlog.
+        foreign_pre_chain=shared_processors
+        + [
+            # Add the information from the `logging` `extras` to the event dictionary
+            structlog.stdlib.ExtraAdder()
+        ],
+        # These run on ALL entries after the pre_chain is done.
+        processors=[
+            # Remove _record & _from_structlog to avoid adding this metadata to the final log record
+            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
+            *renderers,
+        ],
+    )
+
+    handler = logging.StreamHandler()
+    handler.name = "HaystackLoggingHandler"
+    # Use OUR `ProcessorFormatter` to format all `logging` entries.
+    handler.setFormatter(formatter)
+
+    root_logger = logging.getLogger()
+    # avoid adding our handler twice
+    old_handlers = [
+        h
+        for h in root_logger.handlers
+        if not (isinstance(h, logging.StreamHandler) and h.name == "HaystackLoggingHandler")
+    ]
+    new_handlers = [handler, *old_handlers]
+    root_logger.handlers = new_handlers
diff --git a/pyproject.toml b/pyproject.toml
index 2372310ef..18e2fc49d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,9 @@ extra-dependencies = [
   # Tracing
   "opentelemetry-sdk",
   "ddtrace",
+
+  # Structured logging
+  "structlog",
 ]
 
 [tool.hatch.envs.test.scripts]
diff --git a/releasenotes/notes/structured-logging-2d6cef3fee2b4f0e.yaml b/releasenotes/notes/structured-logging-2d6cef3fee2b4f0e.yaml
new file mode 100644
index 000000000..583dea247
--- /dev/null
+++ b/releasenotes/notes/structured-logging-2d6cef3fee2b4f0e.yaml
@@ -0,0 +1,18 @@
+---
+features:
+  - |
+    Haystack now supports structured logging out of the box.
+    Logging behavior falls into three categories:
+    - If [`structlog`](https://www.structlog.org/en/stable/) is not installed, Haystack will
+      use the standard Python logging library with whatever configuration is present.
+    - If `structlog` is installed, Haystack will log through [`structlog`](https://www.structlog.org/en/stable/) using
+      structlog's console renderer.
+      To disable structlog, set the environment variable `HAYSTACK_LOGGING_IGNORE_STRUCTLOG` to `true`.
+    - To log in JSON, install [`structlog`](https://www.structlog.org/en/stable/) and
+      - set the environment variable `HAYSTACK_LOGGING_USE_JSON` to `true` or
+      - enable JSON logging from Python
+      ```python
+      import haystack.logging
+
+      haystack.logging.configure_logging(use_json=True)
+      ```
diff --git a/test/test_logging.py b/test/test_logging.py
new file mode 100644
index 000000000..9c212417e
--- /dev/null
+++ b/test/test_logging.py
@@ -0,0 +1,206 @@
+import builtins
+import json
+import logging
+import os
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from unittest.mock import ANY, Mock
+
+import pytest
+from _pytest.capture import CaptureFixture
+from _pytest.logging import LogCaptureFixture
+from _pytest.monkeypatch import MonkeyPatch
+
+from haystack import logging as haystack_logging
+
+
+@pytest.fixture(autouse=True)
+def reset_logging_config() -> None:
+    old_handlers = logging.root.handlers.copy()
+    yield
+    # Reset the logging configuration after each test to avoid impacting other tests
+    logging.root.handlers = old_handlers
+
+
+class TestSkipLoggingConfiguration:
+    def test_skip_logging_configuration(
+        self, monkeypatch: MonkeyPatch, capfd: CaptureFixture, caplog: LogCaptureFixture
+    ) -> None:
+        monkeypatch.setenv("HAYSTACK_LOGGING_IGNORE_STRUCTLOG", "true")
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # the pytest fixture caplog only captures logs being rendered from the stdlib logging module
+        assert caplog.messages == ["Hello, structured logging!"]
+
+        # Nothing should be captured by capfd since structlog is not configured
+        assert capfd.readouterr().err == ""
+
+    def test_skip_logging_if_structlog_not_installed(
+        self, monkeypatch: MonkeyPatch, capfd: CaptureFixture, caplog: LogCaptureFixture
+    ) -> None:
+        monkeypatch.delitem(sys.modules, "structlog", raising=False)
+        monkeypatch.setattr(builtins, "__import__", Mock(side_effect=ImportError))
+
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # the pytest fixture caplog only captures logs being rendered from the stdlib logging module
+        assert caplog.messages == ["Hello, structured logging!"]
+
+        # Nothing should be captured by capfd since structlog is not configured
+        assert capfd.readouterr().err == ""
+
+
+class TestStructuredLoggingConsoleRendering:
+    def test_log_filtering_when_using_debug(self, capfd: CaptureFixture) -> None:
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+        logger.debug("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+        assert output == ""
+
+    def test_log_filtering_when_using_debug_and_log_level_is_debug(self, capfd: CaptureFixture) -> None:
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.DEBUG)
+
+        logger.debug("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+        assert output != ""
+
+    def test_console_rendered_structured_log(self, capfd: CaptureFixture) -> None:
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+
+        # Only check for the minute to be a bit more robust
+        today = datetime.now(tz=timezone.utc).isoformat(timespec="minutes").replace("+00:00", "")
+        assert today in output
+
+        log_level = "warning"
+        assert log_level in output
+
+        assert "Hello, structured logging!" in output
+
+        assert "key1" in output
+        assert "value1" in output
+
+    def test_logging_exceptions(self, capfd: CaptureFixture) -> None:
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+
+        def function_that_raises_and_adds_to_stack_trace():
+            raise ValueError("This is an error")
+
+        try:
+            function_that_raises_and_adds_to_stack_trace()
+        except ValueError:
+            logger.exception("An error happened")
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+
+        assert "An error happened" in output
+
+
+class TestStructuredLoggingJSONRendering:
+    def test_logging_as_json(self, capfd: CaptureFixture) -> None:
+        haystack_logging.configure_logging(use_json=True)
+
+        logger = logging.getLogger(__name__)
+        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+        parsed_output = json.loads(output)  # should not raise an error
+
+        assert parsed_output == {
+            "event": "Hello, structured logging!",
+            "key1": "value1",
+            "key2": "value2",
+            "level": "warning",
+            "timestamp": ANY,
+        }
+
+    def test_logging_as_json_enabling_via_env(self, capfd: CaptureFixture, monkeypatch: MonkeyPatch) -> None:
+        monkeypatch.setenv("HAYSTACK_LOGGING_USE_JSON", "true")
+        haystack_logging.configure_logging()
+
+        logger = logging.getLogger(__name__)
+        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+        parsed_output = json.loads(output)  # should not raise an error
+
+        assert parsed_output == {
+            "event": "Hello, structured logging!",
+            "key1": "value1",
+            "key2": "value2",
+            "level": "warning",
+            "timestamp": ANY,
+        }
+
+    def test_logging_exceptions_json(self, capfd: CaptureFixture) -> None:
+        haystack_logging.configure_logging(use_json=True)
+
+        logger = logging.getLogger(__name__)
+
+        def function_that_raises_and_adds_to_stack_trace():
+            my_local_variable = "my_local_variable"  # noqa: F841
+            raise ValueError("This is an error")
+
+        try:
+            function_that_raises_and_adds_to_stack_trace()
+        except ValueError:
+            logger.exception("An error happened ")
+
+        # Use `capfd` to capture the output of the final structlog rendering result
+        output = capfd.readouterr().err
+        parsed_output = json.loads(output)
+        assert parsed_output == {
+            "event": "An error happened ",
+            "level": "error",
+            "timestamp": ANY,
+            "exception": [
+                {
+                    "exc_type": "ValueError",
+                    "exc_value": "This is an error",
+                    "syntax_error": None,
+                    "is_cause": False,
+                    "frames": [
+                        {
+                            "filename": str(Path.cwd() / "test" / "test_logging.py"),
+                            "lineno": ANY,  # otherwise the test breaks if you add a line :-)
+                            "name": "test_logging_exceptions_json",
+                            "line": "",
+                            "locals": None,
+                        },
+                        {
+                            "filename": str(Path.cwd() / "test" / "test_logging.py"),
+                            "lineno": ANY,  # otherwise the test breaks if you add a line :-)
+                            "name": "function_that_raises_and_adds_to_stack_trace",
+                            "line": "",
+                            "locals": None,
+                        },
+                    ],
+                }
+            ],
+        }
diff --git a/test/tracing/test_datadog.py b/test/tracing/test_datadog.py
index d7e647ae0..9a259a372 100644
--- a/test/tracing/test_datadog.py
+++ b/test/tracing/test_datadog.py
@@ -17,8 +17,6 @@ def datadog_tracer(monkeypatch: MonkeyPatch) -> ddtrace.Tracer:
     tracer = ddtrace.Tracer()
 
-    # monkeypatch.setattr(ddtrace, "tracer", tracer)
-
     return tracer