chore: make Haystack warnings consistent (#9083)

* chore: make Haystack warnings consistent

* more structured logging

* small fixes
Stefano Fiorucci 2025-03-21 18:18:55 +01:00, committed by GitHub
parent 3e435439d9
commit 1c1030efc6
11 changed files with 65 additions and 50 deletions
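Every file below follows the same recipe: stdlib `warnings.warn` calls (and one f-string `logger` call) become calls to Haystack's structured logger, where the message keeps a literal `{placeholder}` and the value travels as a keyword argument. A minimal sketch of the target pattern, assuming the keyword-interpolation behavior the hunks below rely on (`check_spec` is a made-up function):

```python
from haystack import logging  # Haystack's logging facade, not the stdlib module

logger = logging.getLogger(__name__)

def check_spec(source: str) -> None:
    # The placeholder stays literal in the template; the logger receives
    # `source` as a separate, structured field on the record.
    logger.warning("OpenAPI specification file not found: {source}", source=source)
```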

View File

@@ -84,7 +84,7 @@ class OpenAPIServiceToFunctions:
                     "IO error reading OpenAPI specification file: {source}. Error: {e}", source=source, e=e
                 )
             else:
-                logger.warning(f"OpenAPI specification file not found: {source}")
+                logger.warning("OpenAPI specification file not found: {source}", source=source)
         elif isinstance(source, ByteStream):
             openapi_spec_content = source.data.decode("utf-8")
             if not openapi_spec_content:
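This first hunk is the one spot that already used the logger but with an f-string. The swap matters because an f-string is rendered before the logger ever sees it, so the value can no longer be indexed or filtered as a field. A side-by-side sketch:

```python
from haystack import logging

logger = logging.getLogger(__name__)
source = "openapi.yaml"

# Eager: the logger receives only the final string; `source` is baked in.
logger.warning(f"OpenAPI specification file not found: {source}")

# Structured: the logger receives the constant template plus the value,
# which log processors can attach to the record as its own key.
logger.warning("OpenAPI specification file not found: {source}", source=source)
```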

View File

@@ -5,7 +5,6 @@
 import ast
 import contextlib
 from typing import Any, Callable, Dict, Optional, Set
-from warnings import warn
 
 import jinja2.runtime
 from jinja2 import Environment, TemplateSyntaxError, meta
@@ -13,9 +12,11 @@ from jinja2.nativetypes import NativeEnvironment
 from jinja2.sandbox import SandboxedEnvironment
 from typing_extensions import TypeAlias
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.utils import deserialize_callable, deserialize_type, serialize_callable, serialize_type
 
+logger = logging.getLogger(__name__)
+
 
 class OutputAdaptationException(Exception):
     """Exception raised when there is an error during output adaptation."""
@@ -76,7 +77,7 @@ class OutputAdapter:
                 "Unsafe mode is enabled. This allows execution of arbitrary code in the Jinja template. "
                 "Use this only if you trust the source of the template."
             )
-            warn(msg)
+            logger.warning(msg)
         self._env = (
             NativeEnvironment() if self._unsafe else SandboxedEnvironment(undefined=jinja2.runtime.StrictUndefined)
         )
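When the message has no variable parts, the migration is one-for-one: add the `logging` import, create a module-level logger, and replace `warn(msg)` with `logger.warning(msg)`. A condensed, self-contained sketch of what each touched module ends up with (the class is a shortened stand-in for `OutputAdapter`):

```python
from haystack import logging

logger = logging.getLogger(__name__)  # one logger per module, named after it

class Adapter:  # shortened stand-in for OutputAdapter
    def __init__(self, unsafe: bool = False):
        self._unsafe = unsafe
        if self._unsafe:
            # Constant message: no placeholders, so no keyword arguments needed.
            logger.warning(
                "Unsafe mode is enabled. This allows execution of arbitrary code in the Jinja template. "
                "Use this only if you trust the source of the template."
            )
```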

View File

@@ -2,12 +2,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import warnings
 from typing import Any, Dict, List, Optional, Union
 
 from tqdm import tqdm
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.dataclasses import Document
 from haystack.lazy_imports import LazyImport
 from haystack.utils import Secret, deserialize_secrets_inplace
@@ -17,6 +16,8 @@ from haystack.utils.url_validation import is_valid_http_url
 with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
     from huggingface_hub import InferenceClient
 
+logger = logging.getLogger(__name__)
+
 
 @component
 class HuggingFaceAPIDocumentEmbedder:
@@ -241,11 +242,11 @@ class HuggingFaceAPIDocumentEmbedder:
         if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
             if truncate is not None:
                 msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
-                warnings.warn(msg)
+                logger.warning(msg)
                 truncate = None
             if normalize is not None:
                 msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
-                warnings.warn(msg)
+                logger.warning(msg)
                 normalize = None
 
         all_embeddings = []
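Both embedders share a warn-and-ignore policy for parameters the Serverless Inference API does not accept; the identical hunk appears again in the text embedder below. Condensed into a standalone sketch (the helper name is invented):

```python
from typing import Optional, Tuple
from haystack import logging

logger = logging.getLogger(__name__)

def drop_unsupported(truncate: Optional[bool], normalize: Optional[bool]) -> Tuple[None, None]:
    # Invented helper mirroring the embedder hunks: warn once per ignored
    # parameter, then neutralize it instead of failing the request.
    if truncate is not None:
        logger.warning("`truncate` parameter is not supported for Serverless Inference API. It will be ignored.")
    if normalize is not None:
        logger.warning("`normalize` parameter is not supported for Serverless Inference API. It will be ignored.")
    return None, None
```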

View File

@@ -2,10 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import warnings
 from typing import Any, Dict, List, Optional, Union
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.lazy_imports import LazyImport
 from haystack.utils import Secret, deserialize_secrets_inplace
 from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
@@ -14,6 +13,8 @@ from haystack.utils.url_validation import is_valid_http_url
 with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
     from huggingface_hub import InferenceClient
 
+logger = logging.getLogger(__name__)
+
 
 @component
 class HuggingFaceAPITextEmbedder:
@@ -200,11 +201,11 @@ class HuggingFaceAPITextEmbedder:
         if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
             if truncate is not None:
                 msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
-                warnings.warn(msg)
+                logger.warning(msg)
                 truncate = None
             if normalize is not None:
                 msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
-                warnings.warn(msg)
+                logger.warning(msg)
                 normalize = None
 
         text_to_embed = self.prefix + text + self.suffix

View File

@@ -4,15 +4,16 @@
 import json
 from typing import Any, Dict, List, Optional, Tuple, Type
-from warnings import warn
 
 from tqdm import tqdm
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.builders import PromptBuilder
 from haystack.components.generators import OpenAIGenerator
 from haystack.utils import Secret, deserialize_secrets_inplace, deserialize_type, serialize_type
 
+logger = logging.getLogger(__name__)
+
 
 @component
 class LLMEvaluator:
@@ -206,10 +207,9 @@ class LLMEvaluator:
             try:
                 result = self.generator.run(prompt=prompt["prompt"])
             except Exception as e:
-                msg = f"Error while generating response for prompt: {prompt}. Error: {e}"
                 if self.raise_on_failure:
-                    raise ValueError(msg)
-                warn(msg)
+                    raise ValueError(f"Error while generating response for prompt: {prompt}. Error: {e}")
+                logger.warning("Error while generating response for prompt: {prompt}. Error: {e}", prompt=prompt, e=e)
                 results.append(None)
                 errors += 1
                 continue
@@ -225,8 +225,11 @@
             metadata = result["meta"]
 
         if errors > 0:
-            msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs."
-            warn(msg)
+            logger.warning(
+                "LLM evaluator failed for {errors} out of {len} inputs.",
+                errors=errors,
+                len=len(list_of_input_names_to_values),
+            )
 
         return {"results": results, "meta": metadata}
@@ -374,14 +377,19 @@
             msg = "Response from LLM evaluator is not a valid JSON."
             if self.raise_on_failure:
                 raise ValueError(msg)
-            warn(msg)
+            logger.warning(msg)
             return False
 
         if not all(output in parsed_output for output in expected):
-            msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}."
             if self.raise_on_failure:
-                raise ValueError(msg)
-            warn(msg)
+                raise ValueError(
+                    f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}."
+                )
+            logger.warning(
+                "Expected response from LLM evaluator to be JSON with keys {expected}, got {received}.",
+                expected=expected,
+                received=received,
+            )
             return False
 
         return True
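LLMEvaluator keeps its raise-or-warn split: with `raise_on_failure=True` the message is interpolated eagerly into a `ValueError`, otherwise the same text is logged with structured fields and execution continues. A reduced sketch of that branch (function name and signature are illustrative):

```python
from typing import List
from haystack import logging

logger = logging.getLogger(__name__)

def validate_keys(expected: List[str], received: List[str], raise_on_failure: bool) -> bool:
    # Illustrative reduction of the validation branch above: the exception
    # path formats eagerly; the logging path passes the values as fields.
    if raise_on_failure:
        raise ValueError(f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}.")
    logger.warning(
        "Expected response from LLM evaluator to be JSON with keys {expected}, got {received}.",
        expected=expected,
        received=received,
    )
    return False
```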

View File

@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import math
-import warnings
 
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -215,9 +214,10 @@ class ExtractiveReader:
         document_contents = []
         for i, doc in enumerate(documents):
             if doc.content is None:
-                warnings.warn(
-                    f"Document with id {doc.id} was passed to ExtractiveReader. The Document doesn't "
-                    f"contain any text and it will be ignored."
+                logger.warning(
+                    "Document with id {doc_id} was passed to ExtractiveReader. The Document doesn't "
+                    "contain any text and it will be ignored.",
+                    doc_id=doc.id,
                 )
                 continue
             texts.append(doc.content)
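Note the key rename here: the old f-string interpolated `{doc.id}` directly, but a structured placeholder has to match a keyword-argument name, and keyword names must be plain identifiers. So the attribute access moves to the call site and the placeholder becomes `{doc_id}`. A tiny sketch:

```python
from typing import Optional
from dataclasses import dataclass
from haystack import logging

logger = logging.getLogger(__name__)

@dataclass
class Doc:  # minimal stand-in for haystack.dataclasses.Document
    id: str
    content: Optional[str] = None

doc = Doc(id="abc123")
# `doc.id` is not a legal keyword name, so resolve the attribute at the
# call site and give the field a plain identifier.
logger.warning("Document with id {doc_id} contains no text.", doc_id=doc.id)
```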

View File

@@ -5,7 +5,6 @@
 import ast
 import contextlib
 from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Set, Union, get_args, get_origin
-from warnings import warn
 
 from jinja2 import Environment, TemplateSyntaxError, meta
 from jinja2.nativetypes import NativeEnvironment
@@ -192,7 +191,7 @@ class ConditionalRouter:
                 "Unsafe mode is enabled. This allows execution of arbitrary code in the Jinja template. "
                 "Use this only if you trust the source of the template."
             )
-            warn(msg)
+            logger.warning(msg)
 
         self._env = NativeEnvironment() if self._unsafe else SandboxedEnvironment()
         self._env.filters.update(self.custom_filters)
@@ -216,13 +215,11 @@
         # warn about unused optional variables
         unused_optional_vars = set(self.optional_variables) - input_types if self.optional_variables else None
         if unused_optional_vars:
-            msg = (
-                f"The following optional variables are specified but not used in any route: {unused_optional_vars}. "
-                "Check if there's a typo in variable names."
+            logger.warning(
+                "The following optional variables are specified but not used in any route: {unused_optional_vars}. "
+                "Check if there's a typo in variable names.",
+                unused_optional_vars=unused_optional_vars,
             )
-            # intentionally using both warn and logger
-            warn(msg, UserWarning)
-            logger.warning(msg)
 
         # add mandatory input types
         component.set_input_types(self, **dict.fromkeys(mandatory_input_types, Any))
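ConditionalRouter previously emitted this message through two channels on purpose, once as a `UserWarning` and once via the logger; the commit keeps only the logger call. That is also why the router test at the bottom of this diff stops using `pytest.warns`. Before and after, condensed into a runnable snippet:

```python
from warnings import warn
from haystack import logging

logger = logging.getLogger(__name__)
unused_optional_vars = {"unused_var1", "unused_var2"}

# Before: the same text went out twice, as a stdlib warning and a log record.
msg = f"The following optional variables are specified but not used in any route: {unused_optional_vars}."
warn(msg, UserWarning)
logger.warning(msg)

# After: a single structured emission.
logger.warning(
    "The following optional variables are specified but not used in any route: {unused_optional_vars}. "
    "Check if there's a typo in variable names.",
    unused_optional_vars=unused_optional_vars,
)
```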

View File

@@ -5,13 +5,15 @@
 import csv
 from copy import deepcopy
 from typing import Any, Dict, List, Literal, Optional, Union
-from warnings import warn
 
+from haystack import logging
 from haystack.lazy_imports import LazyImport
 
 with LazyImport("Run 'pip install pandas'") as pandas_import:
     from pandas import DataFrame
 
+logger = logging.getLogger(__name__)
+
 
 class EvaluationRunResult:
     """
@@ -188,10 +190,15 @@
             raise ValueError("The 'other' parameter must have 'run_name', 'inputs', and 'results' attributes.")
 
         if self.run_name == other.run_name:
-            warn(f"The run names of the two evaluation results are the same ('{self.run_name}')")
+            logger.warning(
+                "The run names of the two evaluation results are the same ('{run_name}')", run_name=self.run_name
+            )
 
         if self.inputs.keys() != other.inputs.keys():
-            warn(f"The input columns differ between the results; using the input columns of '{self.run_name}'.")
+            logger.warning(
+                "The input columns differ between the results; using the input columns of '{run_name}'.",
+                run_name=self.run_name,
+            )
 
         # got both detailed reports
         detailed_a = self.detailed_report(output_format="json")

View File

@@ -203,7 +203,7 @@ class TestHuggingFaceAPIDocumentEmbedder:
             "my_prefix document number 4 my_suffix",
         ]
 
-    def test_embed_batch(self, mock_check_valid_model, recwarn):
+    def test_embed_batch(self, mock_check_valid_model, caplog):
         texts = ["text 1", "text 2", "text 3", "text 4", "text 5"]
 
         with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
@@ -225,10 +225,10 @@
             assert len(embedding) == 384
             assert all(isinstance(x, float) for x in embedding)
 
-        # Check that warnings about ignoring truncate and normalize are raised
-        assert len(recwarn) == 2
-        assert "truncate" in str(recwarn[0].message)
-        assert "normalize" in str(recwarn[1].message)
+        # Check that logger warnings about ignoring truncate and normalize are raised
+        assert len(caplog.records) == 2
+        assert "truncate" in caplog.records[0].message
+        assert "normalize" in caplog.records[1].message
 
     def test_embed_batch_wrong_embedding_shape(self, mock_check_valid_model):
         texts = ["text 1", "text 2", "text 3", "text 4", "text 5"]

View File

@@ -136,7 +136,7 @@ class TestHuggingFaceAPITextEmbedder:
         with pytest.raises(TypeError):
             embedder.run(text=list_integers_input)
 
-    def test_run(self, mock_check_valid_model, recwarn):
+    def test_run(self, mock_check_valid_model, caplog):
         with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
             mock_embedding_patch.return_value = array([[random.random() for _ in range(384)]])
@@ -158,9 +158,9 @@
             assert all(isinstance(x, float) for x in result["embedding"])
 
         # Check that warnings about ignoring truncate and normalize are raised
-        assert len(recwarn) == 2
-        assert "truncate" in str(recwarn[0].message)
-        assert "normalize" in str(recwarn[1].message)
+        assert len(caplog.records) == 2
+        assert "truncate" in caplog.records[0].message
+        assert "normalize" in caplog.records[1].message
 
     def test_run_wrong_embedding_shape(self, mock_check_valid_model):
         # embedding ndim > 2
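On the test side, the `recwarn` fixture (which records `warnings.warn` output) gives way to pytest's `caplog` fixture (which records log records); the router test below makes the same move. A minimal standalone illustration of the assertion style, using stdlib logging and `caplog.at_level` to make capture explicit (the logger name is arbitrary):

```python
import logging

def test_warning_is_logged(caplog):
    with caplog.at_level(logging.WARNING):
        logging.getLogger("example").warning("`truncate` parameter is not supported. It will be ignored.")
    assert len(caplog.records) == 1
    assert "truncate" in caplog.records[0].message
```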

View File

@@ -525,7 +525,7 @@ class TestRouter:
         result = pipe.run(data={"router": {"question": "What?", "mode": "chat", "language": "en", "source": "doc"}})
         assert result["router"] == {"en_doc_chat": "What?"}, "Pipeline should handle all parameters"
 
-    def test_warns_on_unused_optional_variables(self):
+    def test_warns_on_unused_optional_variables(self, caplog):
         """
         Test that a warning is raised when optional_variables contains variables
         that are not used in any route conditions or outputs.
@@ -536,8 +536,8 @@
         ]
 
         # Initialize with unused optional variables and capture warning
-        with pytest.warns(UserWarning, match="optional variables"):
-            router = ConditionalRouter(routes=routes, optional_variables=["unused_var1", "unused_var2"])
+        router = ConditionalRouter(routes=routes, optional_variables=["unused_var1", "unused_var2"])
+        assert "optional variables" in caplog.records[0].message
 
         # Verify router still works normally
         result = router.run(question="What?", mode="chat")