feat: Add FaithfulnessEvaluator component (#7424)

* draft FaithfulnessEvaluator

* reno

* calculate score per statement and aggregate

* Update release note

* update default values in tests and fix import path

* remove instructions, inputs, outputs params

* remove unused imports

* add expected format example to docstring

* remove name 'llm' from tests and docstring
Julian Risch 2024-04-04 18:33:59 +02:00 committed by GitHub
parent 189dfaf640
commit 9d02dc607a
4 changed files with 298 additions and 0 deletions

haystack/components/evaluators/__init__.py

@@ -2,6 +2,7 @@ from .answer_exact_match import AnswerExactMatchEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_recall import DocumentRecallEvaluator
from .faithfulness import FaithfulnessEvaluator
from .llm_evaluator import LLMEvaluator
from .sas_evaluator import SASEvaluator
@@ -10,6 +11,7 @@ __all__ = [
"DocumentMAPEvaluator",
"DocumentMRREvaluator",
"DocumentRecallEvaluator",
"FaithfulnessEvaluator",
"LLMEvaluator",
"SASEvaluator",
]

haystack/components/evaluators/faithfulness.py

@@ -0,0 +1,161 @@
from typing import Any, Dict, List, Optional
from numpy import mean as np_mean
from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace
class FaithfulnessEvaluator(LLMEvaluator):
"""
Evaluator that checks if a generated answer can be inferred from the provided contexts.
    An LLM splits the answer into individual statements and checks whether each statement can be inferred from the
    provided context. The final score for the full answer is a number from 0.0 to 1.0: the proportion of
    statements that can be inferred from the provided contexts.
Usage example:
```python
from haystack.components.evaluators import FaithfulnessEvaluator
questions = ["Who created the Python language?"]
contexts = [
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
],
]
responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
    print(result)
# {'results': [{'statements': ['Python is a high-level general-purpose programming language.',
# 'Python was created by George Lucas.'], 'statement_scores':
# [1, 0], 'score': 0.5}], 'score': 0.5, 'individual_scores': [0.5]}
```
"""
def __init__(
self,
examples: Optional[List[Dict[str, Any]]] = None,
api: str = "openai",
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
):
"""
        Creates an instance of FaithfulnessEvaluator.
:param examples:
Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
Each example must be a dictionary with keys "inputs" and "outputs".
"inputs" must be a dictionary with keys "questions", "contexts", and "responses".
"outputs" must be a dictionary with "statements" and "statement_scores".
Expected format:
[{
"inputs": {
"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
"statement_scores": [1, 0],
},
}]
:param api:
The API to use for calling an LLM through a Generator.
Supported APIs: "openai".
:param api_key:
The API key.
"""
self.instructions = (
"Your task is to judge the faithfulness or groundedness of statements based "
"on context information. First, please extract statements from a provided "
"response to a question. Second, calculate a faithfulness score for each "
"statement made in the response. The score is 1 if the statement can be "
"inferred from the provided context or 0 if it cannot be inferred."
)
self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
self.outputs = ["statements", "statement_scores"]
self.examples = examples or [
{
"inputs": {
"questions": "What is the capital of Germany and when was it founded?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 1],
},
},
{
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
"responses": "Paris",
},
"outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
},
{
"inputs": {
"questions": "What is the capital of Italy?",
"contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
"statement_scores": [1, 0],
},
},
]
self.api = api
self.api_key = api_key
super().__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
examples=self.examples,
api=self.api,
api_key=self.api_key,
)
@component.output_types(results=List[Dict[str, Any]])
def run(self, **inputs) -> Dict[str, Any]:
"""
        Run the faithfulness evaluator.
:param inputs:
The input values to evaluate. The keys are the input names and the values are lists of input values.
:returns:
A dictionary with the following outputs:
- `score`: Mean faithfulness score over all the provided input answers.
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
result = super().run(**inputs)
# calculate average statement faithfulness score per query
for res in result["results"]:
res["score"] = np_mean(res["statement_scores"])
# calculate average answer faithfulness score over all queries
result["score"] = np_mean([res["score"] for res in result["results"]])
result["individual_scores"] = [res["score"] for res in result["results"]]
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FaithfulnessEvaluator":
"""
Deserialize this component from a dictionary.
:param data:
The dictionary representation of this component.
:returns:
The deserialized component instance.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)
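
The aggregation in `run` above is a mean of per-answer means. A minimal standalone sketch of that arithmetic (the statements and scores are illustrative placeholders, matching the test file below):

```python
from numpy import mean as np_mean

# Per-answer statement scores, e.g. as parsed from the LLM replies.
results = [
    {"statements": ["a", "b"], "statement_scores": [1, 0]},
    {"statements": ["c", "d"], "statement_scores": [1, 1]},
]

# Per-answer score: the fraction of statements inferable from the contexts.
for res in results:
    res["score"] = np_mean(res["statement_scores"])  # 0.5 and 1.0

# Overall score: the mean of the per-answer scores.
score = np_mean([res["score"] for res in results])
print(score)  # 0.75
```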


@@ -0,0 +1,6 @@
---
features:
- |
Add a new FaithfulnessEvaluator component that can be used to evaluate faithfulness / groundedness / hallucinations of LLMs in a RAG pipeline.
Given a question, a list of retrieved document contents (contexts), and a predicted answer, FaithfulnessEvaluator returns a score ranging from 0 (poor faithfulness) to 1 (perfect faithfulness).
    The score is the proportion of statements in the predicted answer that can be inferred from the documents.

test/components/evaluators/test_faithfulness.py

@@ -0,0 +1,129 @@
from typing import List
import pytest
from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils.auth import Secret
class TestFaithfulnessEvaluator:
def test_init_default(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = FaithfulnessEvaluator()
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.instructions == (
"Your task is to judge the faithfulness or groundedness of statements based "
"on context information. First, please extract statements from a provided "
"response to a question. Second, calculate a faithfulness score for each "
"statement made in the response. The score is 1 if the statement can be "
"inferred from the provided context or 0 if it cannot be inferred."
)
assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
assert component.outputs == ["statements", "statement_scores"]
assert component.examples == [
{
"inputs": {
"questions": "What is the capital of Germany and when was it founded?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 1],
},
},
{
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
"responses": "Paris",
},
"outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
},
{
"inputs": {
"questions": "What is the capital of Italy?",
"contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
"statement_scores": [1, 0],
},
},
]
def test_init_fail_wo_openai_api_key(self, monkeypatch):
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
with pytest.raises(ValueError, match="None of the .* environment variables are set"):
FaithfulnessEvaluator()
def test_init_with_parameters(self):
component = FaithfulnessEvaluator(
api_key=Secret.from_token("test-api-key"),
api="openai",
examples=[
{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
],
)
assert component.generator.client.api_key == "test-api-key"
assert component.api == "openai"
assert component.examples == [
{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
data = {
"type": "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator",
"init_parameters": {
"api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
"api": "openai",
"examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
},
}
component = FaithfulnessEvaluator.from_dict(data)
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
]
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = FaithfulnessEvaluator()
def generator_run(self, *args, **kwargs):
if "Football" in kwargs["prompt"]:
return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
else:
return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}
monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
questions = ["Which is the most popular global sport?", "Who created the Python language?"]
contexts = [
[
"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people."
],
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
],
]
responses = [
"Football is the most popular sport with around 4 billion followers worldwide.",
"Python is a high-level general-purpose programming language that was created by George Lucas.",
]
results = component.run(questions=questions, contexts=contexts, responses=responses)
assert results == {
"individual_scores": [0.5, 1],
"results": [
{"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
{"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
],
"score": 0.75,
}
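
As the `from_dict` test above shows, the component can be restored from its serialized form. A minimal round-trip sketch (assumes `OPENAI_API_KEY` is set in the environment; the dict mirrors the shape used in the test):

```python
from haystack.components.evaluators import FaithfulnessEvaluator

data = {
    "type": "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator",
    "init_parameters": {
        "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
        "api": "openai",
        "examples": None,  # None falls back to the built-in few-shot examples
    },
}
evaluator = FaithfulnessEvaluator.from_dict(data)
```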