feat: Add ContextRelevanceEvaluator component (#7519)

* feat: Add ContextRelevanceEvaluator component

* reno

* fix expected inputs and example docstring

* remove responses parameter from tests

* specify inputs explicitly

* add new evaluator to api reference docs
Julian Risch 2024-04-22 14:10:00 +02:00 committed by GitHub
parent 5d0ccfe7d4
commit b12e0db134
5 changed files with 304 additions and 0 deletions

@@ -4,6 +4,7 @@ loaders:
modules:
[
"answer_exact_match",
"context_relevance",
"document_map",
"document_mrr",
"document_recall",

@@ -1,4 +1,5 @@
from .answer_exact_match import AnswerExactMatchEvaluator
from .context_relevance import ContextRelevanceEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_recall import DocumentRecallEvaluator
@@ -9,6 +10,7 @@ from .sas_evaluator import SASEvaluator
__all__ = [
"AnswerExactMatchEvaluator",
"ContextRelevanceEvaluator",
"DocumentMAPEvaluator",
"DocumentMRREvaluator",
"DocumentRecallEvaluator",

@@ -0,0 +1,154 @@
from typing import Any, Dict, List, Optional
from numpy import mean as np_mean
from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace
# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
{
"inputs": {
"questions": "What is the capital of Germany?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 0],
},
},
{
"inputs": {"questions": "What is the capital of France?", "contexts": ["Berlin is the capital of Germany."]},
"outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
},
{
"inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
"outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
},
]
class ContextRelevanceEvaluator(LLMEvaluator):
"""
Evaluator that checks if a provided context is relevant to the question.
An LLM extracts statements from the provided context and judges how relevant each statement is for answering the question.
The final score per question is a number from 0.0 to 1.0 and represents the proportion of extracted statements that are
relevant to the question.
Usage example:
```python
from haystack.components.evaluators import ContextRelevanceEvaluator
questions = ["Who created the Python language?"]
contexts = [
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
],
]
evaluator = ContextRelevanceEvaluator()
result = evaluator.run(questions=questions, contexts=contexts)
print(result["score"])
# 1.0
print(result["individual_scores"])
# [1.0]
print(result["results"])
# [{'statements': ['Python, created by Guido van Rossum in the late 1980s.'], 'statement_scores': [1], 'score': 1.0}]
```
"""
def __init__(
self,
examples: Optional[List[Dict[str, Any]]] = None,
api: str = "openai",
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
):
"""
Creates an instance of ContextRelevanceEvaluator.
:param examples:
Optional few-shot examples conforming to the expected input and output format of ContextRelevanceEvaluator.
Default examples will be used if none are provided.
Each example must be a dictionary with keys "inputs" and "outputs".
"inputs" must be a dictionary with keys "questions" and "contexts".
"outputs" must be a dictionary with "statements" and "statement_scores".
Expected format:
[{
"inputs": {
"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
},
"outputs": {
"statements": ["Rome is the capital of Italy."],
"statement_scores": [1],
},
}]
:param api:
The API to use for calling an LLM through a Generator.
Supported APIs: "openai".
:param api_key:
The API key.
"""
self.instructions = (
"Your task is to judge how relevant the provided context is for answering a question. "
"First, please extract statements from the provided context. "
"Second, calculate a relevance score for each statement in the context. "
"The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
)
self.inputs = [("questions", List[str]), ("contexts", List[List[str]])]
self.outputs = ["statements", "statement_scores"]
self.examples = examples or _DEFAULT_EXAMPLES
self.api = api
self.api_key = api_key
super().__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
examples=self.examples,
api=self.api,
api_key=self.api_key,
)
@component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
"""
Run the LLM evaluator.
:param questions:
A list of questions.
:param contexts:
A list of lists of contexts. Each list of contexts corresponds to one question.
:returns:
A dictionary with the following outputs:
- `score`: Mean context relevance score over all the provided input questions.
- `individual_scores`: A list of context relevance scores for each input question.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
"""
result = super().run(questions=questions, contexts=contexts)
# calculate average statement relevance score per query
for res in result["results"]:
res["score"] = np_mean(res["statement_scores"])
# calculate average context relevance score over all queries
result["score"] = np_mean([res["score"] for res in result["results"]])
result["individual_scores"] = [res["score"] for res in result["results"]]
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
"""
Deserialize this component from a dictionary.
:param data:
The dictionary representation of this component.
:returns:
The deserialized component instance.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)
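The snippet below is a minimal usage sketch, not part of the diff above, showing how the `examples` parameter documented in `__init__` could be overridden with custom few-shot examples; it assumes `OPENAI_API_KEY` is exported and the example content is purely illustrative.

```python
# Minimal sketch (not from this commit): overriding the default few-shot examples.
# Assumes OPENAI_API_KEY is set in the environment.
from haystack.components.evaluators import ContextRelevanceEvaluator

custom_examples = [
    {
        "inputs": {
            "questions": "What is the capital of Italy?",
            "contexts": ["Rome is the capital of Italy."],
        },
        "outputs": {
            "statements": ["Rome is the capital of Italy."],
            "statement_scores": [1],
        },
    },
]

evaluator = ContextRelevanceEvaluator(examples=custom_examples)
result = evaluator.run(
    questions=["What is the capital of Italy?"],
    contexts=[["Rome is the capital of Italy."]],
)

# `score` is the mean over questions of each question's mean statement score.
print(result["score"], result["individual_scores"])
```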

@@ -0,0 +1,5 @@
---
features:
- |
Add a new ContextRelevanceEvaluator component that can be used to evaluate whether documents retrieved by a RAG pipeline are relevant for answering a question.
Given a question and a list of retrieved document contents (contexts), an LLM scores how relevant the provided context is to the question. The score ranges from 0 to 1.
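As a rough illustration of this note (not part of the commit), the sketch below shows how `contexts` could be assembled from retrieved `Document` objects; the documents are hard-coded stand-ins for a retriever's output, and `OPENAI_API_KEY` is assumed to be set.

```python
# Illustrative sketch only: documents are hard-coded stand-ins for retriever output.
from haystack import Document
from haystack.components.evaluators import ContextRelevanceEvaluator

questions = ["Who created the Python language?"]
retrieved_docs_per_question = [
    [Document(content="Python, created by Guido van Rossum, is a high-level programming language.")]
]

# The evaluator expects plain strings, so pass each retrieved document's content.
contexts = [[doc.content for doc in docs] for docs in retrieved_docs_per_question]

evaluator = ContextRelevanceEvaluator()  # requires OPENAI_API_KEY in the environment
result = evaluator.run(questions=questions, contexts=contexts)
print(result["score"])              # mean relevance over all questions, between 0 and 1
print(result["individual_scores"])  # one score per question
```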

@@ -0,0 +1,142 @@
import os
from typing import List
import pytest
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils.auth import Secret
class TestContextRelevanceEvaluator:
def test_init_default(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.instructions == (
"Your task is to judge how relevant the provided context is for answering a question. "
"First, please extract statements from the provided context. "
"Second, calculate a relevance score for each statement in the context. "
"The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
)
assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]])]
assert component.outputs == ["statements", "statement_scores"]
assert component.examples == [
{
"inputs": {
"questions": "What is the capital of Germany?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 0],
},
},
{
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
},
"outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
},
{
"inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
"outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
},
]
def test_init_fail_wo_openai_api_key(self, monkeypatch):
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
with pytest.raises(ValueError, match="None of the .* environment variables are set"):
ContextRelevanceEvaluator()
def test_init_with_parameters(self):
component = ContextRelevanceEvaluator(
api_key=Secret.from_token("test-api-key"),
api="openai",
examples=[
{"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
],
)
assert component.generator.client.api_key == "test-api-key"
assert component.api == "openai"
assert component.examples == [
{"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
data = {
"type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
"init_parameters": {
"api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
"api": "openai",
"examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
},
}
component = ContextRelevanceEvaluator.from_dict(data)
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
def generator_run(self, *args, **kwargs):
if "Football" in kwargs["prompt"]:
return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
else:
return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}
monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
questions = ["Which is the most popular global sport?", "Who created the Python language?"]
contexts = [
[
"The popularity of sports can be measured in various ways, including TV viewership, social media "
"presence, number of participants, and economic impact. Football is undoubtedly the world's most "
"popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
"Messi, drawing a followership of more than 4 billion people."
],
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
"language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
"programmers write clear, logical code for both small and large-scale software projects."
],
]
results = component.run(questions=questions, contexts=contexts)
assert results == {
"individual_scores": [0.5, 1],
"results": [
{"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
{"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
],
"score": 0.75,
}
def test_run_missing_parameters(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
with pytest.raises(TypeError, match="missing 2 required positional arguments"):
component.run()
@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.integration
def test_live_run(self):
questions = ["Who created the Python language?"]
contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]
evaluator = ContextRelevanceEvaluator()
result = evaluator.run(questions=questions, contexts=contexts)
assert result["score"] == 1.0
assert result["individual_scores"] == [1.0]
assert result["results"][0]["score"] == 1.0
assert result["results"][0]["statement_scores"] == [1.0]
assert "Guido van Rossum" in result["results"][0]["statements"][0]