refactor: Remove questions inputs from evaluators (#7466)

* Remove questions input from AnswerExactMatchEvaluator

* Remove questions input from DocumentRecallEvaluator
Silvano Cerza 2024-04-04 14:14:18 +02:00 committed by GitHub
parent 12acb3f12e
commit dc87f51759
4 changed files with 20 additions and 106 deletions


@@ -7,9 +7,9 @@ from haystack.core.component import component
 class AnswerExactMatchEvaluator:
     """
     Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
-    matched one of the ground truth answers.
-    Each question can have multiple ground truth answers and multiple predicted answers.
+    The result is a number from 0.0 to 1.0, it represents the proportion any predicted answer
+    that matched one of the ground truth answers.
+    There can be multiple ground truth answers and multiple predicted answers as input.
 
     Usage example:
     ```python
@@ -17,7 +17,6 @@ class AnswerExactMatchEvaluator:
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Berlin"], ["Lyon"]],
     )
@@ -30,15 +29,11 @@ class AnswerExactMatchEvaluator:
     """
 
     @component.output_types(individual_scores=List[int], score=float)
-    def run(
-        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
-    ) -> Dict[str, Any]:
+    def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
         """
         Run the AnswerExactMatchEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_answers` and `retrieved_answers` must have the same length.
 
-        :param questions:
-            A list of questions.
         :param ground_truth_answers:
            A list of expected answers for each question.
         :param predicted_answers:
@@ -49,8 +44,8 @@ class AnswerExactMatchEvaluator:
             - `score` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
               answer matched one of the ground truth answers.
         """
-        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
-            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
+        if not len(ground_truth_answers) == len(predicted_answers):
+            raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
 
         matches = []
         for truths, extracted in zip(ground_truth_answers, predicted_answers):
@@ -60,6 +55,6 @@ class AnswerExactMatchEvaluator:
                 matches.append(0)
 
         # The proportion of questions where any predicted answer matched one of the ground truth answers
-        average = sum(matches) / len(questions)
+        average = sum(matches) / len(predicted_answers)
 
         return {"individual_scores": matches, "score": average}


@@ -31,16 +31,15 @@ class RecallMode(Enum):
 @component
 class DocumentRecallEvaluator:
     """
-    Evaluator that calculates the Recall score for a list of questions.
+    Evaluator that calculates the Recall score for a list of documents.
     Returns both a list of scores for each question and the average.
-    Each question can have multiple ground truth documents and multiple predicted documents.
+    There can be multiple ground truth documents and multiple predicted documents as input.
 
     Usage example:
     ```python
     from haystack.components.evaluators import DocumentRecallEvaluator
     evaluator = DocumentRecallEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Paris"], ["London"]],
     )
@@ -80,17 +79,12 @@ class DocumentRecallEvaluator:
     @component.output_types(score=float, individual_scores=List[float])
     def run(
-        self,
-        questions: List[str],
-        ground_truth_documents: List[List[Document]],
-        retrieved_documents: List[List[Document]],
+        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
     ) -> Dict[str, Any]:
         """
         Run the DocumentRecallEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_documents` and `retrieved_documents` must have the same length.
 
-        :param questions:
-            A list of questions.
         :param ground_truth_documents:
             A list of expected documents for each question.
         :param retrieved_documents:
@@ -100,8 +94,8 @@ class DocumentRecallEvaluator:
             - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
              If the mode is `single_hit`, the individual scores are True or False.
         """
-        if not len(questions) == len(ground_truth_documents) == len(retrieved_documents):
-            msg = "The length of questions, ground_truth_documents, and predicted_documents must be the same."
+        if len(ground_truth_documents) != len(retrieved_documents):
+            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
             raise ValueError(msg)
 
         scores = []
@@ -109,4 +103,4 @@ class DocumentRecallEvaluator:
             score = self.mode_function(ground_truth, retrieved)
             scores.append(score)
 
-        return {"score": sum(scores) / len(questions), "individual_scores": scores}
+        return {"score": sum(scores) / len(retrieved_documents), "individual_scores": scores}


@@ -5,33 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
 def test_run_with_all_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Paris"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
 
     assert result == {"individual_scores": [1, 1], "score": 1.0}
 
 
 def test_run_with_no_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
 
     assert result == {"individual_scores": [0, 0], "score": 0.0}
 
 
 def test_run_with_partial_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
 
     assert result == {"individual_scores": [1, 0], "score": 0.5}
@@ -39,14 +27,6 @@ def test_run_with_partial_matching():
 def test_run_with_complex_data():
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=[
-            "In what country is Normandy located?",
-            "When was the Latin version of the word Norman first recorded?",
-            "What developed in Normandy during the 1100s?",
-            "In what century did important classical music developments occur in Normandy?",
-            "From which countries did the Norse originate?",
-            "What century did the Normans first gain their separate identity?",
-        ],
         ground_truth_answers=[
             ["France"],
             ["9th century", "9th"],
@@ -71,22 +51,7 @@ def test_run_with_different_lengths():
     evaluator = AnswerExactMatchEvaluator()
     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
 
     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
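The test updates double as a migration guide for callers: the `questions` keyword is simply dropped and everything else stays the same. A minimal sketch of the call-site change, using the same toy data as the tests (the commented output reflects the dictionary format asserted above):

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Before #7466 the call also carried the questions:
# result = evaluator.run(
#     questions=["What is the capital of Germany?"],
#     ground_truth_answers=[["Berlin"]],
#     predicted_answers=[["Berlin"]],
# )

# After #7466 only the two answer lists are passed:
result = evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"]])
print(result)  # {'individual_scores': [1], 'score': 1.0}
```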


@@ -16,7 +16,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -25,7 +24,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -34,7 +32,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -43,14 +40,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -78,21 +67,12 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )
 
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
@@ -105,7 +85,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -114,7 +93,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -123,7 +101,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -132,14 +109,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -172,21 +141,12 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )
 
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
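The two test classes exercise the two recall modes. A short usage sketch of how a caller picks a mode under the new signature; the `RecallMode` enum comes from the hunk context above, while the default mode and the exact constructor argument are assumptions, not something this diff shows:

```python
from haystack import Document
from haystack.components.evaluators.document_recall import DocumentRecallEvaluator, RecallMode

# Assumption: single-hit is the default mode, matching the first test class.
single_hit = DocumentRecallEvaluator()
multi_hit = DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT)

ground_truth = [[Document(content="Berlin"), Document(content="Munich")]]
retrieved = [[Document(content="Berlin"), Document(content="London")]]

# Both calls use the post-refactor signature: no questions argument.
print(single_hit.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved))
print(multi_hit.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved))
```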