refactor: AnswerExactMatchEvaluator component inputs (#7536)

* refactor component inputs

* release notes

* Update class docstring

* pylint

* update existing note instead of creating a new one

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Massimiliano Pippi 2024-04-12 08:59:16 +02:00 committed by GitHub
parent e90ffafb47
commit 2bad5bcb96
3 changed files with 44 additions and 32 deletions


@@ -6,10 +6,11 @@ from haystack.core.component import component
@component
class AnswerExactMatchEvaluator:
"""
Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
The result is a number from 0.0 to 1.0, it represents the proportion any predicted answer
that matched one of the ground truth answers.
There can be multiple ground truth answers and multiple predicted answers as input.
Evaluator that checks if predicted answers exactly match ground truth answers.
Each predicted answer is compared to one ground truth answer.
The final score is a number ranging from 0.0 to 1.0.
It represents the proportion of predicted answers that match their corresponding ground truth answer.
Usage example:
```python
@@ -17,8 +18,8 @@ class AnswerExactMatchEvaluator:
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
ground_truth_answers=[["Berlin"], ["Paris"]],
predicted_answers=[["Berlin"], ["Lyon"]],
ground_truth_answers=["Berlin", "Paris"],
predicted_answers=["Berlin", "Lyon"],
)
print(result["individual_scores"])
@@ -29,15 +30,15 @@ class AnswerExactMatchEvaluator:
"""
@component.output_types(individual_scores=List[int], score=float)
def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
"""
Run the AnswerExactMatchEvaluator on the given inputs.
`ground_truth_answers` and `predicted_answers` must have the same length.
:param ground_truth_answers:
A list of expected answers for each question.
A list of expected answers.
:param predicted_answers:
A list of predicted answers for each question.
A list of predicted answers.
:returns:
A dictionary with the following outputs:
- `individual_scores` - A list of 0s and 1s, where 1 means that the predicted answer matched its corresponding ground truth answer.
@@ -48,8 +49,8 @@ class AnswerExactMatchEvaluator:
raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
matches = []
for truths, extracted in zip(ground_truth_answers, predicted_answers):
if set(truths) & set(extracted):
for truth, extracted in zip(ground_truth_answers, predicted_answers):
if truth == extracted:
matches.append(1)
else:
matches.append(0)

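For context, the hunks above end before the method's return statement, so the aggregation of `matches` into the final `score` is not visible here. A minimal standalone sketch of the refactored one-to-one comparison follows; the `exact_match` function name is illustrative (not part of the Haystack API), and the mean-of-matches aggregation is an assumption inferred from the 0.0 to 1.0 range in the docstring and the expected values in the tests below:

```python
from typing import Any, Dict, List


def exact_match(ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
    """Illustrative re-implementation of the refactored comparison logic."""
    if len(ground_truth_answers) != len(predicted_answers):
        raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
    # One-to-one comparison: each predicted answer is checked against the
    # ground truth answer at the same position.
    matches = [1 if truth == predicted else 0 for truth, predicted in zip(ground_truth_answers, predicted_answers)]
    # Assumed aggregation: proportion of exact matches.
    return {"individual_scores": matches, "score": sum(matches) / len(matches)}


print(exact_match(["Berlin", "Paris"], ["Berlin", "Lyon"]))
# {'individual_scores': [1, 0], 'score': 0.5}
```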

@@ -1,6 +1,5 @@
---
features:
- |
Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
given a list of questions, a list of expected answers for each question and the list of predicted
answers for each question.
Add `AnswerExactMatchEvaluator`, a component that can be used to calculate the Exact Match metric
comparing a list of expected answers with a list of predicted answers.

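Concretely, the flat lists described in the note map onto the updated `run` signature as in the docstring example above; the printed values below follow from the one-to-one comparison exercised by the tests that follow:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
    ground_truth_answers=["Berlin", "Paris"],
    predicted_answers=["Berlin", "Lyon"],
)
print(result["individual_scores"])  # [1, 0]
print(result["score"])              # 0.5
```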

@@ -5,21 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
def test_run_with_all_matching():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "Paris"])
assert result == {"individual_scores": [1, 1], "score": 1.0}
def test_run_with_no_matching():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Paris", "London"])
assert result == {"individual_scores": [0, 0], "score": 0.0}
def test_run_with_partial_matching():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "London"])
assert result == {"individual_scores": [1, 0], "score": 0.5}
@@ -28,30 +28,42 @@ def test_run_with_complex_data():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
ground_truth_answers=[
["France"],
["9th century", "9th"],
["classical music", "classical"],
["11th century", "the 11th"],
["Denmark", "Iceland", "Norway"],
["10th century", "10th"],
"France",
"9th century",
"9th",
"classical music",
"classical",
"11th century",
"the 11th",
"Denmark",
"Iceland",
"Norway",
"10th century",
"10th",
],
predicted_answers=[
["France"],
["9th century", "10th century", "9th"],
["classic music", "rock music", "dubstep"],
["11th", "the 11th", "11th century"],
["Denmark, Iceland and Norway"],
["10th century", "the first half of the 10th century", "10th", "10th"],
"France",
"9th century",
"10th century",
"9th",
"classic music",
"rock music",
"dubstep",
"the 11th",
"11th century",
"Denmark, Iceland and Norway",
"10th century",
"10th",
],
)
assert result == {"individual_scores": [1, 1, 0, 1, 0, 1], "score": 0.6666666666666666}
assert result == {"individual_scores": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], "score": 0.3333333333333333}
def test_run_with_different_lengths():
evaluator = AnswerExactMatchEvaluator()
with pytest.raises(ValueError):
evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
evaluator.run(ground_truth_answers=["Berlin"], predicted_answers=["Berlin", "London"])
with pytest.raises(ValueError):
evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin"])
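The new expected value in `test_run_with_complex_data` follows from the flattened, one-to-one comparison: of the 12 (ground truth, prediction) pairs, only "France", "9th century", "10th century", and "10th" line up exactly with their predictions, so the expected score drops from 4/6 under the old any-of-the-ground-truths semantics to 4/12. A quick check of the arithmetic:

```python
# Four of the twelve pairs match exactly, so the score is 4 / 12.
individual_scores = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
assert sum(individual_scores) / len(individual_scores) == 0.3333333333333333
```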