Add AnswerExactMatchEvaluator (#7050)

* Add AnswerExactMatchEvaluator

* Add release notes

* Fix linting

* Update docstrings
Silvano Cerza 2024-02-23 10:37:18 +01:00 committed by GitHub
parent bc8a48cc3c
commit b4011af8e9
4 changed files with 119 additions and 0 deletions


@@ -0,0 +1,3 @@
from .answer_exact_match import AnswerExactMatchEvaluator

__all__ = ["AnswerExactMatchEvaluator"]


@@ -0,0 +1,49 @@
from typing import Any, Dict, List

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


@component
class AnswerExactMatchEvaluator:
    """
    Evaluator that checks if the predicted answers match any of the ground truth answers exactly.

    The result is a number from 0.0 to 1.0 that represents the proportion of questions where any predicted answer
    matched one of the ground truth answers.
    Each question can have multiple ground truth answers and multiple predicted answers.
    """

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AnswerExactMatchEvaluator":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(
        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
    ) -> Dict[str, float]:
        """
        Run the AnswerExactMatchEvaluator on the given inputs.

        All lists must have the same length.

        :param questions: A list of questions.
        :param ground_truth_answers: A list of expected answers for each question.
        :param predicted_answers: A list of predicted answers for each question.
        :raises ValueError: If the input lists don't all have the same length.
        :returns: A dictionary with the following outputs:
            * `result` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
              answer matched one of the ground truth answers.
        """
        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")

        matches = 0
        for truths, extracted in zip(ground_truth_answers, predicted_answers):
            # A question counts as a match if any predicted answer is an exact match for any ground truth answer
            if set(truths) & set(extracted):
                matches += 1

        # The proportion of questions where any predicted answer matched one of the ground truth answers
        result = matches / len(questions)

        return {"result": result}


@@ -0,0 +1,6 @@
---
features:
  - |
    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
    given a list of questions, a list of expected answers for each question, and a list of predicted
    answers for each question.
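
Because the class is decorated with `@component`, it should also work inside a Haystack `Pipeline` like any other component. A hedged sketch, assuming the standard Haystack 2.x `Pipeline.add_component` / `Pipeline.run` interface (the wiring below is an illustration, not part of this commit):

from haystack import Pipeline
from haystack.components.evaluators import AnswerExactMatchEvaluator

pipeline = Pipeline()
pipeline.add_component("em_evaluator", AnswerExactMatchEvaluator())

# Inputs are passed per component; the evaluator exposes a single `result` output
outputs = pipeline.run(
    {
        "em_evaluator": {
            "questions": ["What is the capital of Germany?", "What is the capital of France?"],
            "ground_truth_answers": [["Berlin"], ["Paris"]],
            "predicted_answers": [["Berlin"], ["Paris"]],
        }
    }
)
print(outputs["em_evaluator"]["result"])  # 1.0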


@@ -0,0 +1,61 @@
import pytest

from haystack.components.evaluators import AnswerExactMatchEvaluator


def test_run_with_all_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["Paris"]],
    )

    assert result["result"] == 1.0


def test_run_with_no_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Paris"], ["London"]],
    )

    assert result["result"] == 0.0


def test_run_with_partial_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["London"]],
    )

    assert result["result"] == 0.5


def test_run_with_different_lengths():
    evaluator = AnswerExactMatchEvaluator()

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"]],
        )