Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-08 21:37:12 +00:00
Add AnswerExactMatchEvaluator (#7050)
* Add AnswerExactMatchEvaluator
* Add release notes
* Fix linting
* Update docstrings
This commit is contained in:
parent bc8a48cc3c
commit b4011af8e9
haystack/components/evaluators/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .answer_exact_match import AnswerExactMatchEvaluator

__all__ = ["AnswerExactMatchEvaluator"]
haystack/components/evaluators/answer_exact_match.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from typing import Any, Dict, List

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


@component
class AnswerExactMatchEvaluator:
    """
    Evaluator that checks if the predicted answers match any of the ground truth answers exactly.
    The result is a number from 0.0 to 1.0; it represents the proportion of questions where any predicted answer
    matched one of the ground truth answers.
    Each question can have multiple ground truth answers and multiple predicted answers.
    """

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AnswerExactMatchEvaluator":
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(
        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
    ) -> Dict[str, float]:
        """
        Run the AnswerExactMatchEvaluator on the given inputs.
        All lists must have the same length.

        :param questions: A list of questions.
        :param ground_truth_answers: A list of expected answers for each question.
        :param predicted_answers: A list of predicted answers for each question.
        :returns: A dictionary with the following outputs:
            * `result` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
              answer matched one of the ground truth answers.
        """
        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")

        matches = 0
        for truths, extracted in zip(ground_truth_answers, predicted_answers):
            if set(truths) & set(extracted):
                matches += 1

        # The proportion of questions where any predicted answer matched one of the ground truth answers
        result = matches / len(questions)

        return {"result": result}
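As a minimal sketch (not part of the commit) of how the metric and the to_dict/from_dict hooks behave when the component is used standalone; the questions and answers below are invented for illustration:

from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Two of the three questions have a predicted answer that exactly matches
# one of their ground truth answers, so the returned proportion is 2 / 3.
result = evaluator.run(
    questions=["Capital of Germany?", "Capital of France?", "Capital of Italy?"],
    ground_truth_answers=[["Berlin"], ["Paris"], ["Rome", "Roma"]],
    predicted_answers=[["Berlin"], ["Lyon"], ["Rome"]],
)
assert result["result"] == 2 / 3

# The component takes no init parameters, so serialization is a plain
# round trip through default_to_dict / default_from_dict.
data = evaluator.to_dict()
restored = AnswerExactMatchEvaluator.from_dict(data)
assert isinstance(restored, AnswerExactMatchEvaluator)

Note that matching is done with set intersection, so it is case- and whitespace-sensitive: "berlin" would not count as a match for "Berlin".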
Release note (new file, 6 lines)
@@ -0,0 +1,6 @@
---
features:
  - |
    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
    given a list of questions, a list of expected answers for each question, and a list of predicted
    answers for each question.
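Because it is a regular Component, the evaluator can also be dropped into a pipeline. The following sketch (not part of the commit) assumes the standard Haystack 2.x Pipeline API of add_component / run; the component name "exact_match" is chosen arbitrarily:

from haystack import Pipeline
from haystack.components.evaluators import AnswerExactMatchEvaluator

# Wrap the evaluator in a pipeline; inputs are passed per component, keyed by its name.
pipeline = Pipeline()
pipeline.add_component("exact_match", AnswerExactMatchEvaluator())

outputs = pipeline.run(
    {
        "exact_match": {
            "questions": ["What is the capital of Germany?"],
            "ground_truth_answers": [["Berlin"]],
            "predicted_answers": [["Berlin"]],
        }
    }
)
print(outputs["exact_match"]["result"])  # 1.0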
test/components/evaluators/test_answer_exact_match.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import pytest

from haystack.components.evaluators import AnswerExactMatchEvaluator


def test_run_with_all_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["Paris"]],
    )

    assert result["result"] == 1.0


def test_run_with_no_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Paris"], ["London"]],
    )

    assert result["result"] == 0.0


def test_run_with_partial_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["London"]],
    )

    assert result["result"] == 0.5


def test_run_with_different_lengths():
    evaluator = AnswerExactMatchEvaluator()

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"]],
        )