diff --git a/haystack/components/evaluators/answer_exact_match.py b/haystack/components/evaluators/answer_exact_match.py
index dcd44408f..db5e72259 100644
--- a/haystack/components/evaluators/answer_exact_match.py
+++ b/haystack/components/evaluators/answer_exact_match.py
@@ -6,10 +6,11 @@ from haystack.core.component import component
 @component
 class AnswerExactMatchEvaluator:
     """
-    Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion any predicted answer
-    that matched one of the ground truth answers.
-    There can be multiple ground truth answers and multiple predicted answers as input.
+    Evaluator that checks if predicted answers exactly match ground truth answers.
+
+    Each predicted answer is compared to one ground truth answer.
+    The final score is a number ranging from 0.0 to 1.0.
+    It represents the proportion of predicted answers that match their corresponding ground truth answer.
 
     Usage example:
     ```python
@@ -17,8 +18,8 @@ class AnswerExactMatchEvaluator:
 
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Lyon"]],
+        ground_truth_answers=["Berlin", "Paris"],
+        predicted_answers=["Berlin", "Lyon"],
     )
 
     print(result["individual_scores"])
@@ -29,15 +30,15 @@ class AnswerExactMatchEvaluator:
     """
 
     @component.output_types(individual_scores=List[int], score=float)
-    def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
+    def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
         """
         Run the AnswerExactMatchEvaluator on the given inputs.
         `ground_truth_answers` and `retrieved_answers` must have the same length.
 
         :param ground_truth_answers:
-            A list of expected answers for each question.
+            A list of expected answers.
         :param predicted_answers:
-            A list of predicted answers for each question.
+            A list of predicted answers.
         :returns:
             A dictionary with the following outputs:
             - `individual_scores` - A list of 0s and 1s, where 1 means that the predicted answer matched one of the ground truth.
@@ -48,8 +49,8 @@ class AnswerExactMatchEvaluator:
             raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
 
         matches = []
-        for truths, extracted in zip(ground_truth_answers, predicted_answers):
-            if set(truths) & set(extracted):
+        for truth, extracted in zip(ground_truth_answers, predicted_answers):
+            if truth == extracted:
                 matches.append(1)
             else:
                 matches.append(0)
diff --git a/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml b/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml
index ad380617d..c872542be 100644
--- a/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml
+++ b/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml
@@ -1,6 +1,5 @@
 ---
 features:
   - |
-    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
-    given a list of questions, a list of expected answers for each question and the list of predicted
-    answers for each question.
+    Add `AnswerExactMatchEvaluator`, a component that can be used to calculate the Exact Match metric
+    comparing a list of expected answers with a list of predicted answers.
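For context, a minimal usage sketch of the reworked evaluator after this change, mirroring the updated docstring example (flat lists of strings in, `individual_scores` and `score` out). This is an illustration, not part of the patch:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Each predicted answer is compared to the ground truth answer at the same position.
result = evaluator.run(
    ground_truth_answers=["Berlin", "Paris"],
    predicted_answers=["Berlin", "Lyon"],
)

print(result["individual_scores"])  # [1, 0] -> only the first position is an exact match
print(result["score"])              # 0.5    -> proportion of matching positions
```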
diff --git a/test/components/evaluators/test_answer_exact_match.py b/test/components/evaluators/test_answer_exact_match.py
index 9c7b395b2..2aa4fe7ea 100644
--- a/test/components/evaluators/test_answer_exact_match.py
+++ b/test/components/evaluators/test_answer_exact_match.py
@@ -5,21 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
 
 def test_run_with_all_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
+    result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "Paris"])
 
     assert result == {"individual_scores": [1, 1], "score": 1.0}
 
 
 def test_run_with_no_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
+    result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Paris", "London"])
 
     assert result == {"individual_scores": [0, 0], "score": 0.0}
 
 
 def test_run_with_partial_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
+    result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "London"])
 
     assert result == {"individual_scores": [1, 0], "score": 0.5}
 
@@ -28,30 +28,42 @@ def test_run_with_complex_data():
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
         ground_truth_answers=[
-            ["France"],
-            ["9th century", "9th"],
-            ["classical music", "classical"],
-            ["11th century", "the 11th"],
-            ["Denmark", "Iceland", "Norway"],
-            ["10th century", "10th"],
+            "France",
+            "9th century",
+            "9th",
+            "classical music",
+            "classical",
+            "11th century",
+            "the 11th",
+            "Denmark",
+            "Iceland",
+            "Norway",
+            "10th century",
+            "10th",
         ],
         predicted_answers=[
-            ["France"],
-            ["9th century", "10th century", "9th"],
-            ["classic music", "rock music", "dubstep"],
-            ["11th", "the 11th", "11th century"],
-            ["Denmark, Iceland and Norway"],
-            ["10th century", "the first half of the 10th century", "10th", "10th"],
+            "France",
+            "9th century",
+            "10th century",
+            "9th",
+            "classic music",
+            "rock music",
+            "dubstep",
+            "the 11th",
+            "11th century",
+            "Denmark, Iceland and Norway",
+            "10th century",
+            "10th",
         ],
     )
 
-    assert result == {"individual_scores": [1, 1, 0, 1, 0, 1], "score": 0.6666666666666666}
+    assert result == {"individual_scores": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], "score": 0.3333333333333333}
 
 
 def test_run_with_different_lengths():
     evaluator = AnswerExactMatchEvaluator()
     with pytest.raises(ValueError):
-        evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
+        evaluator.run(ground_truth_answers=["Berlin"], predicted_answers=["Berlin", "London"])
 
     with pytest.raises(ValueError):
-        evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
+        evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin"])
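The updated tests assume strictly positional, case-sensitive string equality. A self-contained sketch of that comparison logic follows; `positional_exact_match` is a hypothetical stand-in used for illustration, not the component itself:

```python
from typing import Dict, List, Union


def positional_exact_match(
    ground_truth_answers: List[str], predicted_answers: List[str]
) -> Dict[str, Union[List[int], float]]:
    """Score 1 at each position where the prediction equals the ground truth, else 0."""
    if len(ground_truth_answers) != len(predicted_answers):
        raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")

    individual_scores = [int(truth == predicted) for truth, predicted in zip(ground_truth_answers, predicted_answers)]
    return {"individual_scores": individual_scores, "score": sum(individual_scores) / len(individual_scores)}


# Mirrors test_run_with_partial_matching from the diff above.
assert positional_exact_match(["Berlin", "Paris"], ["Berlin", "London"]) == {
    "individual_scores": [1, 0],
    "score": 0.5,
}
```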