diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
deleted file mode 100644
index 9550a5f42..000000000
--- a/haystack/components/evaluators/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .answer_exact_match import AnswerExactMatchEvaluator
-
-__all__ = ["AnswerExactMatchEvaluator"]
diff --git a/haystack/components/evaluators/answer_exact_match.py b/haystack/components/evaluators/answer_exact_match.py
deleted file mode 100644
index eb509e8be..000000000
--- a/haystack/components/evaluators/answer_exact_match.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from typing import Any, Dict, List
-
-from haystack import default_from_dict, default_to_dict
-from haystack.core.component import component
-
-
-@component
-class AnswerExactMatchEvaluator:
-    """
-    Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
-    matched one of the ground truth answers.
-    Each question can have multiple ground truth answers and multiple predicted answers.
-    """
-
-    def to_dict(self) -> Dict[str, Any]:
-        return default_to_dict(self)
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "AnswerExactMatchEvaluator":
-        return default_from_dict(cls, data)
-
-    @component.output_types(result=float)
-    def run(
-        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
-    ) -> Dict[str, float]:
-        """
-        Run the AnswerExactMatchEvaluator on the given inputs.
-        All lists must have the same length.
-
-        :param questions: A list of questions.
-        :param ground_truth_answers: A list of expected answers for each question.
-        :param predicted_answers: A list of predicted answers for each question.
-        :returns: A dictionary with the following outputs:
-                * `result` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
-                  answer matched one of the ground truth answers.
-        """
-        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
-            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
-
-        matches = 0
-        for truths, extracted in zip(ground_truth_answers, predicted_answers):
-            if set(truths) & set(extracted):
-                matches += 1
-
-        # The proportion of questions where any predicted answer matched one of the ground truth answers
-        result = matches / len(questions)
-
-        return {"result": result}
diff --git a/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml b/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml
deleted file mode 100644
index ad380617d..000000000
--- a/releasenotes/notes/exact-match-evaluator-197bb87b65e19d0c.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-features:
-  - |
-    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
-    given a list of questions, a list of expected answers for each question and the list of predicted
-    answers for each question.
diff --git a/test/components/evaluators/test_answer_exact_match.py b/test/components/evaluators/test_answer_exact_match.py
deleted file mode 100644
index c179c74a2..000000000
--- a/test/components/evaluators/test_answer_exact_match.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import pytest
-
-from haystack.components.evaluators import AnswerExactMatchEvaluator
-
-
-def test_run_with_all_matching():
-    evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Paris"]],
-    )
-
-    assert result["result"] == 1.0
-
-
-def test_run_with_no_matching():
-    evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
-    )
-
-    assert result["result"] == 0.0
-
-
-def test_run_with_partial_matching():
-    evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["London"]],
-    )
-
-    assert result["result"] == 0.5
-
-
-def test_run_with_different_lengths():
-    evaluator = AnswerExactMatchEvaluator()
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"]],
-        )
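
For reference, a minimal usage sketch of the component this diff removes, assembled from the `run()` signature and the tests deleted above; the import path is the pre-removal package layout and is no longer valid once this diff is applied:

    from haystack.components.evaluators import AnswerExactMatchEvaluator  # pre-removal import path

    # The evaluator takes no constructor arguments.
    evaluator = AnswerExactMatchEvaluator()

    # Each question may carry several ground-truth and several predicted answers;
    # all three lists must have the same length.
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["London"]],
    )

    # `result["result"]` is the fraction of questions where any predicted answer
    # exactly matched a ground-truth answer; here 1 of 2 questions matched.
    assert result["result"] == 0.5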