diff --git a/haystack/components/evaluators/answer_exact_match.py b/haystack/components/evaluators/answer_exact_match.py
index f6292f32c..dcd44408f 100644
--- a/haystack/components/evaluators/answer_exact_match.py
+++ b/haystack/components/evaluators/answer_exact_match.py
@@ -7,9 +7,9 @@ from haystack.core.component import component
 class AnswerExactMatchEvaluator:
     """
     Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
-    matched one of the ground truth answers.
-    Each question can have multiple ground truth answers and multiple predicted answers.
+    The result is a number from 0.0 to 1.0 that represents the proportion of inputs where any predicted answer
+    matched one of the ground truth answers.
+    There can be multiple ground truth answers and multiple predicted answers as input.
 
     Usage example:
     ```python
@@ -17,7 +17,6 @@ class AnswerExactMatchEvaluator:
 
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Berlin"], ["Lyon"]],
     )
@@ -30,15 +29,11 @@ class AnswerExactMatchEvaluator:
     """
 
     @component.output_types(individual_scores=List[int], score=float)
-    def run(
-        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
-    ) -> Dict[str, Any]:
+    def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
         """
         Run the AnswerExactMatchEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_answers` and `predicted_answers` must have the same length.
 
-        :param questions:
-            A list of questions.
         :param ground_truth_answers:
             A list of expected answers for each question.
         :param predicted_answers:
@@ -49,8 +44,8 @@ class AnswerExactMatchEvaluator:
                 - `score` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
                   answer matched one of the ground truth answers.
         """
-        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
-            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
+        if not len(ground_truth_answers) == len(predicted_answers):
+            raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
 
         matches = []
         for truths, extracted in zip(ground_truth_answers, predicted_answers):
@@ -60,6 +55,6 @@ class AnswerExactMatchEvaluator:
                 matches.append(0)
 
         # The proportion of questions where any predicted answer matched one of the ground truth answers
-        average = sum(matches) / len(questions)
+        average = sum(matches) / len(predicted_answers)
 
         return {"individual_scores": matches, "score": average}
diff --git a/haystack/components/evaluators/document_recall.py b/haystack/components/evaluators/document_recall.py
index 0aaa2bd17..4102aa1ff 100644
--- a/haystack/components/evaluators/document_recall.py
+++ b/haystack/components/evaluators/document_recall.py
@@ -31,16 +31,15 @@ class RecallMode(Enum):
 @component
 class DocumentRecallEvaluator:
     """
-    Evaluator that calculates the Recall score for a list of questions.
+    Evaluator that calculates the Recall score for a list of documents.
     Returns both a list of scores for each question and the average.
-    Each question can have multiple ground truth documents and multiple predicted documents.
+    There can be multiple ground truth documents and multiple predicted documents as input.
 
     Usage example:
     ```python
     from haystack.components.evaluators import DocumentRecallEvaluator
     evaluator = DocumentRecallEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Paris"], ["London"]],
     )
@@ -80,17 +79,12 @@ class DocumentRecallEvaluator:
 
     @component.output_types(score=float, individual_scores=List[float])
     def run(
-        self,
-        questions: List[str],
-        ground_truth_documents: List[List[Document]],
-        retrieved_documents: List[List[Document]],
+        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
     ) -> Dict[str, Any]:
         """
         Run the DocumentRecallEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_documents` and `retrieved_documents` must have the same length.
 
-        :param questions:
-            A list of questions.
         :param ground_truth_documents:
             A list of expected documents for each question.
         :param retrieved_documents:
@@ -100,8 +94,8 @@ class DocumentRecallEvaluator:
                 - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching
                   documents retrieved. If the mode is `single_hit`, the individual scores are True or False.
         """
-        if not len(questions) == len(ground_truth_documents) == len(retrieved_documents):
-            msg = "The length of questions, ground_truth_documents, and predicted_documents must be the same."
+        if len(ground_truth_documents) != len(retrieved_documents):
+            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
             raise ValueError(msg)
 
         scores = []
@@ -109,4 +103,4 @@ class DocumentRecallEvaluator:
             score = self.mode_function(ground_truth, retrieved)
             scores.append(score)
 
-        return {"score": sum(scores) / len(questions), "individual_scores": scores}
+        return {"score": sum(scores) / len(retrieved_documents), "individual_scores": scores}
diff --git a/test/components/evaluators/test_answer_exact_match.py b/test/components/evaluators/test_answer_exact_match.py
index 91e4647aa..9c7b395b2 100644
--- a/test/components/evaluators/test_answer_exact_match.py
+++ b/test/components/evaluators/test_answer_exact_match.py
@@ -5,33 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
 
 def test_run_with_all_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Paris"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
 
     assert result == {"individual_scores": [1, 1], "score": 1.0}
 
 
 def test_run_with_no_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
 
     assert result == {"individual_scores": [0, 0], "score": 0.0}
 
 
 def test_run_with_partial_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
 
     assert result == {"individual_scores": [1, 0], "score": 0.5}
 
@@ -39,14 +27,6 @@ def test_run_with_partial_matching():
 def test_run_with_complex_data():
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=[
-            "In what country is Normandy located?",
-            "When was the Latin version of the word Norman first recorded?",
-            "What developed in Normandy during the 1100s?",
-            "In what century did important classical music developments occur in Normandy?",
-            "From which countries did the Norse originate?",
-            "What century did the Normans first gain their separate identity?",
-        ],
         ground_truth_answers=[
             ["France"],
             ["9th century", "9th"],
@@ -71,22 +51,7 @@ def test_run_with_different_lengths():
     evaluator = AnswerExactMatchEvaluator()
 
     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
 
     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
diff --git a/test/components/evaluators/test_document_recall.py b/test/components/evaluators/test_document_recall.py
index d73406df0..56e77f02c 100644
--- a/test/components/evaluators/test_document_recall.py
+++ b/test/components/evaluators/test_document_recall.py
@@ -16,7 +16,6 @@ class TestDocumentRecallEvaluatorSingleHit:
 
     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -25,7 +24,6 @@ class TestDocumentRecallEvaluatorSingleHit:
 
     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -34,7 +32,6 @@ class TestDocumentRecallEvaluatorSingleHit:
 
     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -43,14 +40,6 @@ class TestDocumentRecallEvaluatorSingleHit:
 
     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -78,21 +67,12 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )
 
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
@@ -105,7 +85,6 @@ class TestDocumentRecallEvaluatorMultiHit:
 
     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -114,7 +93,6 @@ class TestDocumentRecallEvaluatorMultiHit:
 
     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -123,7 +101,6 @@ class TestDocumentRecallEvaluatorMultiHit:
 
     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -132,14 +109,6 @@ class TestDocumentRecallEvaluatorMultiHit:
 
     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -172,21 +141,12 @@ class TestDocumentRecallEvaluatorMultiHit:
    def test_run_with_different_lengths(self, evaluator):
        with pytest.raises(ValueError):
            evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )
 
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
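A minimal usage sketch of the updated `run()` signatures (no `questions` argument), based on the docstring examples and tests above. The variable names are illustrative, and `RecallMode.SINGLE_HIT` is passed explicitly only to avoid relying on the evaluator's default mode:

```python
from haystack import Document
from haystack.components.evaluators import AnswerExactMatchEvaluator, DocumentRecallEvaluator
from haystack.components.evaluators.document_recall import RecallMode

# Exact match: aligned lists of ground-truth and predicted answers, one entry per input.
exact_match = AnswerExactMatchEvaluator()
em_result = exact_match.run(
    ground_truth_answers=[["Berlin"], ["Paris"]],
    predicted_answers=[["Berlin"], ["Lyon"]],
)
print(em_result["individual_scores"])  # expected: [1, 0]
print(em_result["score"])  # expected: 0.5

# Recall: aligned lists of ground-truth and retrieved documents, one entry per input.
recall = DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT)
recall_result = recall.run(
    ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
    retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
print(recall_result["score"])  # expected: 0.5 (one hit out of two inputs)
```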