diff --git a/haystack/components/evaluators/document_mrr.py b/haystack/components/evaluators/document_mrr.py
new file mode 100644
index 000000000..d0194902a
--- /dev/null
+++ b/haystack/components/evaluators/document_mrr.py
@@ -0,0 +1,79 @@
+from typing import Any, Dict, List
+
+from haystack import Document, component
+
+
+@component
+class DocumentMeanReciprocalRank:
+    """
+    Evaluator that calculates the mean reciprocal rank of the retrieved documents.
+
+    MRR measures how high the first relevant document is ranked in the retrieved results.
+    Each question can have multiple ground truth documents and multiple retrieved documents.
+
+    `DocumentMeanReciprocalRank` doesn't normalize its inputs; the `DocumentCleaner` component
+    should be used to clean and normalize the documents before passing them to this evaluator.
+
+    Usage example:
+    ```python
+    from haystack.components.evaluators import DocumentMeanReciprocalRank
+    evaluator = DocumentMeanReciprocalRank()
+    result = evaluator.run(
+        ground_truth_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="9th")],
+        ],
+        retrieved_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
+        ],
+    )
+    print(result["individual_scores"])
+    # [1.0, 1.0]
+    print(result["score"])
+    # 1.0
+    ```
+    """
+
+    @component.output_types(score=float, individual_scores=List[float])
+    def run(
+        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
+    ) -> Dict[str, Any]:
+        """
+        Run the DocumentMeanReciprocalRank evaluator on the given inputs.
+
+        `ground_truth_documents` and `retrieved_documents` must have the same length.
+
+        :param ground_truth_documents:
+            A list of expected documents for each question.
+        :param retrieved_documents:
+            A list of retrieved documents for each question.
+        :returns:
+            A dictionary with the following outputs:
+            - `score` - The average of the calculated scores.
+            - `individual_scores` - A list of numbers from 0.0 to 1.0 that represent how high the first relevant document is ranked for each question.
+        """
+        if len(ground_truth_documents) != len(retrieved_documents):
+            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
+            raise ValueError(msg)
+
+        individual_scores = []
+
+        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
+            score = 0.0
+            for ground_document in ground_truth:
+                if ground_document.content is None:
+                    continue
+
+                for rank, retrieved_document in enumerate(retrieved):
+                    if retrieved_document.content is None:
+                        continue
+
+                    if ground_document.content in retrieved_document.content:
+                        # Keep the best (highest) reciprocal rank seen so far, so a later
+                        # ground truth document with a worse rank can't overwrite it.
+                        score = max(score, 1 / (rank + 1))
+                        break
+            individual_scores.append(score)
+
+        score = sum(individual_scores) / len(retrieved_documents)
+
+        return {"score": score, "individual_scores": individual_scores}
diff --git a/releasenotes/notes/document-mrr-evaluator-fa7c266cc91201a7.yaml b/releasenotes/notes/document-mrr-evaluator-fa7c266cc91201a7.yaml
new file mode 100644
index 000000000..7e56e9489
--- /dev/null
+++ b/releasenotes/notes/document-mrr-evaluator-fa7c266cc91201a7.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add `DocumentMeanReciprocalRank`, which can be used to calculate the mean reciprocal rank of retrieved documents.
diff --git a/test/components/evaluators/test_document_mrr.py b/test/components/evaluators/test_document_mrr.py
new file mode 100644
index 000000000..959492c64
--- /dev/null
+++ b/test/components/evaluators/test_document_mrr.py
@@ -0,0 +1,82 @@
+import pytest
+
+from haystack import Document
+from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank
+
+
+def test_run_with_all_matching():
+    evaluator = DocumentMeanReciprocalRank()
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
+        retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
+    )
+
+    assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}
+
+
+def test_run_with_no_matching():
+    evaluator = DocumentMeanReciprocalRank()
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
+        retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
+    )
+
+    assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}
+
+
+def test_run_with_partial_matching():
+    evaluator = DocumentMeanReciprocalRank()
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
+        retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
+    )
+
+    assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}
+
+
+def test_run_with_complex_data():
+    evaluator = DocumentMeanReciprocalRank()
+    result = evaluator.run(
+        ground_truth_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="9th")],
+            [Document(content="classical music"), Document(content="classical")],
+            [Document(content="11th century"), Document(content="the 11th")],
+            [Document(content="Denmark, Iceland and Norway")],
+            [Document(content="10th century"), Document(content="10th")],
+        ],
+        retrieved_documents=[
+            [Document(content="France")],
+            [Document(content="10th century"), Document(content="9th century"), Document(content="9th")],
+            [Document(content="rock music"), Document(content="dubstep"), Document(content="classical")],
+            [Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],
+            [Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],
+            [
+                Document(content="10th century"),
+                Document(content="the first half of the 10th century"),
+                Document(content="10th"),
+                Document(content="10th"),
+            ],
+        ],
+    )
+
+    assert result == {
+        "individual_scores": [1.0, 0.5, 0.3333333333333333, 0.5, 0.0, 1.0],
+        "score": pytest.approx(0.555555555555555),
+    }
+
+
+def test_run_with_different_lengths():
+    with pytest.raises(ValueError):
+        evaluator = DocumentMeanReciprocalRank()
+        evaluator.run(
+            ground_truth_documents=[[Document(content="Berlin")]],
+            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
+        )
+
+    with pytest.raises(ValueError):
+        evaluator = DocumentMeanReciprocalRank()
+        evaluator.run(
+            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
+            retrieved_documents=[[Document(content="Berlin")]],
+        )
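As a quick sanity check of the reciprocal-rank arithmetic, here is a minimal sketch (not part of the diff) that exercises the new component; the query data is invented purely for illustration:

```python
from haystack import Document
from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank

evaluator = DocumentMeanReciprocalRank()

# Query 1: the matching document is retrieved at rank 1 -> reciprocal rank 1/1.
# Query 2: the matching document is retrieved at rank 3 -> reciprocal rank 1/3.
# Matching is substring containment of the ground-truth content in the retrieved content.
result = evaluator.run(
    ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
    retrieved_documents=[
        [Document(content="Berlin"), Document(content="Munich")],
        [Document(content="London"), Document(content="Rome"), Document(content="Paris")],
    ],
)

assert result["individual_scores"] == [1.0, 1 / 3]
assert result["score"] == (1.0 + 1 / 3) / 2  # mean over all queries, roughly 0.667
```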