feat: Add DocumentMeanReciprocalRank (#7468)

* Add DocumentMeanReciprocalRank

* Fix float precision error
Silvano Cerza 2024-04-04 14:55:37 +02:00 committed by GitHub
parent 7799909069
commit bdc25ca2a0
3 changed files with 165 additions and 0 deletions

@@ -0,0 +1,79 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanReciprocalRank:
    """
    Evaluator that calculates the mean reciprocal rank (MRR) of the retrieved documents.

    MRR measures how highly the first relevant document is ranked in the retrieved results:
    each question scores the reciprocal of the rank of the first retrieved document that matches
    a ground truth document, and the final score is the mean over all questions.
    Each question can have multiple ground truth documents and multiple retrieved documents.

    `DocumentMeanReciprocalRank` doesn't normalize its inputs; use the `DocumentCleaner` component
    to clean and normalize the documents before passing them to this evaluator.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank

    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
        ],
    )
    print(result["individual_scores"])
    # [1.0, 1.0]
    print(result["score"])
    # 1.0
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentMeanReciprocalRank evaluator on the given inputs.

        `ground_truth_documents` and `retrieved_documents` must have the same length.

        :param ground_truth_documents:
            A list of expected documents for each question.
        :param retrieved_documents:
            A list of retrieved documents for each question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of the individual scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0, one per question, each representing
              the reciprocal rank of the first relevant retrieved document (0.0 if none matches).
        """
        if len(ground_truth_documents) != len(retrieved_documents):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        individual_scores = []
        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
            score = 0.0
            for ground_document in ground_truth:
                if ground_document.content is None:
                    continue
                for rank, retrieved_document in enumerate(retrieved):
                    if retrieved_document.content is None:
                        continue
                    if ground_document.content in retrieved_document.content:
                        # Keep the best reciprocal rank across all ground truth documents, so the score
                        # reflects the 1-based rank of the first relevant retrieved document.
                        score = max(score, 1 / (rank + 1))
                        break
            individual_scores.append(score)

        score = sum(individual_scores) / len(retrieved_documents)
        return {"score": score, "individual_scores": individual_scores}
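
For intuition, here is a minimal standalone sketch of the scoring rule the evaluator applies: each question scores `1 / (rank + 1)` for the first retrieved document whose content contains a ground truth document's content, and the final score is the mean over questions. The `reciprocal_rank` helper below is hypothetical, written only to illustrate the arithmetic; it is not part of this commit.

```python
# Illustrative only: plain-Python version of the containment-based reciprocal rank.
def reciprocal_rank(ground_truth: list, retrieved: list) -> float:
    for rank, doc in enumerate(retrieved):
        if any(gt in doc for gt in ground_truth):
            return 1 / (rank + 1)  # first relevant document at 1-based position rank + 1
    return 0.0

ranks = [
    reciprocal_rank(["France"], ["France"]),  # hit at position 1 -> 1.0
    reciprocal_rank(["9th century", "9th"], ["10th century", "9th century", "9th"]),  # first hit at position 2 -> 0.5
]
print(sum(ranks) / len(ranks))  # mean reciprocal rank: 0.75
```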

@@ -0,0 +1,4 @@
---
features:
  - |
    Add `DocumentMeanReciprocalRank`, an evaluator that calculates the mean reciprocal rank (MRR) of retrieved documents.
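
For context, here is a minimal sketch of how the new evaluator could be wired into a pipeline, assuming the standard Haystack 2.x `Pipeline` API; the component name `mrr` and the example documents are made up for illustration and are not part of this commit.

```python
from haystack import Document, Pipeline
from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank

# Hypothetical wiring: evaluate retrieval results collected elsewhere.
pipeline = Pipeline()
pipeline.add_component("mrr", DocumentMeanReciprocalRank())

results = pipeline.run(
    {
        "mrr": {
            "ground_truth_documents": [[Document(content="Berlin")]],
            "retrieved_documents": [[Document(content="Munich"), Document(content="Berlin")]],
        }
    }
)
print(results["mrr"]["score"])  # 0.5: first relevant document is at rank 2
```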

@@ -0,0 +1,82 @@
import pytest

from haystack import Document
from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank


def test_run_with_all_matching():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
    )

    assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}


def test_run_with_no_matching():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
    )

    assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}


def test_run_with_partial_matching():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
    )

    assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}


def test_run_with_complex_data():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
            [Document(content="classical music"), Document(content="classical")],
            [Document(content="11th century"), Document(content="the 11th")],
            [Document(content="Denmark, Iceland and Norway")],
            [Document(content="10th century"), Document(content="10th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="10th century"), Document(content="9th century"), Document(content="9th")],
            [Document(content="rock music"), Document(content="dubstep"), Document(content="classical")],
            [Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],
            [Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],
            [
                Document(content="10th century"),
                Document(content="the first half of the 10th century"),
                Document(content="10th"),
                Document(content="10th"),
            ],
        ],
    )

    assert result == {
        "individual_scores": [1.0, 0.5, 0.3333333333333333, 0.5, 0.0, 1.0],
        "score": pytest.approx(0.555555555555555),
    }


def test_run_with_different_lengths():
    evaluator = DocumentMeanReciprocalRank()

    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")]],
        )
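
A note on the `pytest.approx` call in `test_run_with_complex_data` (and the "Fix float precision error" entry in the commit message): the mean of the six reciprocal ranks is a repeating decimal, so an exact `==` comparison against a hand-typed literal is brittle, and `pytest.approx` tolerates the floating-point rounding. A minimal sketch of the same comparison, using the values from that test:

```python
import pytest

# The six individual reciprocal ranks from test_run_with_complex_data.
scores = [1.0, 0.5, 1 / 3, 0.5, 0.0, 1.0]
mean = sum(scores) / len(scores)

print(mean)  # 0.5555555555555556 (floating-point rounding of the repeating decimal)
assert mean == pytest.approx(0.555555555555555)  # tolerant comparison instead of exact equality
```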