feat: Add DocumentMeanReciprocalRank (#7468)

* Add DocumentMeanReciprocalRank
* Fix float precision error

parent 7799909069
commit bdc25ca2a0
79  haystack/components/evaluators/document_mrr.py  Normal file
@@ -0,0 +1,79 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanReciprocalRank:
    """
    Evaluator that calculates the mean reciprocal rank (MRR) of the retrieved documents.

    MRR measures how high the first relevant document is ranked in the retrieved results.
    Each question can have multiple ground truth documents and multiple retrieved documents.

    `DocumentMeanReciprocalRank` doesn't normalize its inputs; the `DocumentCleaner` component
    should be used to clean and normalize the documents before passing them to this evaluator.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators import DocumentMeanReciprocalRank

    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
        ],
    )
    print(result["individual_scores"])
    # [1.0, 1.0]
    print(result["score"])
    # 1.0
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentMeanReciprocalRank on the given inputs.

        `ground_truth_documents` and `retrieved_documents` must have the same length.

        :param ground_truth_documents:
            A list of expected documents for each question.
        :param retrieved_documents:
            A list of retrieved documents for each question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of the calculated scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0, one per question, representing
              how high the first relevant document is ranked.
        """
        if len(ground_truth_documents) != len(retrieved_documents):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        individual_scores = []

        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
            score = 0.0
            for ground_document in ground_truth:
                if ground_document.content is None:
                    continue

                for rank, retrieved_document in enumerate(retrieved):
                    if retrieved_document.content is None:
                        continue

                    if ground_document.content in retrieved_document.content:
                        # Keep the best reciprocal rank seen so far; a plain assignment would
                        # let a later, lower-ranked match overwrite an earlier, better one.
                        score = max(score, 1 / (rank + 1))
                        break

            individual_scores.append(score)

        score = sum(individual_scores) / len(retrieved_documents)

        return {"score": score, "individual_scores": individual_scores}
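As a quick sanity check on the loop above, the same reciprocal-rank arithmetic can be reproduced standalone. This is a minimal sketch; the helper `reciprocal_rank` is illustrative and not part of this commit, and it works on plain strings rather than `Document` objects:

```python
def reciprocal_rank(ground_truth: list[str], retrieved: list[str]) -> float:
    """Reciprocal rank of the best-ranked retrieved text that contains any ground truth text."""
    score = 0.0
    for expected in ground_truth:
        for rank, candidate in enumerate(retrieved):
            if expected in candidate:
                # rank is 0-indexed, so a match at rank 1 scores 1 / 2
                score = max(score, 1 / (rank + 1))
                break
    return score


print(reciprocal_rank(["9th century", "9th"], ["10th century", "9th century", "9th"]))
# 0.5, because the earliest containing match sits at 0-indexed rank 1
```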
@@ -0,0 +1,4 @@
---
features:
  - |
    Add DocumentMeanReciprocalRank, which can be used to calculate the mean reciprocal rank of retrieved documents.
82  test/components/evaluators/test_document_mrr.py  Normal file
@@ -0,0 +1,82 @@
import pytest

from haystack import Document
from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank


def test_run_with_all_matching():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
    )

    assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}


def test_run_with_no_matching():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
    )

    assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}


def test_run_with_partial_matching():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
    )

    assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}


def test_run_with_complex_data():
    evaluator = DocumentMeanReciprocalRank()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
            [Document(content="classical music"), Document(content="classical")],
            [Document(content="11th century"), Document(content="the 11th")],
            [Document(content="Denmark, Iceland and Norway")],
            [Document(content="10th century"), Document(content="10th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="10th century"), Document(content="9th century"), Document(content="9th")],
            [Document(content="rock music"), Document(content="dubstep"), Document(content="classical")],
            [Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],
            [Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],
            [
                Document(content="10th century"),
                Document(content="the first half of the 10th century"),
                Document(content="10th"),
                Document(content="10th"),
            ],
        ],
    )

    assert result == {
        "individual_scores": [1.0, 0.5, 0.3333333333333333, 0.5, 0.0, 1.0],
        "score": pytest.approx(0.555555555555555),
    }


def test_run_with_different_lengths():
    with pytest.raises(ValueError):
        evaluator = DocumentMeanReciprocalRank()
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )

    with pytest.raises(ValueError):
        evaluator = DocumentMeanReciprocalRank()
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")]],
        )
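Beyond the unit tests, here is a minimal end-to-end sketch of how the evaluator could sit behind a retriever. The `InMemoryBM25Retriever` wiring, the document contents, and the queries are illustrative assumptions; only `DocumentMeanReciprocalRank` comes from this commit:

```python
from haystack import Document
from haystack.components.evaluators.document_mrr import DocumentMeanReciprocalRank
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Index a few toy documents (contents are made up for this sketch).
store = InMemoryDocumentStore()
store.write_documents(
    [
        Document(content="Paris is the capital of France."),
        Document(content="Berlin is the capital of Germany."),
        Document(content="Rome is the capital of Italy."),
    ]
)
retriever = InMemoryBM25Retriever(document_store=store)

# Retrieve the top 2 documents for each query, then score the rankings.
queries = ["capital of France", "capital of Germany"]
ground_truth = [[Document(content="Paris")], [Document(content="Berlin")]]
retrieved = [retriever.run(query=query, top_k=2)["documents"] for query in queries]

evaluator = DocumentMeanReciprocalRank()
result = evaluator.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved)
print(result["score"], result["individual_scores"])
```

Note that the evaluator matches by substring containment, so the ground truth `Document(content="Paris")` counts a retrieved document whose content merely contains "Paris" as relevant.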