mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-24 17:30:38 +00:00

* Add DocumentRecallEvaluator
* Fix mypy error
* Simplify recall logic and change output for single hit mode
* Remove unused import
* Add comment for RecallMode fields
* Reword RecallMode comments

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
193 lines
9.4 KiB
Python
193 lines
9.4 KiB
Python
import pytest
|
|
|
|
from haystack.components.evaluators.document_recall import DocumentRecallEvaluator, RecallMode
|
|
from haystack.dataclasses import Document
|
|
|
|
|
|
def test_init_with_unknown_mode_string():
    """Constructing the evaluator with an unrecognised mode string must raise ``ValueError``."""
    unsupported_mode = "unknown_mode"

    with pytest.raises(ValueError):
        DocumentRecallEvaluator(mode=unsupported_mode)
|
|
|
|
|
|
class TestDocumentRecallEvaluatorSingleHit:
    """Tests for ``DocumentRecallEvaluator`` created with ``RecallMode.SINGLE_HIT``.

    The expectations in this class give a question 1.0 when at least one of its
    ground-truth documents shows up among the retrieved documents and 0.0 when
    none does; the expected overall ``score`` is the mean of those values.
    """

    @pytest.fixture
    def evaluator(self):
        """Return an evaluator configured for single-hit mode."""
        return DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT)

    def test_run_with_all_matching(self, evaluator):
        """Every question retrieves its ground-truth document, so all scores are 1.0."""
        result = evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        )

        assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

    def test_run_with_no_matching(self, evaluator):
        """No question retrieves its ground-truth document, so all scores are 0.0."""
        result = evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
        )

        assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

    def test_run_with_partial_matching(self, evaluator):
        """Only the first question retrieves its ground truth, so the overall score is 0.5."""
        result = evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )

        assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

    def test_run_with_complex_data(self, evaluator):
        """Six questions with several ground-truth and retrieved documents each.

        The fifth question is the only expected miss: its single ground truth is the
        combined string "Denmark, Iceland and Norway", while the retrieved documents
        list the three countries separately.
        """
        result = evaluator.run(
            questions=[
                "In what country is Normandy located?",
                "When was the Latin version of the word Norman first recorded?",
                "What developed in Normandy during the 1100s?",
                "In what century did important classical music developments occur in Normandy?",
                "From which countries did the Norse originate?",
                "What century did the Normans first gain their separate identity?",
            ],
            ground_truth_documents=[
                [Document(content="France")],
                [Document(content="9th century"), Document(content="9th")],
                [Document(content="classical music"), Document(content="classical")],
                [Document(content="11th century"), Document(content="the 11th")],
                [Document(content="Denmark, Iceland and Norway")],
                [Document(content="10th century"), Document(content="10th")],
            ],
            retrieved_documents=[
                [Document(content="France")],
                [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
                [Document(content="classical"), Document(content="rock music"), Document(content="dubstep")],
                [Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],
                [Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],
                [
                    Document(content="10th century"),
                    Document(content="the first half of the 10th century"),
                    Document(content="10th"),
                    Document(content="10th"),
                ],
            ],
        )

        # Written as floats to match the other expectations in this class. Booleans
        # would compare equal as well (True == 1.0), which made the previous
        # True/False spelling pass while reading as a different output format.
        assert result == {"individual_scores": [1.0, 1.0, 1.0, 1.0, 0.0, 1.0], "score": 0.8333333333333334}

    def test_run_with_different_lengths(self, evaluator):
        """``run`` must raise ``ValueError`` when the three input lists differ in length."""
        # Fewer questions than document lists.
        with pytest.raises(ValueError):
            evaluator.run(
                questions=["What is the capital of Germany?"],
                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
            )

        # Fewer ground-truth lists than questions and retrieved lists.
        with pytest.raises(ValueError):
            evaluator.run(
                questions=["What is the capital of Germany?", "What is the capital of France?"],
                ground_truth_documents=[[Document(content="Berlin")]],
                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
            )

        # Fewer retrieved lists than questions and ground-truth lists.
        with pytest.raises(ValueError):
            evaluator.run(
                questions=["What is the capital of Germany?", "What is the capital of France?"],
                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                retrieved_documents=[[Document(content="Berlin")]],
            )
|
|
|
|
|
|
class TestDocumentRecallEvaluatorMultiHit:
    """Exercise ``DocumentRecallEvaluator`` created with ``RecallMode.MULTI_HIT``.

    The expected per-question values here are fractions: the share of a
    question's ground-truth documents that appear among its retrieved documents.
    The expected overall ``score`` is the mean of those fractions.
    """

    @staticmethod
    def _docs(*contents):
        """Wrap each given string in its own freshly created ``Document``."""
        return [Document(content=text) for text in contents]

    @pytest.fixture
    def evaluator(self):
        """Build the multi-hit evaluator used by every test in this class."""
        multi_hit_evaluator = DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT)
        return multi_hit_evaluator

    def test_run_with_all_matching(self, evaluator):
        """Both questions retrieve exactly their ground truth: expect 1.0 everywhere."""
        capital_questions = ["What is the capital of Germany?", "What is the capital of France?"]
        truths = [self._docs("Berlin"), self._docs("Paris")]
        hits = [self._docs("Berlin"), self._docs("Paris")]

        outcome = evaluator.run(
            questions=capital_questions, ground_truth_documents=truths, retrieved_documents=hits
        )

        expected = {"individual_scores": [1.0, 1.0], "score": 1.0}
        assert outcome == expected

    def test_run_with_no_matching(self, evaluator):
        """Neither question retrieves its ground truth: expect 0.0 everywhere."""
        capital_questions = ["What is the capital of Germany?", "What is the capital of France?"]
        truths = [self._docs("Berlin"), self._docs("Paris")]
        hits = [self._docs("Paris"), self._docs("London")]

        outcome = evaluator.run(
            questions=capital_questions, ground_truth_documents=truths, retrieved_documents=hits
        )

        expected = {"individual_scores": [0.0, 0.0], "score": 0.0}
        assert outcome == expected

    def test_run_with_partial_matching(self, evaluator):
        """Only the first question retrieves its ground truth: expect [1.0, 0.0] and 0.5."""
        capital_questions = ["What is the capital of Germany?", "What is the capital of France?"]
        truths = [self._docs("Berlin"), self._docs("Paris")]
        hits = [self._docs("Berlin"), self._docs("London")]

        outcome = evaluator.run(
            questions=capital_questions, ground_truth_documents=truths, retrieved_documents=hits
        )

        expected = {"individual_scores": [1.0, 0.0], "score": 0.5}
        assert outcome == expected

    def test_run_with_complex_data(self, evaluator):
        """Six questions with mixed coverage of their ground-truth documents.

        Question three retrieves one of its two ground truths (0.5) and question
        five retrieves three of its four (0.75); the remaining questions retrieve
        all of theirs.
        """
        normandy_questions = [
            "In what country is Normandy located?",
            "When was the Latin version of the word Norman first recorded?",
            "What developed in Normandy during the 1100s?",
            "In what century did important classical music developments occur in Normandy?",
            "From which countries did the Norse originate?",
            "What century did the Normans first gain their separate identity?",
        ]
        truths = [
            self._docs("France"),
            self._docs("9th century", "9th"),
            self._docs("classical music", "classical"),
            self._docs("11th century", "the 11th"),
            self._docs("Denmark", "Iceland", "Norway", "Denmark, Iceland and Norway"),
            self._docs("10th century", "10th"),
        ]
        hits = [
            self._docs("France"),
            self._docs("9th century", "10th century", "9th"),
            self._docs("classical", "rock music", "dubstep"),
            self._docs("11th", "the 11th", "11th century"),
            self._docs("Denmark", "Norway", "Iceland"),
            self._docs("10th century", "the first half of the 10th century", "10th", "10th"),
        ]

        outcome = evaluator.run(
            questions=normandy_questions, ground_truth_documents=truths, retrieved_documents=hits
        )

        expected = {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}
        assert outcome == expected

    def test_run_with_different_lengths(self, evaluator):
        """``run`` must raise ``ValueError`` whenever one input list is shorter than the others."""
        mismatched_inputs = (
            # Only one question for two entries in each document list.
            {
                "questions": ["What is the capital of Germany?"],
                "ground_truth_documents": [self._docs("Berlin"), self._docs("Paris")],
                "retrieved_documents": [self._docs("Berlin"), self._docs("London")],
            },
            # Only one ground-truth entry for two questions.
            {
                "questions": ["What is the capital of Germany?", "What is the capital of France?"],
                "ground_truth_documents": [self._docs("Berlin")],
                "retrieved_documents": [self._docs("Berlin"), self._docs("London")],
            },
            # Only one retrieved entry for two questions.
            {
                "questions": ["What is the capital of Germany?", "What is the capital of France?"],
                "ground_truth_documents": [self._docs("Berlin"), self._docs("Paris")],
                "retrieved_documents": [self._docs("Berlin")],
            },
        )

        for call_kwargs in mismatched_inputs:
            with pytest.raises(ValueError):
                evaluator.run(**call_kwargs)
|