# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import pytest

from haystack import Document
from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator
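
# The expected values below are hand-checked against the standard NDCG definition (an assumption made
# for these comments, not a statement about the component's internals): a relevant document retrieved
# at rank i (1-based) with relevance rel contributes rel / log2(i + 1) to the DCG, the IDCG is the DCG
# of the ground-truth documents sorted by descending relevance, and NDCG = DCG / IDCG. Ground-truth
# documents without a score are treated as having relevance 1.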


def test_run_with_scores():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [
                Document(content="doc1", score=3),
                Document(content="doc2", score=2),
                Document(content="doc3", score=3),
                Document(content="doc6", score=2),
                Document(content="doc7", score=3),
                Document(content="doc8", score=2),
            ]
        ],
        retrieved_documents=[
            [
                Document(content="doc1"),
                Document(content="doc2"),
                Document(content="doc3"),
                Document(content="doc4"),
                Document(content="doc5"),
            ]
        ],
    )
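    # Rough hand-check with the formula above: DCG = 3/log2(2) + 2/log2(3) + 3/log2(4) ≈ 5.7619 and
    # IDCG over the sorted ground-truth scores [3, 3, 3, 2, 2, 2] ≈ 8.7403, so NDCG ≈ 5.7619 / 8.7403 ≈ 0.6592.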
    assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4)
    assert result["score"] == pytest.approx(0.6592, abs=1e-4)


def test_run_without_scores():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="France"), Document(content="Paris")]],
        retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
    )
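    # Unscored documents are assumed to count as relevance 1: DCG = 1/log2(2) + 1/log2(4) = 1.5 and
    # IDCG = 1/log2(2) + 1/log2(3) ≈ 1.6309, so NDCG ≈ 1.5 / 1.6309 ≈ 0.9197.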
    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
    assert result["score"] == pytest.approx(0.9197, abs=1e-4)


def test_run_with_multiple_lists_of_docs():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France"), Document(content="Paris")],
            [
                Document(content="doc1", score=3),
                Document(content="doc2", score=2),
                Document(content="doc3", score=3),
                Document(content="doc6", score=2),
                Document(content="doc7", score=3),
                Document(content="doc8", score=2),
            ],
        ],
        retrieved_documents=[
            [Document(content="France"), Document(content="Germany"), Document(content="Paris")],
            [
                Document(content="doc1"),
                Document(content="doc2"),
                Document(content="doc3"),
                Document(content="doc4"),
                Document(content="doc5"),
            ],
        ],
    )
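    # The overall score is consistent with the mean of the per-list NDCG values: (0.9197 + 0.6592) / 2 ≈ 0.7895.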
    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
    assert result["individual_scores"][1] == pytest.approx(0.6592, abs=1e-4)
    assert result["score"] == pytest.approx(0.7895, abs=1e-4)


def test_run_with_different_lengths():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )
    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")]],
        )


def test_run_with_mixed_documents_with_and_without_scores():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="France", score=3), Document(content="Paris")]],
            retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
        )


def test_run_empty_retrieved():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[]])
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_empty_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[Document(content="France")]])
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_empty_retrieved_and_empty_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[]])
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_no_retrieved():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[])


def test_run_no_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(ground_truth_documents=[], retrieved_documents=[[Document(content="France")]])


def test_run_no_retrieved_and_no_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(ground_truth_documents=[], retrieved_documents=[])


def test_calculate_dcg_with_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [
        Document(content="doc1", score=3),
        Document(content="doc2", score=2),
        Document(content="doc3", score=3),
        Document(content="doc4", score=0),
        Document(content="doc5", score=1),
        Document(content="doc6", score=2),
    ]
    ret_docs = [
        Document(content="doc1"),
        Document(content="doc2"),
        Document(content="doc3"),
        Document(content="doc4"),
        Document(content="doc5"),
        Document(content="doc6"),
    ]
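    # Hand-check: 3/log2(2) + 2/log2(3) + 3/log2(4) + 0/log2(5) + 1/log2(6) + 2/log2(7) ≈ 6.8611.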
    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
    assert dcg == pytest.approx(6.8611, abs=1e-4)


def test_calculate_dcg_without_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [Document(content="doc1"), Document(content="doc2")]
    ret_docs = [Document(content="doc2"), Document(content="doc3"), Document(content="doc1")]
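    # doc2 (rank 1) and doc1 (rank 3) are relevant with default relevance 1: 1/log2(2) + 1/log2(4) = 1.5.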
    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
    assert dcg == pytest.approx(1.5, abs=1e-4)


def test_calculate_dcg_empty():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [Document(content="doc1")]
    ret_docs = []
    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
    assert dcg == 0


def test_calculate_idcg_with_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [
        Document(content="doc1", score=3),
        Document(content="doc2", score=3),
        Document(content="doc3", score=2),
        Document(content="doc4", score=3),
        Document(content="doc5", score=2),
        Document(content="doc6", score=2),
    ]
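    # Ideal ordering sorts by descending score, [3, 3, 3, 2, 2, 2]:
    # 3/log2(2) + 3/log2(3) + 3/log2(4) + 2/log2(5) + 2/log2(6) + 2/log2(7) ≈ 8.7403.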
    idcg = evaluator.calculate_idcg(gt_docs)
    assert idcg == pytest.approx(8.7403, abs=1e-4)


def test_calculate_idcg_without_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [Document(content="doc1"), Document(content="doc2"), Document(content="doc3")]
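    # With default relevance 1 for unscored documents: 1/log2(2) + 1/log2(3) + 1/log2(4) ≈ 2.1309.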
    idcg = evaluator.calculate_idcg(gt_docs)
    assert idcg == pytest.approx(2.1309, abs=1e-4)


def test_calculate_idcg_empty():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = []
    idcg = evaluator.calculate_idcg(gt_docs)
    assert idcg == 0