2024-05-09 15:40:36 +02:00
|
|
|
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
|
|
#
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2024-04-25 12:16:48 +02:00
|
|
|
from haystack.evaluation import EvaluationRunResult
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
|
|
def test_init_results_evaluator():
|
|
|
|
data = {
|
|
|
|
"inputs": {
|
|
|
|
"query_id": ["53c3b3e6", "225f87f7"],
|
|
|
|
"question": ["What is the capital of France?", "What is the capital of Spain?"],
|
|
|
|
"contexts": ["wiki_France", "wiki_Spain"],
|
|
|
|
"answer": ["Paris", "Madrid"],
|
|
|
|
"predicted_answer": ["Paris", "Madrid"],
|
|
|
|
},
|
|
|
|
"metrics": {
|
|
|
|
"reciprocal_rank": {"individual_scores": [0.378064, 0.534964], "score": 0.476932},
|
|
|
|
"single_hit": {"individual_scores": [1, 1], "score": 0.75},
|
|
|
|
"multi_hit": {"individual_scores": [0.706125, 0.454976], "score": 0.46428375},
|
|
|
|
"context_relevance": {"individual_scores": [0.805466, 0.410251], "score": 0.58177975},
|
|
|
|
"faithfulness": {"individual_scores": [0.135581, 0.695974], "score": 0.40585375},
|
|
|
|
"semantic_answer_similarity": {"individual_scores": [0.971241, 0.159320], "score": 0.53757075},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
_ = EvaluationRunResult("testing_pipeline_1", inputs=data["inputs"], results=data["metrics"])
|
|
|
|
|
|
|
|
with pytest.raises(ValueError, match="No inputs provided"):
|
|
|
|
_ = EvaluationRunResult("testing_pipeline_1", inputs={}, results={})
|
|
|
|
|
|
|
|
with pytest.raises(ValueError, match="Lengths of the inputs should be the same"):
|
|
|
|
_ = EvaluationRunResult(
|
|
|
|
"testing_pipeline_1",
|
|
|
|
inputs={"query_id": ["53c3b3e6", "something else"], "question": ["What is the capital of France?"]},
|
|
|
|
results={"some": {"score": 0.1, "individual_scores": [0.378064, 0.534964]}},
|
|
|
|
)
|
|
|
|
|
|
|
|
with pytest.raises(ValueError, match="Aggregate score missing"):
|
|
|
|
_ = EvaluationRunResult(
|
|
|
|
"testing_pipeline_1",
|
|
|
|
inputs={
|
|
|
|
"query_id": ["53c3b3e6", "something else"],
|
|
|
|
"question": ["What is the capital of France?", "another"],
|
|
|
|
},
|
|
|
|
results={"some": {"individual_scores": [0.378064, 0.534964]}},
|
|
|
|
)
|
|
|
|
|
|
|
|
with pytest.raises(ValueError, match="Individual scores missing"):
|
|
|
|
_ = EvaluationRunResult(
|
|
|
|
"testing_pipeline_1",
|
|
|
|
inputs={
|
|
|
|
"query_id": ["53c3b3e6", "something else"],
|
|
|
|
"question": ["What is the capital of France?", "another"],
|
|
|
|
},
|
|
|
|
results={"some": {"score": 0.378064}},
|
|
|
|
)
|
|
|
|
|
|
|
|
with pytest.raises(ValueError, match="Length of individual scores .* should be the same as the inputs"):
|
|
|
|
_ = EvaluationRunResult(
|
|
|
|
"testing_pipeline_1",
|
|
|
|
inputs={
|
|
|
|
"query_id": ["53c3b3e6", "something else"],
|
|
|
|
"question": ["What is the capital of France?", "another"],
|
|
|
|
},
|
|
|
|
results={"some": {"score": 0.1, "individual_scores": [0.378064, 0.534964, 0.3]}},
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def test_score_report():
|
|
|
|
data = {
|
|
|
|
"inputs": {
|
|
|
|
"query_id": ["53c3b3e6", "225f87f7"],
|
|
|
|
"question": ["What is the capital of France?", "What is the capital of Spain?"],
|
|
|
|
"contexts": ["wiki_France", "wiki_Spain"],
|
|
|
|
"answer": ["Paris", "Madrid"],
|
|
|
|
"predicted_answer": ["Paris", "Madrid"],
|
|
|
|
},
|
|
|
|
"metrics": {
|
|
|
|
"reciprocal_rank": {"individual_scores": [0.378064, 0.534964], "score": 0.476932},
|
|
|
|
"single_hit": {"individual_scores": [1, 1], "score": 0.75},
|
|
|
|
"multi_hit": {"individual_scores": [0.706125, 0.454976], "score": 0.46428375},
|
|
|
|
"context_relevance": {"individual_scores": [0.805466, 0.410251], "score": 0.58177975},
|
|
|
|
"faithfulness": {"individual_scores": [0.135581, 0.695974], "score": 0.40585375},
|
|
|
|
"semantic_answer_similarity": {"individual_scores": [0.971241, 0.159320], "score": 0.53757075},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
result = EvaluationRunResult("testing_pipeline_1", inputs=data["inputs"], results=data["metrics"])
|
|
|
|
report = result.score_report().to_json()
|
2024-06-06 14:33:45 +02:00
|
|
|
|
2024-04-25 12:16:48 +02:00
|
|
|
assert report == (
|
2024-06-06 14:33:45 +02:00
|
|
|
'{"metrics":{"0":"reciprocal_rank","1":"single_hit","2":"multi_hit","3":"context_relevance",'
|
|
|
|
'"4":"faithfulness","5":"semantic_answer_similarity"},'
|
|
|
|
'"score":{"0":0.476932,"1":0.75,"2":0.46428375,"3":0.58177975,"4":0.40585375,"5":0.53757075}}'
|
2024-04-25 12:16:48 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def test_to_pandas():
|
|
|
|
data = {
|
|
|
|
"inputs": {
|
|
|
|
"query_id": ["53c3b3e6", "225f87f7", "53c3b3e6", "225f87f7"],
|
|
|
|
"question": [
|
|
|
|
"What is the capital of France?",
|
|
|
|
"What is the capital of Spain?",
|
|
|
|
"What is the capital of Luxembourg?",
|
|
|
|
"What is the capital of Portugal?",
|
|
|
|
],
|
|
|
|
"contexts": ["wiki_France", "wiki_Spain", "wiki_Luxembourg", "wiki_Portugal"],
|
|
|
|
"answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
|
|
|
|
"predicted_answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
|
|
|
|
},
|
|
|
|
"metrics": {
|
|
|
|
"reciprocal_rank": {"score": 0.1, "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
|
|
|
|
"single_hit": {"score": 0.1, "individual_scores": [1, 1, 0, 1]},
|
|
|
|
"multi_hit": {"score": 0.1, "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
|
|
|
|
"context_relevance": {"score": 0.1, "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
|
|
|
|
"faithfulness": {"score": 0.1, "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
|
|
|
|
"semantic_answer_similarity": {"score": 0.1, "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
result = EvaluationRunResult("testing_pipeline_1", inputs=data["inputs"], results=data["metrics"])
|
|
|
|
assert result.to_pandas().to_json() == (
|
|
|
|
'{"query_id":{"0":"53c3b3e6","1":"225f87f7","2":"53c3b3e6","3":"225f87f7"},'
|
|
|
|
'"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?",'
|
|
|
|
'"2":"What is the capital of Luxembourg?","3":"What is the capital of Portugal?"},'
|
|
|
|
'"contexts":{"0":"wiki_France","1":"wiki_Spain","2":"wiki_Luxembourg","3":"wiki_Portugal"},'
|
|
|
|
'"answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
|
|
|
|
'"predicted_answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
|
|
|
|
'"reciprocal_rank":{"0":0.378064,"1":0.534964,"2":0.216058,"3":0.778642},'
|
|
|
|
'"single_hit":{"0":1,"1":1,"2":0,"3":1},'
|
|
|
|
'"multi_hit":{"0":0.706125,"1":0.454976,"2":0.445512,"3":0.250522},'
|
|
|
|
'"context_relevance":{"0":0.805466,"1":0.410251,"2":0.75007,"3":0.361332},'
|
|
|
|
'"faithfulness":{"0":0.135581,"1":0.695974,"2":0.749861,"3":0.041999},'
|
|
|
|
'"semantic_answer_similarity":{"0":0.971241,"1":0.15932,"2":0.019722,"3":1.0}}'
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def test_comparative_individual_scores_report():
|
|
|
|
data_1 = {
|
|
|
|
"inputs": {
|
|
|
|
"query_id": ["53c3b3e6", "225f87f7"],
|
|
|
|
"question": ["What is the capital of France?", "What is the capital of Spain?"],
|
|
|
|
"contexts": ["wiki_France", "wiki_Spain"],
|
|
|
|
"answer": ["Paris", "Madrid"],
|
|
|
|
"predicted_answer": ["Paris", "Madrid"],
|
|
|
|
},
|
|
|
|
"metrics": {
|
|
|
|
"reciprocal_rank": {"individual_scores": [0.378064, 0.534964], "score": 0.476932},
|
|
|
|
"single_hit": {"individual_scores": [1, 1], "score": 0.75},
|
|
|
|
"multi_hit": {"individual_scores": [0.706125, 0.454976], "score": 0.46428375},
|
|
|
|
"context_relevance": {"individual_scores": [0.805466, 0.410251], "score": 0.58177975},
|
|
|
|
"faithfulness": {"individual_scores": [0.135581, 0.695974], "score": 0.40585375},
|
|
|
|
"semantic_answer_similarity": {"individual_scores": [0.971241, 0.159320], "score": 0.53757075},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
data_2 = {
|
|
|
|
"inputs": {
|
|
|
|
"query_id": ["53c3b3e6", "225f87f7"],
|
|
|
|
"question": ["What is the capital of France?", "What is the capital of Spain?"],
|
|
|
|
"contexts": ["wiki_France", "wiki_Spain"],
|
|
|
|
"answer": ["Paris", "Madrid"],
|
|
|
|
"predicted_answer": ["Paris", "Madrid"],
|
|
|
|
},
|
|
|
|
"metrics": {
|
|
|
|
"reciprocal_rank": {"individual_scores": [0.378064, 0.534964], "score": 0.476932},
|
|
|
|
"single_hit": {"individual_scores": [1, 1], "score": 0.75},
|
|
|
|
"multi_hit": {"individual_scores": [0.706125, 0.454976], "score": 0.46428375},
|
|
|
|
"context_relevance": {"individual_scores": [0.805466, 0.410251], "score": 0.58177975},
|
|
|
|
"faithfulness": {"individual_scores": [0.135581, 0.695974], "score": 0.40585375},
|
|
|
|
"semantic_answer_similarity": {"individual_scores": [0.971241, 0.159320], "score": 0.53757075},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
result1 = EvaluationRunResult("testing_pipeline_1", inputs=data_1["inputs"], results=data_1["metrics"])
|
|
|
|
result2 = EvaluationRunResult("testing_pipeline_2", inputs=data_2["inputs"], results=data_2["metrics"])
|
|
|
|
results = result1.comparative_individual_scores_report(result2)
|
|
|
|
|
|
|
|
assert results.to_json() == (
|
|
|
|
'{"query_id":{"0":"53c3b3e6","1":"225f87f7"},'
|
|
|
|
'"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?"},'
|
|
|
|
'"contexts":{"0":"wiki_France","1":"wiki_Spain"},"answer":{"0":"Paris","1":"Madrid"},'
|
|
|
|
'"predicted_answer":{"0":"Paris","1":"Madrid"},'
|
|
|
|
'"testing_pipeline_1_reciprocal_rank":{"0":0.378064,"1":0.534964},'
|
|
|
|
'"testing_pipeline_1_single_hit":{"0":1,"1":1},'
|
|
|
|
'"testing_pipeline_1_multi_hit":{"0":0.706125,"1":0.454976},'
|
|
|
|
'"testing_pipeline_1_context_relevance":{"0":0.805466,"1":0.410251},'
|
|
|
|
'"testing_pipeline_1_faithfulness":{"0":0.135581,"1":0.695974},'
|
|
|
|
'"testing_pipeline_1_semantic_answer_similarity":{"0":0.971241,"1":0.15932},'
|
|
|
|
'"testing_pipeline_2_reciprocal_rank":{"0":0.378064,"1":0.534964},'
|
|
|
|
'"testing_pipeline_2_single_hit":{"0":1,"1":1},'
|
|
|
|
'"testing_pipeline_2_multi_hit":{"0":0.706125,"1":0.454976},'
|
|
|
|
'"testing_pipeline_2_context_relevance":{"0":0.805466,"1":0.410251},'
|
|
|
|
'"testing_pipeline_2_faithfulness":{"0":0.135581,"1":0.695974},'
|
|
|
|
'"testing_pipeline_2_semantic_answer_similarity":{"0":0.971241,"1":0.15932}}'
|
|
|
|
)
|