mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-12 15:27:06 +00:00
feat: implementing evaluation results API (#7520)
* initial import * adding tests * attending PR comments * fixing tests * updating tests * updating tests and code * renaming * fixing linting issues * adding release notes * adding docstrings * latest fixes
This commit is contained in:
parent
e974a23fa3
commit
9a9c8aa1c8
@ -2,6 +2,7 @@ from .answer_exact_match import AnswerExactMatchEvaluator
|
||||
from .document_map import DocumentMAPEvaluator
|
||||
from .document_mrr import DocumentMRREvaluator
|
||||
from .document_recall import DocumentRecallEvaluator
|
||||
from .evaluation_result import EvaluationResult
|
||||
from .faithfulness import FaithfulnessEvaluator
|
||||
from .llm_evaluator import LLMEvaluator
|
||||
from .sas_evaluator import SASEvaluator
|
||||
@ -11,6 +12,7 @@ __all__ = [
|
||||
"DocumentMAPEvaluator",
|
||||
"DocumentMRREvaluator",
|
||||
"DocumentRecallEvaluator",
|
||||
"EvaluationResult",
|
||||
"FaithfulnessEvaluator",
|
||||
"LLMEvaluator",
|
||||
"SASEvaluator",
|
||||
|
||||
98
haystack/components/evaluators/evaluation_result.py
Normal file
98
haystack/components/evaluators/evaluation_result.py
Normal file
@ -0,0 +1,98 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas import concat as pd_concat
|
||||
|
||||
|
||||
class EvaluationResult:
    """
    A class to store the results of an evaluation pipeline.

    Usage example:

        data = {
            "inputs": {
                "question": ["What is the capital of France?", "What is the capital of Spain?"],
                "contexts": ["wiki_France", "wiki_Spain"],
                "predicted_answer": ["Paris", "Madrid"],
            },
            "metrics": [
                {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964], "score": 0.456514},
                {"name": "context_relevance", "individual_scores": [0.805466, 0.410251], "score": 0.6078585},
            ],
        }

        eval_result = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)
        eval_result.to_pandas()
    """

    def __init__(self, pipeline_name: str, results: Dict[str, Any]):
        """
        Initialize the EvaluationResult object.

        :param pipeline_name: The name of the pipeline that generated the results.
        :param results: A dictionary containing the results of the evaluators used in the EvaluationPipeline.
            It should have the following keys:
                - inputs: A dictionary mapping each input name to a list with one value per query.
                - metrics: A list of dictionaries each containing the following keys:
                    'name': The name of the metric.
                    'score': The aggregated score for the metric (read by `score_report`).
                    'individual_scores': A list of scores for each query (read by `to_pandas`).
        """
        self.results = results
        self.pipeline_name = pipeline_name

    def score_report(self) -> DataFrame:
        """
        Transforms the results into a DataFrame with the aggregated scores for each metric.

        :returns:
            A DataFrame indexed by metric name with a single "score" column.
        """
        aggregated = {entry["name"]: entry["score"] for entry in self.results["metrics"]}
        return DataFrame.from_dict(aggregated, orient="index", columns=["score"])

    def to_pandas(self) -> DataFrame:
        """
        Creates a DataFrame containing the scores for each query and each metric.

        The input lists are zipped together row-wise, and so are the per-metric score lists, so each
        group is truncated to its shortest list. The scores are then left-joined onto the inputs by
        row position, which means the result has one row per input row.

        :returns:
            A DataFrame with one column per input followed by one column per metric.
        """
        inputs = self.results["inputs"]
        inputs_columns = list(inputs.keys())
        # zip(*columns) turns the column-oriented lists into one list per row
        inputs_rows = [list(row) for row in zip(*inputs.values())]
        df_inputs = DataFrame(inputs_rows, columns=inputs_columns)

        metrics = self.results["metrics"]
        scores_columns = [entry["name"] for entry in metrics]
        scores_rows = [list(row) for row in zip(*(entry["individual_scores"] for entry in metrics))]
        df_scores = DataFrame(scores_rows, columns=scores_columns)

        return df_inputs.join(df_scores)

    def comparative_individual_scores_report(self, other: "EvaluationResult") -> DataFrame:
        """
        Creates a DataFrame with the scores for each metric in the results of two different pipelines.

        The shared input columns ("query_id", "question", "contexts", "answer") appear once, taken from
        this object; every other column appears twice, prefixed with the respective pipeline name.

        :param other: The other EvaluationResult object to compare with.
        :returns:
            A DataFrame with the scores from both EvaluationResult objects.
        :raises ValueError:
            If the two results do not produce the same columns in the same order.
        """
        pipe_a_df = self.to_pandas()
        pipe_b_df = other.to_pandas()

        # check if the columns are the same, i.e.: the same queries and evaluation pipeline
        columns_a = list(pipe_a_df.columns)
        columns_b = list(pipe_b_df.columns)
        if columns_a != columns_b:
            raise ValueError(f"The two evaluation results do not have the same columns: {columns_a} != {columns_b}")

        # shared columns are kept once (from self) and are not prefixed with a pipeline name
        ignore = ["query_id", "question", "contexts", "answer"]
        pipe_b_df = pipe_b_df.drop(columns=ignore, errors="ignore")
        pipe_b_df.columns = [f"{other.pipeline_name}_{column}" for column in pipe_b_df.columns]
        pipe_a_df.columns = [f"{self.pipeline_name}_{col}" if col not in ignore else col for col in pipe_a_df.columns]

        return pd_concat([pipe_a_df, pipe_b_df], axis=1)
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Added a new EvaluationResult component.
|
||||
This is a wrapper for all the results coming from the Evaluators, presenting the metric scores as a DataFrame.
|
||||
0
test/components/evaluators/__init__.py
Normal file
0
test/components/evaluators/__init__.py
Normal file
170
test/components/evaluators/test_results_evaluator.py
Normal file
170
test/components/evaluators/test_results_evaluator.py
Normal file
@ -0,0 +1,170 @@
|
||||
from haystack.components.evaluators.evaluation_result import EvaluationResult
|
||||
|
||||
|
||||
def test_init_results_evaluator():
    """Constructing an EvaluationResult stores the pipeline name and the results dictionary."""
    # Fixture follows the schema the class reads: one "individual_scores" entry per query plus
    # the aggregated "score" for every metric.
    data = {
        "inputs": {
            "query_id": ["53c3b3e6", "225f87f7"],
            "question": ["What is the capital of France?", "What is the capital of Spain?"],
            "contexts": ["wiki_France", "wiki_Spain"],
            "answer": ["Paris", "Madrid"],
            "predicted_answer": ["Paris", "Madrid"],
        },
        "metrics": [
            {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964], "score": 0.456514},
            {"name": "single_hit", "individual_scores": [1, 1], "score": 1.0},
            {"name": "multi_hit", "individual_scores": [0.706125, 0.454976], "score": 0.5805505},
            {"name": "context_relevance", "individual_scores": [0.805466, 0.410251], "score": 0.6078585},
            {"name": "faithfulness", "individual_scores": [0.135581, 0.695974], "score": 0.4157775},
            {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320], "score": 0.5652805},
        ],
    }

    eval_result = EvaluationResult(pipeline_name="testing_pipeline_1", results=data)

    assert eval_result.pipeline_name == "testing_pipeline_1"
    assert eval_result.results == data
|
||||
|
||||
|
||||
def test_score_report():
    """score_report() exposes the aggregated score of every metric, keyed by metric name."""
    inputs = {
        "query_id": ["53c3b3e6", "225f87f7"],
        "question": ["What is the capital of France?", "What is the capital of Spain?"],
        "contexts": ["wiki_France", "wiki_Spain"],
        "answer": ["Paris", "Madrid"],
        "predicted_answer": ["Paris", "Madrid"],
    }
    metrics = [
        {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642], "score": 0.476932},
        {"name": "single_hit", "individual_scores": [1, 1, 0, 1], "score": 0.75},
        {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522], "score": 0.46428375},
        {
            "name": "context_relevance",
            "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332],
            "score": 0.58177975,
        },
        {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999], "score": 0.40585375},
        {
            "name": "semantic_answer_similarity",
            "individual_scores": [0.971241, 0.159320, 0.019722, 1],
            "score": 0.53757075,
        },
    ]
    eval_result = EvaluationResult(
        pipeline_name="testing_pipeline_1", results={"inputs": inputs, "metrics": metrics}
    )

    expected_json = (
        '{"score":{"reciprocal_rank":0.476932,"single_hit":0.75,"multi_hit":0.46428375,'
        '"context_relevance":0.58177975,"faithfulness":0.40585375,'
        '"semantic_answer_similarity":0.53757075}}'
    )

    report_json = eval_result.score_report().to_json()
    assert report_json == expected_json
|
||||
|
||||
|
||||
def test_to_pandas():
    """to_pandas() yields one row per query holding every input followed by every metric score."""
    inputs = {
        "query_id": ["53c3b3e6", "225f87f7", "53c3b3e6", "225f87f7"],
        "question": [
            "What is the capital of France?",
            "What is the capital of Spain?",
            "What is the capital of Luxembourg?",
            "What is the capital of Portugal?",
        ],
        "contexts": ["wiki_France", "wiki_Spain", "wiki_Luxembourg", "wiki_Portugal"],
        "answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
        "predicted_answer": ["Paris", "Madrid", "Luxembourg", "Lisbon"],
    }
    metrics = [
        {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964, 0.216058, 0.778642]},
        {"name": "single_hit", "individual_scores": [1, 1, 0, 1]},
        {"name": "multi_hit", "individual_scores": [0.706125, 0.454976, 0.445512, 0.250522]},
        {"name": "context_relevance", "individual_scores": [0.805466, 0.410251, 0.750070, 0.361332]},
        {"name": "faithfulness", "individual_scores": [0.135581, 0.695974, 0.749861, 0.041999]},
        {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320, 0.019722, 1]},
    ]
    eval_result = EvaluationResult(
        pipeline_name="testing_pipeline_1", results={"inputs": inputs, "metrics": metrics}
    )

    expected_json = (
        '{"query_id":{"0":"53c3b3e6","1":"225f87f7","2":"53c3b3e6","3":"225f87f7"},'
        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?",'
        '"2":"What is the capital of Luxembourg?","3":"What is the capital of Portugal?"},'
        '"contexts":{"0":"wiki_France","1":"wiki_Spain","2":"wiki_Luxembourg","3":"wiki_Portugal"},'
        '"answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
        '"predicted_answer":{"0":"Paris","1":"Madrid","2":"Luxembourg","3":"Lisbon"},'
        '"reciprocal_rank":{"0":0.378064,"1":0.534964,"2":0.216058,"3":0.778642},'
        '"single_hit":{"0":1,"1":1,"2":0,"3":1},'
        '"multi_hit":{"0":0.706125,"1":0.454976,"2":0.445512,"3":0.250522},'
        '"context_relevance":{"0":0.805466,"1":0.410251,"2":0.75007,"3":0.361332},'
        '"faithfulness":{"0":0.135581,"1":0.695974,"2":0.749861,"3":0.041999},'
        '"semantic_answer_similarity":{"0":0.971241,"1":0.15932,"2":0.019722,"3":1.0}}'
    )

    frame_json = eval_result.to_pandas().to_json()
    assert frame_json == expected_json
|
||||
|
||||
|
||||
def test_comparative_individual_scores_report():
    """Comparing two results keeps the shared inputs once and prefixes the rest with the pipeline name."""

    def build_results():
        # A fresh dictionary per pipeline, so the two results never share mutable state.
        # Every metric carries exactly one score per query (two queries, two scores).
        return {
            "inputs": {
                "query_id": ["53c3b3e6", "225f87f7"],
                "question": ["What is the capital of France?", "What is the capital of Spain?"],
                "contexts": ["wiki_France", "wiki_Spain"],
                "answer": ["Paris", "Madrid"],
                "predicted_answer": ["Paris", "Madrid"],
            },
            "metrics": [
                {"name": "reciprocal_rank", "individual_scores": [0.378064, 0.534964]},
                {"name": "single_hit", "individual_scores": [1, 1]},
                {"name": "multi_hit", "individual_scores": [0.706125, 0.454976]},
                {"name": "context_relevance", "individual_scores": [0.805466, 0.410251]},
                {"name": "faithfulness", "individual_scores": [0.135581, 0.695974]},
                {"name": "semantic_answer_similarity", "individual_scores": [0.971241, 0.159320]},
            ],
        }

    evaluator_1 = EvaluationResult(pipeline_name="testing_pipeline_1", results=build_results())
    evaluator_2 = EvaluationResult(pipeline_name="testing_pipeline_2", results=build_results())
    results = evaluator_1.comparative_individual_scores_report(evaluator_2)

    assert results.to_json() == (
        '{"query_id":{"0":"53c3b3e6","1":"225f87f7"},'
        '"question":{"0":"What is the capital of France?","1":"What is the capital of Spain?"},'
        '"contexts":{"0":"wiki_France","1":"wiki_Spain"},"answer":{"0":"Paris","1":"Madrid"},'
        '"testing_pipeline_1_predicted_answer":{"0":"Paris","1":"Madrid"},'
        '"testing_pipeline_1_reciprocal_rank":{"0":0.378064,"1":0.534964},'
        '"testing_pipeline_1_single_hit":{"0":1,"1":1},'
        '"testing_pipeline_1_multi_hit":{"0":0.706125,"1":0.454976},'
        '"testing_pipeline_1_context_relevance":{"0":0.805466,"1":0.410251},'
        '"testing_pipeline_1_faithfulness":{"0":0.135581,"1":0.695974},'
        '"testing_pipeline_1_semantic_answer_similarity":{"0":0.971241,"1":0.15932},'
        '"testing_pipeline_2_predicted_answer":{"0":"Paris","1":"Madrid"},'
        '"testing_pipeline_2_reciprocal_rank":{"0":0.378064,"1":0.534964},'
        '"testing_pipeline_2_single_hit":{"0":1,"1":1},'
        '"testing_pipeline_2_multi_hit":{"0":0.706125,"1":0.454976},'
        '"testing_pipeline_2_context_relevance":{"0":0.805466,"1":0.410251},'
        '"testing_pipeline_2_faithfulness":{"0":0.135581,"1":0.695974},'
        '"testing_pipeline_2_semantic_answer_similarity":{"0":0.971241,"1":0.15932}}'
    )
|
||||
Loading…
x
Reference in New Issue
Block a user