Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-08 04:56:45 +00:00
add docstrings for EvaluationRunResult (#7885)

parent 28902c4c65
commit 7c31d5f418
@@ -3,7 +3,8 @@ loaders:
   search_path: [../../../haystack/evaluation]
   modules:
     [
-      "eval_run_result"
+      "base",
+      "eval_run_result",
     ]
   ignore_when_discovered: ["__init__"]
 processors:
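The hunk above registers the new base module with the pydoc-markdown loader so its docstrings land in the generated API reference. As a quick sanity check before rebuilding the docs (this snippet is illustrative, not part of the commit, and assumes a haystack install that contains this change):

import importlib

# Every module listed in the pydoc config should import cleanly; a typo in
# the config would otherwise only surface during the docs build.
for name in ("haystack.evaluation.base", "haystack.evaluation.eval_run_result"):
    importlib.import_module(name)
    print(f"{name}: OK")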
haystack/evaluation/__init__.py

@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from .eval_run_result import BaseEvaluationRunResult, EvaluationRunResult
+from .base import BaseEvaluationRunResult
+from .eval_run_result import EvaluationRunResult
 
 __all__ = ["BaseEvaluationRunResult", "EvaluationRunResult"]
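Splitting the import across two modules leaves __all__ untouched, so the package's public surface is unchanged; only the module that defines the base class moved. A minimal check, assuming a haystack install that includes this commit:

from haystack.evaluation import BaseEvaluationRunResult, EvaluationRunResult

# The concrete result type still implements the relocated interface.
assert issubclass(EvaluationRunResult, BaseEvaluationRunResult)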
haystack/evaluation/base.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from pandas import DataFrame
+
+
+class BaseEvaluationRunResult(ABC):
+    """
+    Represents the results of an evaluation run.
+    """
+
+    @abstractmethod
+    def to_pandas(self) -> "DataFrame":
+        """
+        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
+
+        :returns:
+            Pandas DataFrame with the scores.
+        """
+
+    @abstractmethod
+    def score_report(self) -> "DataFrame":
+        """
+        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            Pandas DataFrame with the aggregated scores.
+        """
+
+    @abstractmethod
+    def comparative_individual_scores_report(
+        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
+    ) -> "DataFrame":
+        """
+        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
+
+        The inputs to both evaluation runs are assumed to be the same.
+
+        :param other:
+            Results of another evaluation run to compare with.
+        :param keep_columns:
+            List of common column names to keep from the inputs of the evaluation runs to compare.
+        :returns:
+            Pandas DataFrame with the score comparison.
+        """
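Since BaseEvaluationRunResult is an ABC, a subclass can only be instantiated once all three abstract methods are implemented. A hypothetical sketch of an alternative backend (DummyRunResult and its column handling are invented for illustration):

import pandas as pd

from typing import List, Optional
from pandas import DataFrame
from haystack.evaluation import BaseEvaluationRunResult


class DummyRunResult(BaseEvaluationRunResult):
    """Invented subclass holding one per-sample score column per metric."""

    def __init__(self, scores: DataFrame) -> None:
        self._scores = scores

    def to_pandas(self) -> DataFrame:
        return self._scores

    def score_report(self) -> DataFrame:
        # Aggregate each metric column to its mean, mirroring the
        # ["metrics", "score"] shape used by EvaluationRunResult.
        report = self._scores.mean(numeric_only=True).reset_index()
        report.columns = ["metrics", "score"]
        return report

    def comparative_individual_scores_report(
        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
    ) -> DataFrame:
        # keep_columns is ignored in this toy implementation.
        return self.to_pandas().join(other.to_pandas(), rsuffix="_other")


print(DummyRunResult(pd.DataFrame({"exact_match": [1.0, 0.0]})).score_report())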
haystack/evaluation/eval_run_result.py

@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
 from warnings import warn
@@ -10,46 +9,7 @@ from warnings import warn
 from pandas import DataFrame
 from pandas import concat as pd_concat
 
-
-class BaseEvaluationRunResult(ABC):
-    """
-    Represents the results of an evaluation run.
-    """
-
-    @abstractmethod
-    def to_pandas(self) -> "DataFrame":
-        """
-        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
-
-        :returns:
-            Pandas DataFrame with the scores.
-        """
-
-    @abstractmethod
-    def score_report(self) -> "DataFrame":
-        """
-        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
-
-        :returns:
-            Pandas DataFrame with the aggregated scores.
-        """
-
-    @abstractmethod
-    def comparative_individual_scores_report(
-        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
-    ) -> "DataFrame":
-        """
-        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
-
-        The inputs to both evaluation runs is assumed to be the same.
-
-        :param other:
-            Results of another evaluation run to compare with.
-        :param keep_columns:
-            List of common column names to keep from the inputs of the evaluation runs to compare.
-        :returns:
-            Pandas DataFrame with the score comparison.
-        """
+from .base import BaseEvaluationRunResult
 
 
 class EvaluationRunResult(BaseEvaluationRunResult):
@@ -99,13 +59,25 @@ class EvaluationRunResult(BaseEvaluationRunResult):
                 f"Got {len(outputs['individual_scores'])} but expected {expected_len}."
             )
 
-    def score_report(self) -> DataFrame:  # noqa: D102
+    def score_report(self) -> DataFrame:
+        """
+        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            Pandas DataFrame with the aggregated scores.
+        """
         results = {k: v["score"] for k, v in self.results.items()}
         df = DataFrame.from_dict(results, orient="index", columns=["score"]).reset_index()
         df.columns = ["metrics", "score"]
         return df
 
-    def to_pandas(self) -> DataFrame:  # noqa: D102
+    def to_pandas(self) -> DataFrame:
+        """
+        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
+
+        :returns:
+            Pandas DataFrame with the scores.
+        """
         inputs_columns = list(self.inputs.keys())
         inputs_values = list(self.inputs.values())
         inputs_values = list(map(list, zip(*inputs_values)))  # transpose the values
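For orientation, a usage sketch of the two newly documented methods. The constructor signature and the toy data are assumptions inferred from how self.inputs and self.results are used in this file (per-metric dicts carrying an aggregate "score" plus per-sample "individual_scores" of the same length as each input column):

from haystack.evaluation import EvaluationRunResult

inputs = {"question": ["Q1", "Q2"], "answer": ["A1", "A2"]}
results = {
    "exact_match": {"score": 0.5, "individual_scores": [1.0, 0.0]},
    "f1": {"score": 0.75, "individual_scores": [1.0, 0.5]},
}

run = EvaluationRunResult(run_name="baseline", inputs=inputs, results=results)
print(run.score_report())  # one row per metric, columns ["metrics", "score"]
print(run.to_pandas())     # per-sample scores joined onto the input columns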
@@ -118,9 +90,21 @@ class EvaluationRunResult(BaseEvaluationRunResult):
 
         return df_inputs.join(df_scores)
 
-    def comparative_individual_scores_report(  # noqa: D102
+    def comparative_individual_scores_report(
         self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
     ) -> DataFrame:
+        """
+        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
+
+        The inputs to both evaluation runs are assumed to be the same.
+
+        :param other:
+            Results of another evaluation run to compare with.
+        :param keep_columns:
+            List of common column names to keep from the inputs of the evaluation runs to compare.
+        :returns:
+            Pandas DataFrame with the score comparison.
+        """
         if not isinstance(other, EvaluationRunResult):
             raise ValueError("Comparative scores can only be computed between EvaluationRunResults.")
 
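Finally, a sketch of the comparison path, continuing the toy run and inputs from the previous example. A second run over the same inputs is required; keep_columns names the shared input columns to carry into the merged frame, and the values below are again invented:

results_other = {
    "exact_match": {"score": 1.0, "individual_scores": [1.0, 1.0]},
    "f1": {"score": 0.9, "individual_scores": [0.9, 0.9]},
}
other_run = EvaluationRunResult(run_name="candidate", inputs=inputs, results=results_other)

# Passing anything other than an EvaluationRunResult raises the
# ValueError guarded in the code above.
comparison = run.comparative_individual_scores_report(other_run, keep_columns=["question"])
print(comparison)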