haystack/haystack/components/eval/statistical_evaluator.py

import collections
from enum import Enum
from typing import Any, Dict, List, Union

from numpy import array as np_array
from numpy import mean as np_mean

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


class StatisticalMetric(Enum):
"""
Metrics supported by the StatisticalEvaluator.
"""
F1 = "f1"
EM = "exact_match"

    @classmethod
    def from_str(cls, metric: str) -> "StatisticalMetric":
        """
        Convert a metric name such as "f1" or "exact_match" into the corresponding enum member.
        """
        # Map each metric's string value to the corresponding enum member.
        mapping = {e.value: e for e in StatisticalMetric}
        metric_ = mapping.get(metric)
        if metric_ is None:
            raise ValueError(f"Unknown statistical metric '{metric}'")
        return metric_


@component
class StatisticalEvaluator:
"""
    StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics.
    It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the
    generated answers.

The supported metrics are:
- F1: Measures word overlap between predictions and labels.
    - Exact Match: Measures the proportion of cases where the prediction is identical to the expected label.
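
    Usage example (an illustrative sketch; the import path is an assumption based on this file's location):

    ```python
    from haystack.components.eval.statistical_evaluator import StatisticalEvaluator

    evaluator = StatisticalEvaluator(metric="f1")
    result = evaluator.run(
        labels=["Berlin", "Paris"],
        predictions=["Berlin", "Lyon"],
    )
    print(result["result"])  # 0.5 -> mean token-level F1 over the two pairs (illustrative values)
    ```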
"""

    def __init__(self, metric: Union[str, StatisticalMetric]):
        """
        Creates a new instance of StatisticalEvaluator.

        :param metric: Metric to use for evaluation in this component. Supported metrics are F1 and Exact Match.
        """
if isinstance(metric, str):
metric = StatisticalMetric.from_str(metric)
self._metric = metric
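        # Resolve the metric to its implementing function once, so run() can dispatch without branching.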
self._metric_function = {StatisticalMetric.F1: self._f1, StatisticalMetric.EM: self._exact_match}[self._metric]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this component to a dictionary."""
        return default_to_dict(self, metric=self._metric.value)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator":
        """Deserialize this component from a dictionary."""
        data["init_parameters"]["metric"] = StatisticalMetric(data["init_parameters"]["metric"])
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the StatisticalEvaluator to compute the metric between a list of predictions and a list of labels.

        Both inputs must be lists of strings of the same length.

        :param labels: List of labels against which the predictions are compared.
        :param predictions: List of predictions.
        :returns: A dictionary with the following outputs:
* `result` - Calculated result of the chosen metric.
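
        For example (illustrative values), running with metric "exact_match" on labels `["a", "b"]` and
        predictions `["a", "c"]` returns `{"result": 0.5}`, since one of the two predictions matches its label.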
"""
if len(labels) != len(predictions):
raise ValueError("The number of predictions and labels must be the same.")
return {"result": self._metric_function(labels, predictions)}

    @staticmethod
    def _f1(labels: List[str], predictions: List[str]) -> float:
"""
Measure word overlap between predictions and labels.
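
        For each label/prediction pair, token-level precision and recall are combined as
        F1 = 2 * precision * recall / (precision + recall), and the mean over all pairs is returned.
        As an illustrative example, the label "the cat sat" scored against the prediction "the cat"
        shares two tokens, so precision = 2/2, recall = 2/3 and F1 = 0.8.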
"""
if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels are of equal length.
return 0.0
scores: List[float] = []
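        # Tokenize by splitting on whitespace; no further normalization (lowercasing, punctuation stripping) is applied.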
tokenized_predictions = [pred.split() for pred in predictions]
tokenized_labels = [label.split() for label in labels]
        for label_tokens, prediction_tokens in zip(tokenized_labels, tokenized_predictions):
            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
            num_same = sum(common.values())
            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
                # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
                scores.append(float(label_tokens == prediction_tokens))
                continue
            if num_same == 0:
                scores.append(0.0)
                continue
            precision = 1.0 * num_same / len(prediction_tokens)
            recall = 1.0 * num_same / len(label_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            scores.append(f1)

        return float(np_mean(scores))

    @staticmethod
    def _exact_match(labels: List[str], predictions: List[str]) -> float:
        """
        Measure the proportion of cases where the prediction is identical to the expected label.
        """
if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels are of equal length.
return 0.0
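        # Element-wise string comparison yields a boolean array; its mean is the exact-match rate.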
score_list = np_array(predictions) == np_array(labels)
        return float(np_mean(score_list))