Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-07 20:46:31 +00:00

Remove all evaluator components (#7053)

This commit is contained in:
parent f3be576b5c
commit 8ca4bf405b
@@ -1,4 +0,0 @@
from .sas_evaluator import SASEvaluator
from .statistical_evaluator import StatisticalEvaluator, StatisticalMetric

__all__ = ["SASEvaluator", "StatisticalEvaluator", "StatisticalMetric"]
@@ -1,144 +0,0 @@
from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, expit
from haystack.utils.auth import Secret, deserialize_secrets_inplace

with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=2.2.0\"'") as sas_import:
    from sentence_transformers import CrossEncoder, SentenceTransformer, util
    from transformers import AutoConfig


@component
class SASEvaluator:
    """
    SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of labels.
    It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.

    The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
    Bi-Encoder or a Cross-Encoder. The choice of the model is based on the `model` parameter.
    """

    def __init__(
        self,
        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        batch_size: int = 32,
        device: Optional[ComponentDevice] = None,
        token: Secret = Secret.from_env_var("HF_API_TOKEN", strict=False),
    ):
        """
        Creates a new instance of SASEvaluator.

        :param model: SentenceTransformers semantic textual similarity model, should be a path or string pointing to
            a downloadable model.
        :param batch_size: Number of prediction-label pairs to encode at once.
        :param device: The device on which the model is loaded. If `None`, the default device is automatically
            selected.
        :param token: The Hugging Face token for HTTP bearer authorization.
            You can find your HF token at https://huggingface.co/settings/tokens.
        """
        sas_import.check()

        self._model = model
        self._batch_size = batch_size
        self._device = device
        self._token = token
        self._similarity_model = None

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(
            self,
            model=self._model,
            batch_size=self._batch_size,
            device=self._device.to_dict() if self._device else None,
            token=self._token.to_dict() if self._token else None,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SASEvaluator":
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        if device := data.get("init_parameters", {}).get("device"):
            data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Load the model used for evaluation.
        """
        token = self._token.resolve_value() if self._token else None
        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
        cross_encoder_used = False
        if config.architectures:
            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
        device = ComponentDevice.resolve_device(self._device).to_torch_str()
        # Based on the model's architecture we load either a Bi-Encoder or a Cross-Encoder.
        # The similarity computation differs between the two approaches.
        if cross_encoder_used:
            self._similarity_model = CrossEncoder(
                self._model,
                device=device,
                tokenizer_args={"use_auth_token": token},
                automodel_args={"use_auth_token": token},
            )
        else:
            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)

    @component.output_types(sas=float, scores=List[float])
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predictions and a list
        of labels. Both must be lists of strings of the same length.

        :param predictions: List of predictions.
        :param labels: List of labels against which the predictions are compared.
        :returns: A dictionary with the following outputs:
            * `sas` - Cumulative SAS score for the entire dataset.
            * `scores` - A list of similarity scores for each prediction-label pair.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            return {"sas": 0.0, "scores": [0.0]}

        if not self._similarity_model:
            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
            raise RuntimeError(msg)

        if isinstance(self._similarity_model, CrossEncoder):
            # For Cross-Encoders we create a list of prediction-label pairs
            sentence_pairs = [[pred, label] for pred, label in zip(predictions, labels)]
            similarity_scores = self._similarity_model.predict(
                sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True
            )

            # Cross-Encoders do not always return normalized scores,
            # so we apply the sigmoid (expit) if any score is larger than 1
            if (similarity_scores > 1).any():
                similarity_scores = expit(similarity_scores)

            # Convert the scores from a numpy array to a list of floats
            similarity_scores = similarity_scores.tolist()

        else:
            # For Bi-Encoders we create embeddings separately for predictions and labels
            predictions_embeddings = self._similarity_model.encode(
                predictions, batch_size=self._batch_size, convert_to_tensor=True
            )
            label_embeddings = self._similarity_model.encode(
                labels, batch_size=self._batch_size, convert_to_tensor=True
            )

            # Compute cosine similarities
            scores = util.cos_sim(predictions_embeddings, label_embeddings)

            # cos_sim computes the cosine similarity between all pairs of vectors in predictions_embeddings
            # and label_embeddings, returning a matrix with shape (len(predictions), len(labels)).
            # We only need the diagonal, i.e. each prediction scored against its own label.
            similarity_scores = [scores[i][i].item() for i in range(len(predictions))]

        sas_score = np_mean(similarity_scores)

        return {"sas": sas_score, "scores": similarity_scores}
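For reference, here is a minimal usage sketch of the SASEvaluator removed above, mirroring how the tests further down exercise it. It assumes a haystack-ai release that still ships the component (i.e. a version prior to this commit) and that scikit-learn and sentence-transformers are installed.

# Minimal usage sketch (assumes a haystack-ai version prior to this commit).
from haystack.components.evaluators import SASEvaluator

evaluator = SASEvaluator(model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
evaluator.warm_up()  # loads the Bi-Encoder or Cross-Encoder

result = evaluator.run(
    labels=["US $2.3 billion"],
    predictions=["A construction budget of US $2.3 billion"],
)
print(result["sas"])     # mean similarity over all prediction-label pairs
print(result["scores"])  # one similarity score per pair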
@@ -1,171 +0,0 @@
import collections
import itertools
from enum import Enum
from typing import Any, Dict, List, Union

from numpy import array as np_array
from numpy import mean as np_mean

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


class StatisticalMetric(Enum):
    """
    Metrics supported by the StatisticalEvaluator.
    """

    F1 = "f1"
    EM = "exact_match"
    RECALL_SINGLE_HIT = "recall_single_hit"
    RECALL_MULTI_HIT = "recall_multi_hit"
    MRR = "mean_reciprocal_rank"

    @classmethod
    def from_str(cls, metric: str) -> "StatisticalMetric":
        mapping = {e.value: e for e in StatisticalMetric}
        metric_ = mapping.get(metric)
        if metric_ is None:
            raise ValueError(f"Unknown statistical metric '{metric}'")
        return metric_


@component
class StatisticalEvaluator:
    """
    StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics.
    It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the
    generated answers.

    The supported metrics are:
    - F1: Measures word overlap between predictions and labels.
    - Exact Match: Measures the proportion of cases where prediction is identical to the expected label.
    - Recall (single hit): Measures how many labels are present in at least one prediction.
    - Recall (multi hit): Measures how many times the labels are present across all predictions.
    - Mean Reciprocal Rank: Measures the reciprocal rank of the first prediction containing each label,
      averaged over all labels.
    """

    def __init__(self, metric: Union[str, StatisticalMetric]):
        """
        Creates a new instance of StatisticalEvaluator.

        :param metric: Metric to use for evaluation in this component, either a `StatisticalMetric` or its
            string value.
        """
        if isinstance(metric, str):
            metric = StatisticalMetric.from_str(metric)
        self._metric = metric

        self._metric_function = {
            StatisticalMetric.F1: self._f1,
            StatisticalMetric.EM: self._exact_match,
            StatisticalMetric.RECALL_SINGLE_HIT: self._recall_single_hit,
            StatisticalMetric.RECALL_MULTI_HIT: self._recall_multi_hit,
            StatisticalMetric.MRR: self._mrr,
        }[self._metric]

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(self, metric=self._metric.value)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator":
        data["init_parameters"]["metric"] = StatisticalMetric(data["init_parameters"]["metric"])
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the StatisticalEvaluator to compute the metric between a list of predictions and a list of labels.
        Both must be lists of strings of the same length.

        :param predictions: List of predictions.
        :param labels: List of labels against which the predictions are compared.
        :returns: A dictionary with the following outputs:
            * `result` - Calculated result of the chosen metric.
        """
        return {"result": self._metric_function(labels, predictions)}

    @staticmethod
    def _f1(labels: List[str], predictions: List[str]) -> float:
        """
        Measure word overlap between predictions and labels.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels
            # are of equal length.
            return 0.0

        scores: List[float] = []
        tokenized_predictions = [pred.split() for pred in predictions]
        tokenized_labels = [label.split() for label in labels]
        for label_tokens, prediction_tokens in zip(tokenized_labels, tokenized_predictions):
            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
            num_same = sum(common.values())
            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
                # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
                return int(label_tokens == prediction_tokens)
            if num_same == 0:
                return 0
            precision = 1.0 * num_same / len(prediction_tokens)
            recall = 1.0 * num_same / len(label_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            scores.append(f1)

        return np_mean(scores)

    @staticmethod
    def _exact_match(labels: List[str], predictions: List[str]) -> float:
        """
        Measure the proportion of cases where prediction is identical to the expected label.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels
            # are of equal length.
            return 0.0
        score_list = np_array(predictions) == np_array(labels)
        return np_mean(score_list)

    @staticmethod
    def _recall_single_hit(labels: List[str], predictions: List[str]) -> float:
        """
        Measures how many times a label is present in at least one prediction.
        If the same label is found in multiple predictions it is only counted once.
        """
        if len(labels) == 0:
            return 0.0

        # In Recall Single Hit we only consider whether a label is present in at least one prediction.
        # There is no need to count multiple occurrences of the same label in different predictions.
        retrieved_labels = {l for l, p in itertools.product(labels, predictions) if l in p}
        return len(retrieved_labels) / len(labels)

    @staticmethod
    def _recall_multi_hit(labels: List[str], predictions: List[str]) -> float:
        """
        Measures how many times the labels are present across all predictions, counting repeated matches.
        """
        if len(labels) == 0:
            return 0.0

        correct_retrievals = 0
        for label, prediction in itertools.product(labels, predictions):
            if label in prediction:
                correct_retrievals += 1

        return correct_retrievals / len(labels)

    @staticmethod
    def _mrr(labels: List[str], predictions: List[str]) -> float:
        """
        Measures the mean reciprocal rank of the first prediction in which each label is present.
        """
        if len(labels) == 0:
            return 0.0

        mrr_sum = 0.0
        for label in labels:
            for rank, prediction in enumerate(predictions):
                if label in prediction:
                    mrr_sum += 1 / (rank + 1)
                    break

        return mrr_sum / len(labels)
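To make the metric logic above concrete, here is a small standalone sketch that mirrors the `_f1` and `_mrr` computations; it does not import the removed component, and the inputs are taken from the tests below.

# Standalone sketch mirroring _f1 and _mrr above; not part of the removed module.
import collections

# F1 for a single pair: word overlap between "Source" (label) and "Open Source" (prediction).
label_tokens = "Source".split()            # ["Source"]
prediction_tokens = "Open Source".split()  # ["Open", "Source"]
common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
num_same = sum(common.values())            # 1 shared token
precision = num_same / len(prediction_tokens)  # 1/2
recall = num_same / len(label_tokens)          # 1/1
f1 = 2 * precision * recall / (precision + recall)
print(f1)  # 0.666..., i.e. the 2/3 expected by test_run_with_single_prediction

# MRR: each label contributes the reciprocal rank of the first prediction containing it.
labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
predictions = [
    "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
    "The Eiffel Tower max height is 330 meters.",
    "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
    "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
]
mrr_sum = 0.0
for label in labels:
    for rank, prediction in enumerate(predictions):
        if label in prediction:
            mrr_sum += 1 / (rank + 1)
            break
print(mrr_sum / len(labels))  # (1 + 1/3) / 4 = 1/3, matching TestStatisticalEvaluatorMRR.test_run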
@@ -1,174 +0,0 @@
import pytest

from haystack.components.evaluators import SASEvaluator
from haystack.utils.device import ComponentDevice


class TestSASEvaluator:
    def test_init_default(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator()

        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device is None
        assert evaluator._token.resolve_value() == "fake-token"

    def test_to_dict(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")

        evaluator = SASEvaluator(device=ComponentDevice.from_str("cuda:0"))

        expected_dict = {
            "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
            "init_parameters": {
                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                "batch_size": 32,
                "device": {"type": "single", "device": "cuda:0"},
                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
            },
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
                "init_parameters": {
                    "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                    "batch_size": 32,
                    "device": {"type": "single", "device": "cuda:0"},
                    "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
                },
            }
        )

        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device.to_torch_str() == "cuda:0"
        assert evaluator._token.resolve_value() == "fake-token"

    def test_run_with_empty_inputs(self):
        evaluator = SASEvaluator()
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 2
        assert result["sas"] == 0.0
        assert result["scores"] == [0.0]

    def test_run_with_different_lengths(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_not_warmed_up(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(RuntimeError):
            evaluator.run(labels=labels, predictions=predictions)

    @pytest.mark.integration
    def test_run_with_matching_predictions(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)

        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_single_prediction(self):
        evaluator = SASEvaluator()

        labels = ["US $2.3 billion"]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=["A construction budget of US $2.3 billion"])
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.689089, abs=1e-5)
        assert result["scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_mismatched_predictions(self):
        evaluator = SASEvaluator()
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.8227189)
        assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_bi_encoder_model(self):
        evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_cross_encoder_model(self):
        evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.999967, abs=1e-5)
        assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)
@@ -1,225 +0,0 @@
import pytest

from haystack.components.evaluators import StatisticalEvaluator, StatisticalMetric


class TestStatisticalEvaluator:
    def test_init_default(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        assert evaluator._metric == StatisticalMetric.F1

    def test_init_with_string(self):
        evaluator = StatisticalEvaluator(metric="exact_match")
        assert evaluator._metric == StatisticalMetric.EM

    def test_to_dict(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)

        expected_dict = {
            "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
            "init_parameters": {"metric": "f1"},
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self):
        evaluator = StatisticalEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
                "init_parameters": {"metric": "f1"},
            }
        )

        assert evaluator._metric == StatisticalMetric.F1


class TestStatisticalEvaluatorF1:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)

        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)

        result = evaluator.run(labels=["Source"], predictions=["Open Source"])
        assert len(result) == 1
        assert result["result"] == pytest.approx(2 / 3)

    def test_run_with_mismatched_predictions(self):
        labels = ["Source", "HaystackAI"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        predictions = ["Open Source", "HaystackAI"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == pytest.approx(5 / 6)


class TestStatisticalEvaluatorExactMatch:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(predictions=[], labels=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)

        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(labels=["OpenSource"], predictions=["OpenSource"])
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_mismatched_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = ["Source", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 2 / 3


class TestStatisticalEvaluatorRecallSingleHit:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 2 / 4

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0


class TestStatisticalEvaluatorRecallMultiHit:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.75

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0


class TestStatisticalEvaluatorMRR:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1 / 3

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0