Remove all evaluator components (#7053)

Silvano Cerza 2024-02-21 18:24:14 +01:00 committed by GitHub
parent f3be576b5c
commit 8ca4bf405b
5 changed files with 0 additions and 718 deletions


@@ -1,4 +0,0 @@
from .sas_evaluator import SASEvaluator
from .statistical_evaluator import StatisticalEvaluator, StatisticalMetric
__all__ = ["SASEvaluator", "StatisticalEvaluator", "StatisticalMetric"]


@@ -1,144 +0,0 @@
from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, expit
from haystack.utils.auth import Secret, deserialize_secrets_inplace

with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=2.2.0\"'") as sas_import:
    from sentence_transformers import CrossEncoder, SentenceTransformer, util
    from transformers import AutoConfig


@component
class SASEvaluator:
    """
    SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of labels.
    It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.

    The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
    Bi-Encoder or a Cross-Encoder; which one is loaded depends on the `model` parameter.
    """

    def __init__(
        self,
        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        batch_size: int = 32,
        device: Optional[ComponentDevice] = None,
        token: Secret = Secret.from_env_var("HF_API_TOKEN", strict=False),
    ):
        """
        Creates a new instance of SASEvaluator.

        :param model: SentenceTransformers semantic textual similarity model; should be a path or a string pointing to
            a downloadable model.
        :param batch_size: Number of prediction-label pairs to encode at once.
        :param device: The device on which the model is loaded. If `None`, the default device is automatically
            selected.
        :param token: The Hugging Face token for HTTP bearer authorization.
            You can find your HF token at https://huggingface.co/settings/tokens.
        """
        sas_import.check()
        self._model = model
        self._batch_size = batch_size
        self._device = device
        self._token = token
        self._similarity_model = None

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(
            self,
            model=self._model,
            batch_size=self._batch_size,
            device=self._device.to_dict() if self._device else None,
            token=self._token.to_dict() if self._token else None,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SASEvaluator":
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        if device := data.get("init_parameters", {}).get("device"):
            data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Load the model used for evaluation.
        """
        token = self._token.resolve_value() if self._token else None
        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
        cross_encoder_used = False
        if config.architectures:
            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
        device = ComponentDevice.resolve_device(self._device).to_torch_str()
        # Based on the model string we load either a Bi-Encoder or a Cross-Encoder.
        # The similarity computation differs between the two approaches.
        if cross_encoder_used:
            self._similarity_model = CrossEncoder(
                self._model,
                device=device,
                tokenizer_args={"use_auth_token": token},
                automodel_args={"use_auth_token": token},
            )
        else:
            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)

    @component.output_types(sas=float, scores=List[float])
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predictions and a
        list of labels. Both must be lists of strings of the same length.

        :param predictions: List of predictions.
        :param labels: List of labels against which the predictions are compared.
        :returns: A dictionary with the following outputs:
            * `sas` - Cumulative SAS score for the entire dataset.
            * `scores` - A list of similarity scores for each prediction-label pair.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            return {"sas": 0.0, "scores": [0.0]}

        if not self._similarity_model:
            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
            raise RuntimeError(msg)

        if isinstance(self._similarity_model, CrossEncoder):
            # For Cross-Encoders we create a list of pairs of predictions and labels
            sentence_pairs = [[pred, label] for pred, label in zip(predictions, labels)]
            similarity_scores = self._similarity_model.predict(
                sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True
            )

            # Not all Cross-Encoders return normalized scores (some output raw logits),
            # so we normalize the scores if any of them are larger than 1
            if (similarity_scores > 1).any():
                similarity_scores = expit(similarity_scores)

            # Convert scores from a numpy array to a list of floats
            similarity_scores = similarity_scores.tolist()
        else:
            # For Bi-Encoders we create embeddings separately for predictions and labels
            predictions_embeddings = self._similarity_model.encode(
                predictions, batch_size=self._batch_size, convert_to_tensor=True
            )
            label_embeddings = self._similarity_model.encode(
                labels, batch_size=self._batch_size, convert_to_tensor=True
            )

            # Compute cosine similarities.
            # cos_sim computes the cosine similarity between all pairs of vectors in predictions_embeddings
            # and label_embeddings; it returns a matrix with shape (len(predictions), len(labels)),
            # so we keep only the diagonal (each prediction against its own label).
            scores = util.cos_sim(predictions_embeddings, label_embeddings)
            similarity_scores = [scores[i][i].item() for i in range(len(predictions))]

        sas_score = np_mean(similarity_scores)
        return {"sas": sas_score, "scores": similarity_scores}


@@ -1,171 +0,0 @@
import collections
import itertools
from enum import Enum
from typing import Any, Dict, List, Union

from numpy import array as np_array
from numpy import mean as np_mean

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


class StatisticalMetric(Enum):
    """
    Metrics supported by the StatisticalEvaluator.
    """

    F1 = "f1"
    EM = "exact_match"
    RECALL_SINGLE_HIT = "recall_single_hit"
    RECALL_MULTI_HIT = "recall_multi_hit"
    MRR = "mean_reciprocal_rank"

    @classmethod
    def from_str(cls, metric: str) -> "StatisticalMetric":
        map = {e.value: e for e in StatisticalMetric}
        metric_ = map.get(metric)
        if metric_ is None:
            raise ValueError(f"Unknown statistical metric '{metric}'")
        return metric_


@component
class StatisticalEvaluator:
    """
    StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics.
    It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the
    generated answers.

    The supported metrics are:
    - F1: Measures word overlap between predictions and labels.
    - Exact Match: Measures the proportion of cases where a prediction is identical to the expected label.
    - Recall Single Hit: Measures how many of the labels are present in at least one prediction.
    - Recall Multi Hit: Measures how many times the labels are present across all predictions.
    - Mean Reciprocal Rank: Measures the mean reciprocal rank of the first prediction containing each label.
    """

    def __init__(self, metric: Union[str, StatisticalMetric]):
        """
        Creates a new instance of StatisticalEvaluator.

        :param metric: Metric to use for evaluation in this component. Supported metrics are F1, Exact Match,
            Recall Single Hit, Recall Multi Hit and Mean Reciprocal Rank.
        """
        if isinstance(metric, str):
            metric = StatisticalMetric.from_str(metric)
        self._metric = metric

        self._metric_function = {
            StatisticalMetric.F1: self._f1,
            StatisticalMetric.EM: self._exact_match,
            StatisticalMetric.RECALL_SINGLE_HIT: self._recall_single_hit,
            StatisticalMetric.RECALL_MULTI_HIT: self._recall_multi_hit,
            StatisticalMetric.MRR: self._mrr,
        }[self._metric]

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(self, metric=self._metric.value)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator":
        data["init_parameters"]["metric"] = StatisticalMetric(data["init_parameters"]["metric"])
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the StatisticalEvaluator to compute the metric between a list of predictions and a list of labels.
        Both must be lists of strings of the same length.

        :param predictions: List of predictions.
        :param labels: List of labels against which the predictions are compared.
        :returns: A dictionary with the following outputs:
            * `result` - Calculated result of the chosen metric.
        """
        return {"result": self._metric_function(labels, predictions)}

    @staticmethod
    def _f1(labels: List[str], predictions: List[str]):
        """
        Measure word overlap between predictions and labels.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels
            # are of equal length
            return 0.0

        scores: List[float] = []
        tokenized_predictions = [pred.split() for pred in predictions]
        tokenized_labels = [label.split() for label in labels]
        for label_tokens, prediction_tokens in zip(tokenized_labels, tokenized_predictions):
            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
            num_same = sum(common.values())
            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
                # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
                return int(label_tokens == prediction_tokens)
            if num_same == 0:
                return 0
            precision = 1.0 * num_same / len(prediction_tokens)
            recall = 1.0 * num_same / len(label_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            scores.append(f1)

        return np_mean(scores)

    @staticmethod
    def _exact_match(labels: List[str], predictions: List[str]) -> float:
        """
        Measure the proportion of cases where a prediction is identical to the expected label.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels
            # are of equal length
            return 0.0

        score_list = np_array(predictions) == np_array(labels)
        return np_mean(score_list)

    @staticmethod
    def _recall_single_hit(labels: List[str], predictions: List[str]) -> float:
        """
        Measures how many of the labels are present in at least one prediction.
        If the same label is found in multiple predictions it is only counted once.
        """
        if len(labels) == 0:
            return 0.0

        # In Recall Single Hit we only consider whether a label is present in at least one prediction.
        # There is no need to count multiple occurrences of the same label in different predictions.
        retrieved_labels = {l for l, p in itertools.product(labels, predictions) if l in p}
        return len(retrieved_labels) / len(labels)

    @staticmethod
    def _recall_multi_hit(labels: List[str], predictions: List[str]) -> float:
        """
        Measures how many times the labels are present across all predictions, averaged over the number of labels.
        """
        if len(labels) == 0:
            return 0.0

        correct_retrievals = 0
        for label, prediction in itertools.product(labels, predictions):
            if label in prediction:
                correct_retrievals += 1

        return correct_retrievals / len(labels)

    @staticmethod
    def _mrr(labels: List[str], predictions: List[str]) -> float:
        """
        Measures the mean reciprocal rank of the first prediction in which each label is present.
        """
        if len(labels) == 0:
            return 0.0

        mrr_sum = 0.0
        for label in labels:
            for rank, prediction in enumerate(predictions):
                if label in prediction:
                    mrr_sum += 1 / (rank + 1)
                    break

        return mrr_sum / len(labels)
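
For context, a minimal usage sketch of the removed StatisticalEvaluator, assuming a pre-removal Haystack install. The F1 arithmetic mirrors the test further down: for label "Source" against prediction "Open Source", precision is 1/2, recall is 1/1, so F1 = 2 * (0.5 * 1) / (0.5 + 1) = 2/3.

from haystack.components.evaluators import StatisticalEvaluator, StatisticalMetric

f1_eval = StatisticalEvaluator(metric=StatisticalMetric.F1)
print(f1_eval.run(labels=["Source"], predictions=["Open Source"]))  # {'result': 0.666...}

# Metrics can also be selected by their string value, e.g. "exact_match" or "mean_reciprocal_rank"
em_eval = StatisticalEvaluator(metric="exact_match")
print(em_eval.run(labels=["OpenSource", "HaystackAI"], predictions=["OpenSource", "LLMs"]))  # {'result': 0.5}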


@@ -1,174 +0,0 @@
import pytest

from haystack.components.evaluators import SASEvaluator
from haystack.utils.device import ComponentDevice


class TestSASEvaluator:
    def test_init_default(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator()
        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device is None
        assert evaluator._token.resolve_value() == "fake-token"

    def test_to_dict(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator(device=ComponentDevice.from_str("cuda:0"))
        expected_dict = {
            "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
            "init_parameters": {
                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                "batch_size": 32,
                "device": {"type": "single", "device": "cuda:0"},
                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
            },
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
                "init_parameters": {
                    "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                    "batch_size": 32,
                    "device": {"type": "single", "device": "cuda:0"},
                    "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
                },
            }
        )
        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device.to_torch_str() == "cuda:0"
        assert evaluator._token.resolve_value() == "fake-token"

    def test_run_with_empty_inputs(self):
        evaluator = SASEvaluator()
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 2
        assert result["sas"] == 0.0
        assert result["scores"] == [0.0]

    def test_run_with_different_lengths(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_not_warmed_up(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(RuntimeError):
            evaluator.run(labels=labels, predictions=predictions)

    @pytest.mark.integration
    def test_run_with_matching_predictions(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_single_prediction(self):
        evaluator = SASEvaluator()
        labels = ["US $2.3 billion"]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=["A construction budget of US $2.3 billion"])
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.689089, abs=1e-5)
        assert result["scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_mismatched_predictions(self):
        evaluator = SASEvaluator()
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.8227189)
        assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_bi_encoder_model(self):
        evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_cross_encoder_model(self):
        evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.999967, abs=1e-5)
        assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)


@@ -1,225 +0,0 @@
import pytest

from haystack.components.evaluators import StatisticalEvaluator, StatisticalMetric


class TestStatisticalEvaluator:
    def test_init_default(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        assert evaluator._metric == StatisticalMetric.F1

    def test_init_with_string(self):
        evaluator = StatisticalEvaluator(metric="exact_match")
        assert evaluator._metric == StatisticalMetric.EM

    def test_to_dict(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        expected_dict = {
            "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
            "init_parameters": {"metric": "f1"},
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self):
        evaluator = StatisticalEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
                "init_parameters": {"metric": "f1"},
            }
        )
        assert evaluator._metric == StatisticalMetric.F1


class TestStatisticalEvaluatorF1:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        result = evaluator.run(labels=["Source"], predictions=["Open Source"])
        assert len(result) == 1
        assert result["result"] == pytest.approx(2 / 3)

    def test_run_with_mismatched_predictions(self):
        labels = ["Source", "HaystackAI"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        predictions = ["Open Source", "HaystackAI"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == pytest.approx(5 / 6)


class TestStatisticalEvaluatorExactMatch:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(predictions=[], labels=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(labels=["OpenSource"], predictions=["OpenSource"])
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_mismatched_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = ["Source", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 2 / 3


class TestStatisticalEvaluatorRecallSingleHit:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 2 / 4

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0


class TestStatisticalEvaluatorRecallMultiHit:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.75

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0


class TestStatisticalEvaluatorMRR:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1 / 3

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0