diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
deleted file mode 100644
index 3467a2ad8..000000000
--- a/haystack/components/evaluators/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .sas_evaluator import SASEvaluator
-from .statistical_evaluator import StatisticalEvaluator, StatisticalMetric
-
-__all__ = ["SASEvaluator", "StatisticalEvaluator", "StatisticalMetric"]
diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py
deleted file mode 100644
index 2c755922b..000000000
--- a/haystack/components/evaluators/sas_evaluator.py
+++ /dev/null
@@ -1,144 +0,0 @@
-from typing import Any, Dict, List, Optional
-
-from numpy import mean as np_mean
-
-from haystack import component, default_from_dict, default_to_dict
-from haystack.lazy_imports import LazyImport
-from haystack.utils import ComponentDevice, expit
-from haystack.utils.auth import Secret, deserialize_secrets_inplace
-
-with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=2.2.0\"'") as sas_import:
-    from sentence_transformers import CrossEncoder, SentenceTransformer, util
-    from transformers import AutoConfig
-
-
-@component
-class SASEvaluator:
-    """
-    SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of labels.
-    It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.
-
-    The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
-    Bi-Encoder or a Cross-Encoder. The choice of the model is based on the `model` parameter.
-    """
-
-    def __init__(
-        self,
-        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-        batch_size: int = 32,
-        device: Optional[ComponentDevice] = None,
-        token: Secret = Secret.from_env_var("HF_API_TOKEN", strict=False),
-    ):
-        """
-        Creates a new instance of SASEvaluator.
-
-        :param model: SentenceTransformers semantic textual similarity model, should be path or string pointing to
-            a downloadable model.
-        :param batch_size: Number of prediction-label pairs to encode at once.
-        :param device: The device on which the model is loaded. If `None`, the default device is automatically
-            selected.
-        :param token: The Hugging Face token for HTTP bearer authorization.
-            You can find your HF token at https://huggingface.co/settings/tokens.
-        """
-        sas_import.check()
-
-        self._model = model
-        self._batch_size = batch_size
-        self._device = device
-        self._token = token
-        self._similarity_model = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        return default_to_dict(
-            self,
-            model=self._model,
-            batch_size=self._batch_size,
-            device=self._device.to_dict() if self._device else None,
-            token=self._token.to_dict() if self._token else None,
-        )
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "SASEvaluator":
-        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
-        if device := data.get("init_parameters", {}).get("device"):
-            data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
-        return default_from_dict(cls, data)
-
-    def warm_up(self):
-        """
-        Load the model used for evaluation
-        """
-        token = self._token.resolve_value() if self._token else None
-        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
-        cross_encoder_used = False
-        if config.architectures:
-            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
-        device = ComponentDevice.resolve_device(self._device).to_torch_str()
-        # Based on the Model string we can load either Bi-Encoders or Cross Encoders.
-        # Similarity computation changes for both approaches
-        if cross_encoder_used:
-            self._similarity_model = CrossEncoder(
-                self._model,
-                device=device,
-                tokenizer_args={"use_auth_token": token},
-                automodel_args={"use_auth_token": token},
-            )
-        else:
-            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)
-
-    @component.output_types(sas=float, scores=List[float])
-    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
-        """
-        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predictions and a list of
-        labels. Both must be list of strings of same length.
-
-        :param predictions: List of predictions.
-        :param labels: List of labels against which the predictions are compared.
-        :returns: A dictionary with the following outputs:
-            * `sas` - Cumulative SAS score for the entire dataset.
-            * `scores` - A list of similarity scores for each prediction-label pair.
-        """
-        if len(labels) != len(predictions):
-            raise ValueError("The number of predictions and labels must be the same.")
-
-        if len(predictions) == 0:
-            return {"sas": 0.0, "scores": [0.0]}
-
-        if not self._similarity_model:
-            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
-            raise RuntimeError(msg)
-
-        if isinstance(self._similarity_model, CrossEncoder):
-            # For Cross Encoders we create a list of pairs of predictions and labels
-            sentence_pairs = [[pred, label] for pred, label in zip(predictions, labels)]
-            similarity_scores = self._similarity_model.predict(
-                sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True
-            )
-
-            # All Cross Encoders do not return a set of logits scores that are normalized
-            # We normalize scores if they are larger than 1
-            if (similarity_scores > 1).any():
-                similarity_scores = expit(similarity_scores)
-
-            # Convert scores to list of floats from numpy array
-            similarity_scores = similarity_scores.tolist()
-
-        else:
-            # For Bi-encoders we create embeddings separately for predictions and labels
-            predictions_embeddings = self._similarity_model.encode(
-                predictions, batch_size=self._batch_size, convert_to_tensor=True
-            )
-            label_embeddings = self._similarity_model.encode(
-                labels, batch_size=self._batch_size, convert_to_tensor=True
-            )
-
-            # Compute cosine-similarities
-            scores = util.cos_sim(predictions_embeddings, label_embeddings)
-
-            # cos_sim computes cosine similarity between all pairs of vectors in pred_embeddings and label_embeddings
-            # It returns a matrix with shape (len(predictions), len(labels))
-            similarity_scores = [scores[i][i].item() for i in range(len(predictions))]
-
-        sas_score = np_mean(similarity_scores)
-
-        return {"sas": sas_score, "scores": similarity_scores}
diff --git a/haystack/components/evaluators/statistical_evaluator.py b/haystack/components/evaluators/statistical_evaluator.py
deleted file mode 100644
index abf338687..000000000
--- a/haystack/components/evaluators/statistical_evaluator.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import collections
-import itertools
-from enum import Enum
-from typing import Any, Dict, List, Union
-
-from numpy import array as np_array
-from numpy import mean as np_mean
-
-from haystack import default_from_dict, default_to_dict
-from haystack.core.component import component
-
-
-class StatisticalMetric(Enum):
-    """
-    Metrics supported by the StatisticalEvaluator.
-    """
-
-    F1 = "f1"
-    EM = "exact_match"
-    RECALL_SINGLE_HIT = "recall_single_hit"
-    RECALL_MULTI_HIT = "recall_multi_hit"
-    MRR = "mean_reciprocal_rank"
-
-    @classmethod
-    def from_str(cls, metric: str) -> "StatisticalMetric":
-        map = {e.value: e for e in StatisticalMetric}
-        metric_ = map.get(metric)
-        if metric_ is None:
-            raise ValueError(f"Unknown statistical metric '{metric}'")
-        return metric_
-
-
-@component
-class StatisticalEvaluator:
-    """
-    StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics.
-    It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.
-
-    The supported metrics are:
-    - F1: Measures word overlap between predictions and labels.
-    - Exact Match: Measures the proportion of cases where prediction is identical to the expected label.
-    """
-
-    def __init__(self, metric: Union[str, StatisticalMetric]):
-        """
-        Creates a new instance of StatisticalEvaluator.
-
-        :param metric: Metric to use for evaluation in this component. Supported metrics are F1 and Exact Match.
-        """
-        if isinstance(metric, str):
-            metric = StatisticalMetric.from_str(metric)
-        self._metric = metric
-
-        self._metric_function = {
-            StatisticalMetric.F1: self._f1,
-            StatisticalMetric.EM: self._exact_match,
-            StatisticalMetric.RECALL_SINGLE_HIT: self._recall_single_hit,
-            StatisticalMetric.RECALL_MULTI_HIT: self._recall_multi_hit,
-            StatisticalMetric.MRR: self._mrr,
-        }[self._metric]
-
-    def to_dict(self) -> Dict[str, Any]:
-        return default_to_dict(self, metric=self._metric.value)
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator":
-        data["init_parameters"]["metric"] = StatisticalMetric(data["init_parameters"]["metric"])
-        return default_from_dict(cls, data)
-
-    @component.output_types(result=float)
-    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
-        """
-        Run the StatisticalEvaluator to compute the metric between a list of predictions and a list of labels.
-        Both must be list of strings of same length.
-
-        :param predictions: List of predictions.
-        :param labels: List of labels against which the predictions are compared.
-        :returns: A dictionary with the following outputs:
-            * `result` - Calculated result of the chosen metric.
-        """
-        return {"result": self._metric_function(labels, predictions)}
-
-    @staticmethod
-    def _f1(labels: List[str], predictions: List[str]):
-        """
-        Measure word overlap between predictions and labels.
-        """
-        if len(labels) != len(predictions):
-            raise ValueError("The number of predictions and labels must be the same.")
-
-        if len(predictions) == 0:
-            # We expect callers of this function already checked if predictions and labels are equal length
-            return 0.0
-
-        scores: List[float] = []
-        tokenized_predictions = [pred.split() for pred in predictions]
-        tokenized_labels = [label.split() for label in labels]
-        for label_tokens, prediction_tokens in zip(tokenized_labels, tokenized_predictions):
-            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
-            num_same = sum(common.values())
-            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
-                # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
-                return int(label_tokens == prediction_tokens)
-            if num_same == 0:
-                return 0
-            precision = 1.0 * num_same / len(prediction_tokens)
-            recall = 1.0 * num_same / len(label_tokens)
-            f1 = (2 * precision * recall) / (precision + recall)
-            scores.append(f1)
-
-        return np_mean(scores)
-
-    @staticmethod
-    def _exact_match(labels: List[str], predictions: List[str]) -> float:
-        """
-        Measure the proportion of cases where prediction is identical to the the expected label.
-        """
-        if len(labels) != len(predictions):
-            raise ValueError("The number of predictions and labels must be the same.")
-
-        if len(predictions) == 0:
-            # We expect callers of this function already checked if predictions and labels are equal length
-            return 0.0
-        score_list = np_array(predictions) == np_array(labels)
-        return np_mean(score_list)
-
-    @staticmethod
-    def _recall_single_hit(labels: List[str], predictions: List[str]) -> float:
-        """
-        Measures how many times a label is present in at least one prediction.
-        If the same label is found in multiple predictions it is only counted once.
-        """
-        if len(labels) == 0:
-            return 0.0
-
-        # In Recall Single Hit we only consider if a label is present in at least one prediction.
-        # No need to count multiple occurrences of the same label in different predictions
-        retrieved_labels = {l for l, p in itertools.product(labels, predictions) if l in p}
-        return len(retrieved_labels) / len(labels)
-
-    @staticmethod
-    def _recall_multi_hit(labels: List[str], predictions: List[str]) -> float:
-        """
-        Measures how many times a label is present in at least one or more predictions.
-        """
-        if len(labels) == 0:
-            return 0.0
-
-        correct_retrievals = 0
-        for label, prediction in itertools.product(labels, predictions):
-            if label in prediction:
-                correct_retrievals += 1
-
-        return correct_retrievals / len(labels)
-
-    @staticmethod
-    def _mrr(labels: List[str], predictions: List[str]) -> float:
-        """
-        Measures the mean reciprocal rank of times a label is present in at least one or more predictions.
-        """
-        if len(labels) == 0:
-            return 0.0
-
-        mrr_sum = 0.0
-        for label in labels:
-            for rank, prediction in enumerate(predictions):
-                if label in prediction:
-                    mrr_sum += 1 / (rank + 1)
-                    break
-
-        return mrr_sum / len(labels)
diff --git a/test/components/evaluators/test_sas_evaluator.py b/test/components/evaluators/test_sas_evaluator.py
deleted file mode 100644
index 2b9f2f2ea..000000000
--- a/test/components/evaluators/test_sas_evaluator.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import pytest
-
-from haystack.components.evaluators import SASEvaluator
-from haystack.utils.device import ComponentDevice
-
-
-class TestSASEvaluator:
-    def test_init_default(self, monkeypatch):
-        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
-        evaluator = SASEvaluator()
-
-        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-        assert evaluator._batch_size == 32
-        assert evaluator._device is None
-        assert evaluator._token.resolve_value() == "fake-token"
-
-    def test_to_dict(self, monkeypatch):
-        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
-
-        evaluator = SASEvaluator(device=ComponentDevice.from_str("cuda:0"))
-
-        expected_dict = {
-            "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
-            "init_parameters": {
-                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-                "batch_size": 32,
-                "device": {"type": "single", "device": "cuda:0"},
-                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
-            },
-        }
-        assert evaluator.to_dict() == expected_dict
-
-    def test_from_dict(self, monkeypatch):
-        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
-        evaluator = SASEvaluator.from_dict(
-            {
-                "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
-                "init_parameters": {
-                    "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-                    "batch_size": 32,
-                    "device": {"type": "single", "device": "cuda:0"},
-                    "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
-                },
-            }
-        )
-
-        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-        assert evaluator._batch_size == 32
-        assert evaluator._device.to_torch_str() == "cuda:0"
-        assert evaluator._token.resolve_value() == "fake-token"
-
-    def test_run_with_empty_inputs(self):
-        evaluator = SASEvaluator()
-        result = evaluator.run(labels=[], predictions=[])
-        assert len(result) == 2
-        assert result["sas"] == 0.0
-        assert result["scores"] == [0.0]
-
-    def test_run_with_different_lengths(self):
-        evaluator = SASEvaluator()
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        with pytest.raises(ValueError):
-            evaluator.run(labels=labels, predictions=predictions)
-
-    def test_run_not_warmed_up(self):
-        evaluator = SASEvaluator()
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        with pytest.raises(RuntimeError):
-            evaluator.run(labels=labels, predictions=predictions)
-
-    @pytest.mark.integration
-    def test_run_with_matching_predictions(self):
-        evaluator = SASEvaluator()
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        evaluator.warm_up()
-        result = evaluator.run(labels=labels, predictions=predictions)
-
-        assert len(result) == 2
-        assert result["sas"] == pytest.approx(1.0)
-        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
-
-    @pytest.mark.integration
-    def test_run_with_single_prediction(self):
-        evaluator = SASEvaluator()
-
-        labels = ["US $2.3 billion"]
-        evaluator.warm_up()
-        result = evaluator.run(labels=labels, predictions=["A construction budget of US $2.3 billion"])
-        assert len(result) == 2
-        assert result["sas"] == pytest.approx(0.689089, abs=1e-5)
-        assert result["scores"] == pytest.approx([0.689089], abs=1e-5)
-
-    @pytest.mark.integration
-    def test_run_with_mismatched_predictions(self):
-        evaluator = SASEvaluator()
-        labels = [
-            "US $2.3 billion",
-            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
-            "Japan was transformed into a modernized world power after the Meiji Restoration.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        evaluator.warm_up()
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 2
-        assert result["sas"] == pytest.approx(0.8227189)
-        assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)
-
-    @pytest.mark.integration
-    def test_run_with_bi_encoder_model(self):
-        evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        evaluator.warm_up()
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 2
-        assert result["sas"] == pytest.approx(1.0)
-        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
-
-    @pytest.mark.integration
-    def test_run_with_cross_encoder_model(self):
-        evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        evaluator.warm_up()
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 2
-        assert result["sas"] == pytest.approx(0.999967, abs=1e-5)
-        assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)
diff --git a/test/components/evaluators/test_statistical_evaluator.py b/test/components/evaluators/test_statistical_evaluator.py
deleted file mode 100644
index 51efb1e98..000000000
--- a/test/components/evaluators/test_statistical_evaluator.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import pytest
-
-from haystack.components.evaluators import StatisticalEvaluator, StatisticalMetric
-
-
-class TestStatisticalEvaluator:
-    def test_init_default(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-        assert evaluator._metric == StatisticalMetric.F1
-
-    def test_init_with_string(self):
-        evaluator = StatisticalEvaluator(metric="exact_match")
-        assert evaluator._metric == StatisticalMetric.EM
-
-    def test_to_dict(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-
-        expected_dict = {
-            "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
-            "init_parameters": {"metric": "f1"},
-        }
-        assert evaluator.to_dict() == expected_dict
-
-    def test_from_dict(self):
-        evaluator = StatisticalEvaluator.from_dict(
-            {
-                "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
-                "init_parameters": {"metric": "f1"},
-            }
-        )
-
-        assert evaluator._metric == StatisticalMetric.F1
-
-
-class TestStatisticalEvaluatorF1:
-    def test_run_with_empty_inputs(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-        result = evaluator.run(labels=[], predictions=[])
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-    def test_run_with_different_lengths(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        with pytest.raises(ValueError):
-            evaluator.run(labels=labels, predictions=predictions)
-
-    def test_run_with_matching_predictions(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        result = evaluator.run(labels=labels, predictions=predictions)
-
-        assert len(result) == 1
-        assert result["result"] == 1.0
-
-    def test_run_with_single_prediction(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-
-        result = evaluator.run(labels=["Source"], predictions=["Open Source"])
-        assert len(result) == 1
-        assert result["result"] == pytest.approx(2 / 3)
-
-    def test_run_with_mismatched_predictions(self):
-        labels = ["Source", "HaystackAI"]
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
-        predictions = ["Open Source", "HaystackAI"]
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == pytest.approx(5 / 6)
-
-
-class TestStatisticalEvaluatorExactMatch:
-    def test_run_with_empty_inputs(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
-        result = evaluator.run(predictions=[], labels=[])
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-    def test_run_with_different_lengths(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
-        labels = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-        ]
-        predictions = [
-            "A construction budget of US $2.3 billion",
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
-        ]
-        with pytest.raises(ValueError):
-            evaluator.run(labels=labels, predictions=predictions)
-
-    def test_run_with_matching_predictions(self):
-        labels = ["OpenSource", "HaystackAI", "LLMs"]
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        result = evaluator.run(labels=labels, predictions=predictions)
-
-        assert len(result) == 1
-        assert result["result"] == 1.0
-
-    def test_run_with_single_prediction(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
-        result = evaluator.run(labels=["OpenSource"], predictions=["OpenSource"])
-        assert len(result) == 1
-        assert result["result"] == 1.0
-
-    def test_run_with_mismatched_predictions(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
-        labels = ["Source", "HaystackAI", "LLMs"]
-        predictions = ["OpenSource", "HaystackAI", "LLMs"]
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 2 / 3
-
-
-class TestStatisticalEvaluatorRecallSingleHit:
-    def test_run(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
-        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
-        predictions = [
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Eiffel Tower max height is 330 meters.",
-            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
-            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
-        ]
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 2 / 4
-
-    def test_run_with_empty_labels(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
-        predictions = [
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Eiffel Tower max height is 330 meters.",
-            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
-            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
-        ]
-        result = evaluator.run(labels=[], predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-    def test_run_with_empty_predictions(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
-        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
-        result = evaluator.run(labels=labels, predictions=[])
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-
-class TestStatisticalEvaluatorRecallMultiHit:
-    def test_run(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
-        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
-        predictions = [
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Eiffel Tower max height is 330 meters.",
-            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
-            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
-        ]
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 0.75
-
-    def test_run_with_empty_labels(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
-        predictions = [
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Eiffel Tower max height is 330 meters.",
-            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
-            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
-        ]
-        result = evaluator.run(labels=[], predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-    def test_run_with_empty_predictions(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
-        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
-        result = evaluator.run(labels=labels, predictions=[])
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-
-class TestStatisticalEvaluatorMRR:
-    def test_run(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
-        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
-        predictions = [
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Eiffel Tower max height is 330 meters.",
-            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
-            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
-        ]
-        result = evaluator.run(labels=labels, predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 1 / 3
-
-    def test_run_with_empty_labels(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
-        predictions = [
-            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
-            "The Eiffel Tower max height is 330 meters.",
-            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
-            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
-        ]
-        result = evaluator.run(labels=[], predictions=predictions)
-        assert len(result) == 1
-        assert result["result"] == 0.0
-
-    def test_run_with_empty_predictions(self):
-        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
-        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
-        result = evaluator.run(labels=labels, predictions=[])
-        assert len(result) == 1
-        assert result["result"] == 0.0