Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-07 20:46:31 +00:00

Remove all evaluator components (#7053)

This commit is contained in:
parent f3be576b5c
commit 8ca4bf405b
@@ -1,4 +0,0 @@
from .sas_evaluator import SASEvaluator
from .statistical_evaluator import StatisticalEvaluator, StatisticalMetric

__all__ = ["SASEvaluator", "StatisticalEvaluator", "StatisticalMetric"]
@@ -1,144 +0,0 @@
from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, expit
from haystack.utils.auth import Secret, deserialize_secrets_inplace

with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=2.2.0\"'") as sas_import:
    from sentence_transformers import CrossEncoder, SentenceTransformer, util
    from transformers import AutoConfig


@component
class SASEvaluator:
    """
    SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of labels.
    It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.

    The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
    Bi-Encoder or a Cross-Encoder. The choice of the model is based on the `model` parameter.
    """

    def __init__(
        self,
        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        batch_size: int = 32,
        device: Optional[ComponentDevice] = None,
        token: Secret = Secret.from_env_var("HF_API_TOKEN", strict=False),
    ):
        """
        Creates a new instance of SASEvaluator.

        :param model: SentenceTransformers semantic textual similarity model, should be a path or string pointing to
            a downloadable model.
        :param batch_size: Number of prediction-label pairs to encode at once.
        :param device: The device on which the model is loaded. If `None`, the default device is automatically
            selected.
        :param token: The Hugging Face token for HTTP bearer authorization.
            You can find your HF token at https://huggingface.co/settings/tokens.
        """
        sas_import.check()

        self._model = model
        self._batch_size = batch_size
        self._device = device
        self._token = token
        self._similarity_model = None

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(
            self,
            model=self._model,
            batch_size=self._batch_size,
            device=self._device.to_dict() if self._device else None,
            token=self._token.to_dict() if self._token else None,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SASEvaluator":
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        if device := data.get("init_parameters", {}).get("device"):
            data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Load the model used for evaluation.
        """
        token = self._token.resolve_value() if self._token else None
        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
        cross_encoder_used = False
        if config.architectures:
            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
        device = ComponentDevice.resolve_device(self._device).to_torch_str()
        # Based on the model's architecture we load either a Bi-Encoder or a Cross-Encoder.
        # The similarity computation differs between the two approaches.
        if cross_encoder_used:
            self._similarity_model = CrossEncoder(
                self._model,
                device=device,
                tokenizer_args={"use_auth_token": token},
                automodel_args={"use_auth_token": token},
            )
        else:
            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)

    @component.output_types(sas=float, scores=List[float])
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predictions and a list
        of labels. Both must be lists of strings of the same length.

        :param predictions: List of predictions.
        :param labels: List of labels against which the predictions are compared.
        :returns: A dictionary with the following outputs:
            * `sas` - Cumulative SAS score for the entire dataset.
            * `scores` - A list of similarity scores for each prediction-label pair.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            return {"sas": 0.0, "scores": [0.0]}

        if not self._similarity_model:
            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
            raise RuntimeError(msg)

        if isinstance(self._similarity_model, CrossEncoder):
            # For Cross-Encoders we create a list of prediction-label pairs
            sentence_pairs = [[pred, label] for pred, label in zip(predictions, labels)]
            similarity_scores = self._similarity_model.predict(
                sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True
            )

            # Cross-Encoders do not always return normalized scores,
            # so we apply the sigmoid (expit) if any score is larger than 1
            if (similarity_scores > 1).any():
                similarity_scores = expit(similarity_scores)

            # Convert the scores from a numpy array to a list of floats
            similarity_scores = similarity_scores.tolist()

        else:
            # For Bi-Encoders we create embeddings separately for predictions and labels
            predictions_embeddings = self._similarity_model.encode(
                predictions, batch_size=self._batch_size, convert_to_tensor=True
            )
            label_embeddings = self._similarity_model.encode(
                labels, batch_size=self._batch_size, convert_to_tensor=True
            )

            # Compute cosine similarities
            scores = util.cos_sim(predictions_embeddings, label_embeddings)

            # cos_sim computes the cosine similarity between all pairs of vectors in predictions_embeddings
            # and label_embeddings, returning a matrix with shape (len(predictions), len(labels)).
            # We only need the diagonal, i.e. each prediction scored against its own label.
            similarity_scores = [scores[i][i].item() for i in range(len(predictions))]

        sas_score = np_mean(similarity_scores)

        return {"sas": sas_score, "scores": similarity_scores}
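For reference, here is a minimal usage sketch of the SASEvaluator removed above, mirroring how the tests further down exercise it. It assumes a haystack-ai release that still ships the component (i.e. a version prior to this commit) and that scikit-learn and sentence-transformers are installed.

# Minimal usage sketch (assumes a haystack-ai version prior to this commit).
from haystack.components.evaluators import SASEvaluator

evaluator = SASEvaluator(model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
evaluator.warm_up()  # loads the Bi-Encoder or Cross-Encoder

result = evaluator.run(
    labels=["US $2.3 billion"],
    predictions=["A construction budget of US $2.3 billion"],
)
print(result["sas"])     # mean similarity over all prediction-label pairs
print(result["scores"])  # one similarity score per pair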
@@ -1,171 +0,0 @@
import collections
import itertools
from enum import Enum
from typing import Any, Dict, List, Union

from numpy import array as np_array
from numpy import mean as np_mean

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


class StatisticalMetric(Enum):
    """
    Metrics supported by the StatisticalEvaluator.
    """

    F1 = "f1"
    EM = "exact_match"
    RECALL_SINGLE_HIT = "recall_single_hit"
    RECALL_MULTI_HIT = "recall_multi_hit"
    MRR = "mean_reciprocal_rank"

    @classmethod
    def from_str(cls, metric: str) -> "StatisticalMetric":
        mapping = {e.value: e for e in StatisticalMetric}
        metric_ = mapping.get(metric)
        if metric_ is None:
            raise ValueError(f"Unknown statistical metric '{metric}'")
        return metric_


@component
class StatisticalEvaluator:
    """
    StatisticalEvaluator is a component that evaluates the performance of a model based on statistical metrics.
    It's usually used in QA and Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the
    generated answers.

    The supported metrics are:
    - F1: Measures word overlap between predictions and labels.
    - Exact Match: Measures the proportion of cases where prediction is identical to the expected label.
    - Recall (single hit): Measures how many labels are present in at least one prediction.
    - Recall (multi hit): Measures how many times the labels are present across all predictions.
    - Mean Reciprocal Rank: Measures the reciprocal rank of the first prediction containing each label,
      averaged over all labels.
    """

    def __init__(self, metric: Union[str, StatisticalMetric]):
        """
        Creates a new instance of StatisticalEvaluator.

        :param metric: Metric to use for evaluation in this component, either a `StatisticalMetric` or its
            string value.
        """
        if isinstance(metric, str):
            metric = StatisticalMetric.from_str(metric)
        self._metric = metric

        self._metric_function = {
            StatisticalMetric.F1: self._f1,
            StatisticalMetric.EM: self._exact_match,
            StatisticalMetric.RECALL_SINGLE_HIT: self._recall_single_hit,
            StatisticalMetric.RECALL_MULTI_HIT: self._recall_multi_hit,
            StatisticalMetric.MRR: self._mrr,
        }[self._metric]

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(self, metric=self._metric.value)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StatisticalEvaluator":
        data["init_parameters"]["metric"] = StatisticalMetric(data["init_parameters"]["metric"])
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
        """
        Run the StatisticalEvaluator to compute the metric between a list of predictions and a list of labels.
        Both must be lists of strings of the same length.

        :param predictions: List of predictions.
        :param labels: List of labels against which the predictions are compared.
        :returns: A dictionary with the following outputs:
            * `result` - Calculated result of the chosen metric.
        """
        return {"result": self._metric_function(labels, predictions)}

    @staticmethod
    def _f1(labels: List[str], predictions: List[str]) -> float:
        """
        Measure word overlap between predictions and labels.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels
            # are of equal length.
            return 0.0

        scores: List[float] = []
        tokenized_predictions = [pred.split() for pred in predictions]
        tokenized_labels = [label.split() for label in labels]
        for label_tokens, prediction_tokens in zip(tokenized_labels, tokenized_predictions):
            common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
            num_same = sum(common.values())
            if len(label_tokens) == 0 or len(prediction_tokens) == 0:
                # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
                return int(label_tokens == prediction_tokens)
            if num_same == 0:
                return 0
            precision = 1.0 * num_same / len(prediction_tokens)
            recall = 1.0 * num_same / len(label_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            scores.append(f1)

        return np_mean(scores)

    @staticmethod
    def _exact_match(labels: List[str], predictions: List[str]) -> float:
        """
        Measure the proportion of cases where prediction is identical to the expected label.
        """
        if len(labels) != len(predictions):
            raise ValueError("The number of predictions and labels must be the same.")

        if len(predictions) == 0:
            # We expect callers of this function to have already checked that predictions and labels
            # are of equal length.
            return 0.0
        score_list = np_array(predictions) == np_array(labels)
        return np_mean(score_list)

    @staticmethod
    def _recall_single_hit(labels: List[str], predictions: List[str]) -> float:
        """
        Measures how many times a label is present in at least one prediction.
        If the same label is found in multiple predictions it is only counted once.
        """
        if len(labels) == 0:
            return 0.0

        # In Recall Single Hit we only consider whether a label is present in at least one prediction.
        # There is no need to count multiple occurrences of the same label in different predictions.
        retrieved_labels = {l for l, p in itertools.product(labels, predictions) if l in p}
        return len(retrieved_labels) / len(labels)

    @staticmethod
    def _recall_multi_hit(labels: List[str], predictions: List[str]) -> float:
        """
        Measures how many times the labels are present across all predictions, counting repeated matches.
        """
        if len(labels) == 0:
            return 0.0

        correct_retrievals = 0
        for label, prediction in itertools.product(labels, predictions):
            if label in prediction:
                correct_retrievals += 1

        return correct_retrievals / len(labels)

    @staticmethod
    def _mrr(labels: List[str], predictions: List[str]) -> float:
        """
        Measures the mean reciprocal rank of the first prediction in which each label is present.
        """
        if len(labels) == 0:
            return 0.0

        mrr_sum = 0.0
        for label in labels:
            for rank, prediction in enumerate(predictions):
                if label in prediction:
                    mrr_sum += 1 / (rank + 1)
                    break

        return mrr_sum / len(labels)
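To make the metric logic above concrete, here is a small standalone sketch that mirrors the `_f1` and `_mrr` computations; it does not import the removed component, and the inputs are taken from the tests below.

# Standalone sketch mirroring _f1 and _mrr above; not part of the removed module.
import collections

# F1 for a single pair: word overlap between "Source" (label) and "Open Source" (prediction).
label_tokens = "Source".split()            # ["Source"]
prediction_tokens = "Open Source".split()  # ["Open", "Source"]
common = collections.Counter(label_tokens) & collections.Counter(prediction_tokens)
num_same = sum(common.values())            # 1 shared token
precision = num_same / len(prediction_tokens)  # 1/2
recall = num_same / len(label_tokens)          # 1/1
f1 = 2 * precision * recall / (precision + recall)
print(f1)  # 0.666..., i.e. the 2/3 expected by test_run_with_single_prediction

# MRR: each label contributes the reciprocal rank of the first prediction containing it.
labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
predictions = [
    "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
    "The Eiffel Tower max height is 330 meters.",
    "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
    "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
]
mrr_sum = 0.0
for label in labels:
    for rank, prediction in enumerate(predictions):
        if label in prediction:
            mrr_sum += 1 / (rank + 1)
            break
print(mrr_sum / len(labels))  # (1 + 1/3) / 4 = 1/3, matching TestStatisticalEvaluatorMRR.test_run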
@@ -1,174 +0,0 @@
import pytest

from haystack.components.evaluators import SASEvaluator
from haystack.utils.device import ComponentDevice


class TestSASEvaluator:
    def test_init_default(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator()

        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device is None
        assert evaluator._token.resolve_value() == "fake-token"

    def test_to_dict(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")

        evaluator = SASEvaluator(device=ComponentDevice.from_str("cuda:0"))

        expected_dict = {
            "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
            "init_parameters": {
                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                "batch_size": 32,
                "device": {"type": "single", "device": "cuda:0"},
                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
            },
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self, monkeypatch):
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
                "init_parameters": {
                    "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                    "batch_size": 32,
                    "device": {"type": "single", "device": "cuda:0"},
                    "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
                },
            }
        )

        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device.to_torch_str() == "cuda:0"
        assert evaluator._token.resolve_value() == "fake-token"

    def test_run_with_empty_inputs(self):
        evaluator = SASEvaluator()
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 2
        assert result["sas"] == 0.0
        assert result["scores"] == [0.0]

    def test_run_with_different_lengths(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_not_warmed_up(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(RuntimeError):
            evaluator.run(labels=labels, predictions=predictions)

    @pytest.mark.integration
    def test_run_with_matching_predictions(self):
        evaluator = SASEvaluator()
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)

        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_single_prediction(self):
        evaluator = SASEvaluator()

        labels = ["US $2.3 billion"]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=["A construction budget of US $2.3 billion"])
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.689089, abs=1e-5)
        assert result["scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_mismatched_predictions(self):
        evaluator = SASEvaluator()
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.8227189)
        assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_bi_encoder_model(self):
        evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_cross_encoder_model(self):
        evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator.warm_up()
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.999967, abs=1e-5)
        assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)
@@ -1,225 +0,0 @@
import pytest

from haystack.components.evaluators import StatisticalEvaluator, StatisticalMetric


class TestStatisticalEvaluator:
    def test_init_default(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        assert evaluator._metric == StatisticalMetric.F1

    def test_init_with_string(self):
        evaluator = StatisticalEvaluator(metric="exact_match")
        assert evaluator._metric == StatisticalMetric.EM

    def test_to_dict(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)

        expected_dict = {
            "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
            "init_parameters": {"metric": "f1"},
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self):
        evaluator = StatisticalEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.statistical_evaluator.StatisticalEvaluator",
                "init_parameters": {"metric": "f1"},
            }
        )

        assert evaluator._metric == StatisticalMetric.F1


class TestStatisticalEvaluatorF1:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)

        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)

        result = evaluator.run(labels=["Source"], predictions=["Open Source"])
        assert len(result) == 1
        assert result["result"] == pytest.approx(2 / 3)

    def test_run_with_mismatched_predictions(self):
        labels = ["Source", "HaystackAI"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        predictions = ["Open Source", "HaystackAI"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == pytest.approx(5 / 6)


class TestStatisticalEvaluatorExactMatch:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(predictions=[], labels=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)

        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(labels=["OpenSource"], predictions=["OpenSource"])
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_mismatched_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = ["Source", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 2 / 3


class TestStatisticalEvaluatorRecallSingleHit:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 2 / 4

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_SINGLE_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0


class TestStatisticalEvaluatorRecallMultiHit:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.75

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.RECALL_MULTI_HIT)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0


class TestStatisticalEvaluatorMRR:
    def test_run(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1 / 3

    def test_run_with_empty_labels(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        predictions = [
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Eiffel Tower max height is 330 meters.",
            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
        ]
        result = evaluator.run(labels=[], predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_empty_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
        result = evaluator.run(labels=labels, predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0