haystack/test/evaluation/test_eval_sas.py

import pytest

from haystack import Pipeline
from haystack.dataclasses import GeneratedAnswer
from haystack.evaluation.eval import EvaluationResult


class TestSAS:
    def create_evaluation_result(self, predictions, labels):
        """
        Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the
         Semantic Answer Similarity (SAS) Metric.
        """
        runnable = Pipeline()
        inputs = []
        outputs = [
            {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
            for pred in predictions
        ]
        expected_outputs = [
            {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
            for label in labels
        ]
        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
        return evaluation_result

    def test_sas_empty_inputs(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with empty inputs.
        """
        runnable = Pipeline()
        inputs = []
        outputs = [
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
        ]
        expected_outputs = [
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
        ]
        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
        # Expecting 0% SAS for empty inputs
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == 0.0
        assert sas_result["scores"] == [0.0]

    def test_calculate_sas_with_different_lengths(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with default parameters.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        with pytest.raises(ValueError, match="The number of predictions and labels must be the same."):
            evaluation_result._calculate_sas(
                output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
            )

    @pytest.mark.integration
    def test_sas_same_inputs(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with default parameters.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_single_word(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with single-word inputs.
        """
        predictions = ["A construction budget of US $2.3 billion"]
        labels = ["US $2.3 billion"]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(0.689089, abs=1e-5)
        assert sas_result["scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    def test_sas_negative_case(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with deliberately mismatched predictions and labels.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(0.8227189)
        assert sas_result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    def test_sas_ignore_case(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring case sensitivity.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 BILLION",
            "The EIFFEL TOWER, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The MEIJI RESTORATION in 1868 transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # SAS after case ignoring
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", ignore_case=True
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_ignore_punctuation(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring punctuation.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower completed in 1889 symbolizes Paris's cultural magnificence",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # SAS after ignoring punctuation
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            ignore_punctuation=True,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_ignore_numbers(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring numbers.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # SAS after ignoring numbers
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            ignore_numbers=True,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_regex_ignore(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring specific regex patterns.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # Ignore numeric patterns
        regex_to_ignore = [r"\d+"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_multiple_ignore_regex(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with multiple ignoring parameters.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US #10.3 billion",
            "The Eiffel Tower!!, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The **Meiji Restoration**, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # Ignore numeric patterns and punctuation excluding whitespaces
        regex_to_ignore = [r"\d+", r"[^\w\s]"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_multiple_ignore_combination(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with multiple ignoring parameters combined.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US #10.3 BILLION",
            "The EIFFEL TOWER!!, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The **MEIJI RESTORATION**, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # Ignore only special characters using regex
        regex_to_ignore = [r"[^\w\s\d]+"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            ignore_numbers=True,
            ignore_punctuation=True,
            ignore_case=True,
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_bi_encoder(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score using a Bi-Encoder model.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/all-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_cross_encoder(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score using a Cross Encoder model.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="cross-encoder/ms-marco-MiniLM-L-6-v2"
        )

        assert sas_result["sas"] == pytest.approx(0.999967, abs=1e-5)
        assert sas_result["scores"] == pytest.approx(
            [0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5
        )