mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-08 05:43:41 +00:00
* Add SAS metric * Add release notes * Round similarity scores for precision consistency * Add tolerance to tests * Update haystack/evaluation/eval.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Add types for preprocess_text; Add additional types for f1 and em methods --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
348 lines
15 KiB
Python
348 lines
15 KiB
Python
import pytest
|
|
|
|
from haystack import Pipeline
|
|
from haystack.dataclasses import GeneratedAnswer
|
|
from haystack.evaluation.eval import EvaluationResult
|
|
|
|
|
|
class TestSAS:
    """
    Tests for the Semantic Answer Similarity (SAS) metric computed by `EvaluationResult._calculate_sas`.

    Tests marked with `integration` pass a real model name to `_calculate_sas` and compare against similarity
    scores, so float comparisons go through `pytest.approx`.
    """

    def create_evaluation_result(self, predictions, labels):
        """
        Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the
        Semantic Answer Similarity (SAS) Metric.

        :param predictions: Predicted answer strings. Each one is wrapped in a `GeneratedAnswer` and stored under
            `"answer_builder" -> "answers"` in the pipeline outputs.
        :param labels: Expected answer strings, wrapped the same way and stored in the expected outputs.
        :returns: An `EvaluationResult` built from an empty `Pipeline`, no inputs, and the wrapped outputs.
        """
        runnable = Pipeline()
        inputs = []
        outputs = [
            {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
            for pred in predictions
        ]
        expected_outputs = [
            {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
            for label in labels
        ]
        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
        return evaluation_result

    def test_sas_empty_inputs(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with empty inputs.
        """
        runnable = Pipeline()
        inputs = []
        outputs = [
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
        ]
        expected_outputs = [
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
        ]
        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
        # Expecting 0% SAS for empty inputs
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == 0.0
        assert sas_result["scores"] == [0.0]

    def test_calculate_sas_with_different_lengths(self):
        """
        Test that calculating the Semantic Answer Similarity (SAS) Score raises a ValueError when the number of
        predictions differs from the number of labels.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        # One label fewer than predictions to trigger the length check
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        with pytest.raises(ValueError, match="The number of predictions and labels must be the same."):
            evaluation_result._calculate_sas(
                output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
            )

    @pytest.mark.integration
    def test_sas_same_inputs(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with default parameters and identical
        predictions and labels.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_single_word(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with a single prediction/label pair in which
        the label is a short fragment of the prediction.
        """
        predictions = ["A construction budget of US $2.3 billion"]
        labels = ["US $2.3 billion"]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(0.689089, abs=1e-5)
        assert sas_result["scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    def test_sas_negative_case(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with deliberately mismatched predictions and labels.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        # Use the same absolute tolerance as the per-item scores below; the default relative tolerance of
        # pytest.approx is tighter than the precision the expected values are pinned at.
        assert sas_result["sas"] == pytest.approx(0.8227189, abs=1e-5)
        assert sas_result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    def test_sas_ignore_case(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring case sensitivity.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 BILLION",
            "The EIFFEL TOWER, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The MEIJI RESTORATION in 1868 transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # SAS after case ignoring
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", ignore_case=True
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_ignore_punctuation(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring punctuation.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower completed in 1889 symbolizes Paris's cultural magnificence",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # SAS after ignoring punctuation
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            ignore_punctuation=True,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_ignore_numbers(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring numbers.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # SAS after ignoring numbers
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            ignore_numbers=True,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_regex_ignore(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring specific regex patterns.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # Ignore numeric patterns
        regex_to_ignore = [r"\d+"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_multiple_ignore_regex(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with multiple regex patterns to ignore.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US #10.3 billion",
            "The Eiffel Tower!!, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The **Meiji Restoration**, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # Ignore numeric patterns and punctuation excluding whitespaces
        regex_to_ignore = [r"\d+", r"[^\w\s]"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_multiple_ignore_combination(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with multiple ignoring parameters combined.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US #10.3 BILLION",
            "The EIFFEL TOWER!!, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The **MEIJI RESTORATION**, in 1989, transformed Japan into a modernized world power.",
        ]

        evaluation_result = self.create_evaluation_result(predictions, labels)
        # Ignore only special characters using regex
        regex_to_ignore = [r"[^\w\s\d]+"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            ignore_numbers=True,
            ignore_punctuation=True,
            ignore_case=True,
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_bi_encoder(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score using a Bi-Encoder model.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/all-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_cross_encoder(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score using a Cross Encoder model.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="cross-encoder/ms-marco-MiniLM-L-6-v2"
        )

        # Identical inputs score just below 1.0 with this model, hence the explicit expected values
        assert sas_result["sas"] == pytest.approx(0.999967, abs=1e-5)
        assert sas_result["scores"] == pytest.approx(
            [0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5
        )