haystack/test/evaluation/test_eval_sas.py
Ashwin Mathur 393a7993c3
feat: Add Semantic Answer Similarity metric (#6877)
* Add SAS metric

* Add release notes

* Round similarity scores for precision consistency

* Add tolerance to tests

* Update haystack/evaluation/eval.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Add types for preprocess_text; Add additional types for f1 and em methods

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
2024-02-02 17:07:52 +01:00

348 lines
15 KiB
Python

import pytest
from haystack import Pipeline
from haystack.dataclasses import GeneratedAnswer
from haystack.evaluation.eval import EvaluationResult
class TestSAS:
    """
    Tests for the Semantic Answer Similarity (SAS) metric computed by `EvaluationResult._calculate_sas`.

    Tests that compare against model-produced similarity scores are marked as integration tests.
    """

    # Model passed to `_calculate_sas` by every test that does not exercise a specific alternative model.
    DEFAULT_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    # Absolute tolerance applied when comparing against reference scores that are not exactly 1.0,
    # so that small numerical differences between environments do not fail the tests.
    ABS_TOLERANCE = 1e-5

    def create_evaluation_result(self, predictions, labels):
        """
        Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the
        Semantic Answer Similarity (SAS) Metric.

        :param predictions: Answer strings; each one is wrapped in a `GeneratedAnswer` and used as a pipeline output.
        :param labels: Answer strings; each one is wrapped in a `GeneratedAnswer` and used as an expected output.
        :return: An `EvaluationResult` built from an empty `Pipeline`, no inputs and the wrapped answers.
        """
        runnable = Pipeline()
        inputs = []
        outputs = [
            {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
            for pred in predictions
        ]
        expected_outputs = [
            {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
            for label in labels
        ]
        return EvaluationResult(runnable, inputs, outputs, expected_outputs)

    def test_sas_empty_inputs(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with empty inputs.

        Every output and expected output carries an empty list of answers.
        """
        runnable = Pipeline()
        inputs = []
        outputs = [
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
        ]
        expected_outputs = [
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
            {"answer_builder": {"answers": []}},
        ]
        evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)

        # Expecting 0% SAS for empty inputs
        sas_result = evaluation_result._calculate_sas(output_key="answers", model=self.DEFAULT_MODEL)

        assert sas_result["sas"] == 0.0
        assert sas_result["scores"] == [0.0]

    def test_calculate_sas_with_different_lengths(self):
        """
        Test that calculating the Semantic Answer Similarity (SAS) Score raises a ValueError when the number of
        predictions differs from the number of labels.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        # One label fewer than predictions.
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        with pytest.raises(ValueError, match="The number of predictions and labels must be the same."):
            evaluation_result._calculate_sas(output_key="answers", model=self.DEFAULT_MODEL)

    @pytest.mark.integration
    def test_sas_same_inputs(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with default parameters and predictions that are
        identical to the labels.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        sas_result = evaluation_result._calculate_sas(output_key="answers", model=self.DEFAULT_MODEL)

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_single_word(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with a single prediction/label pair, where the
        label is a shortened form of the prediction.
        """
        predictions = ["A construction budget of US $2.3 billion"]
        labels = ["US $2.3 billion"]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        sas_result = evaluation_result._calculate_sas(output_key="answers", model=self.DEFAULT_MODEL)

        assert sas_result["sas"] == pytest.approx(0.689089, abs=self.ABS_TOLERANCE)
        assert sas_result["scores"] == pytest.approx([0.689089], abs=self.ABS_TOLERANCE)

    @pytest.mark.integration
    def test_sas_negative_case(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with deliberately mismatched predictions and labels.

        The labels shorten or rephrase the predictions, so the expected scores are below 1.0.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        sas_result = evaluation_result._calculate_sas(output_key="answers", model=self.DEFAULT_MODEL)

        # The aggregate score gets the same absolute tolerance as the per-sample scores; the default
        # relative tolerance of pytest.approx would make this single assertion far stricter than the rest.
        assert sas_result["sas"] == pytest.approx(0.8227189, abs=self.ABS_TOLERANCE)
        assert sas_result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=self.ABS_TOLERANCE)

    @pytest.mark.integration
    def test_sas_ignore_case(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring case sensitivity.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        # Same texts as the predictions, differing only in letter case.
        labels = [
            "A construction budget of US $2.3 BILLION",
            "The EIFFEL TOWER, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The MEIJI RESTORATION in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        # SAS after case ignoring
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model=self.DEFAULT_MODEL, ignore_case=True
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_ignore_punctuation(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring punctuation.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower completed in 1889 symbolizes Paris's cultural magnificence",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        # SAS after ignoring punctuation
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model=self.DEFAULT_MODEL, ignore_punctuation=True
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_ignore_numbers(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring numbers.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        # Same texts as the predictions, differing only in the digits.
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        # SAS after ignoring numbers
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model=self.DEFAULT_MODEL, ignore_numbers=True
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_regex_ignore(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with ignoring specific regex patterns.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        # Ignore numeric patterns
        regex_to_ignore = [r"\d+"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model=self.DEFAULT_MODEL, regexes_to_ignore=regex_to_ignore
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_multiple_ignore_regex(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with multiple regex patterns to ignore.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US #10.3 billion",
            "The Eiffel Tower!!, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The **Meiji Restoration**, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        # Ignore numeric patterns and punctuation excluding whitespaces
        regex_to_ignore = [r"\d+", r"[^\w\s]"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model=self.DEFAULT_MODEL, regexes_to_ignore=regex_to_ignore
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_multiple_ignore_combination(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score with multiple ignoring parameters combined.

        The case, punctuation and number flags are all enabled together with a regex pattern.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US #10.3 BILLION",
            "The EIFFEL TOWER!!, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The **MEIJI RESTORATION**, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        # Ignore only special characters using regex
        regex_to_ignore = [r"[^\w\s\d]+"]
        sas_result = evaluation_result._calculate_sas(
            output_key="answers",
            model=self.DEFAULT_MODEL,
            ignore_numbers=True,
            ignore_punctuation=True,
            ignore_case=True,
            regexes_to_ignore=regex_to_ignore,
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_bi_encoder(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score using a Bi-Encoder model.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="sentence-transformers/all-mpnet-base-v2"
        )

        assert sas_result["sas"] == pytest.approx(1.0)
        assert sas_result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_sas_cross_encoder(self):
        """
        Test calculation of Semantic Answer Similarity (SAS) Score using a Cross Encoder model.

        The reference scores for identical predictions and labels are close to, but not exactly, 1.0.
        """
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluation_result = self.create_evaluation_result(predictions, labels)

        sas_result = evaluation_result._calculate_sas(
            output_key="answers", model="cross-encoder/ms-marco-MiniLM-L-6-v2"
        )

        assert sas_result["sas"] == pytest.approx(0.999967, abs=self.ABS_TOLERANCE)
        assert sas_result["scores"] == pytest.approx(
            [0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=self.ABS_TOLERANCE
        )