haystack/test/components/eval/test_sas_evaluator.py

293 lines
14 KiB
Python
Raw Normal View History

import pytest
from haystack.components.eval import SASEvaluator
from haystack.utils.device import ComponentDevice
class TestSASEvaluator:
def test_init_default(self, monkeypatch):
monkeypatch.setenv("HF_API_TOKEN", "fake-token")
labels = ["label1", "label2", "label3"]
evaluator = SASEvaluator(labels=labels)
assert evaluator._labels == labels
assert evaluator._regexes_to_ignore is None
assert evaluator._ignore_case is False
assert evaluator._ignore_punctuation is False
assert evaluator._ignore_numbers is False
assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
assert evaluator._batch_size == 32
assert evaluator._device is None
assert evaluator._token.resolve_value() == "fake-token"
def test_to_dict(self, monkeypatch):
monkeypatch.setenv("HF_API_TOKEN", "fake-token")
labels = ["label1", "label2", "label3"]
evaluator = SASEvaluator(labels=labels, device=ComponentDevice.from_str("cuda:0"))
expected_dict = {
"type": "haystack.components.eval.sas_evaluator.SASEvaluator",
"init_parameters": {
"labels": labels,
"regexes_to_ignore": None,
"ignore_case": False,
"ignore_punctuation": False,
"ignore_numbers": False,
"model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
"batch_size": 32,
"device": {"type": "single", "device": "cuda:0"},
"token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
},
}
assert evaluator.to_dict() == expected_dict
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("HF_API_TOKEN", "fake-token")
evaluator = SASEvaluator.from_dict(
{
"type": "haystack.components.eval.sas_evaluator.SASEvaluator",
"init_parameters": {
"labels": ["label1", "label2", "label3"],
"regexes_to_ignore": None,
"ignore_case": False,
"ignore_punctuation": False,
"ignore_numbers": False,
"model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
"batch_size": 32,
"device": {"type": "single", "device": "cuda:0"},
"token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
},
}
)
assert evaluator._labels == ["label1", "label2", "label3"]
assert evaluator._regexes_to_ignore is None
assert evaluator._ignore_case is False
assert evaluator._ignore_punctuation is False
assert evaluator._ignore_numbers is False
assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
assert evaluator._batch_size == 32
assert evaluator._device.to_torch_str() == "cuda:0"
assert evaluator._token.resolve_value() == "fake-token"
@pytest.mark.integration
def test_run_with_empty_inputs(self):
evaluator = SASEvaluator(labels=[])
result = evaluator.run(predictions=[])
assert len(result) == 2
assert result["sas"] == 0.0
assert result["scores"] == [0.0]
@pytest.mark.integration
def test_run_with_different_lengths(self):
labels = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
]
evaluator = SASEvaluator(labels=labels)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
with pytest.raises(ValueError):
evaluator.run(predictions)
@pytest.mark.integration
def test_run_with_matching_predictions(self):
labels = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_single_prediction(self):
labels = ["US $2.3 billion"]
evaluator = SASEvaluator(labels=labels)
result = evaluator.run(predictions=["A construction budget of US $2.3 billion"])
assert len(result) == 2
assert result["sas"] == pytest.approx(0.689089, abs=1e-5)
assert result["scores"] == pytest.approx([0.689089], abs=1e-5)
@pytest.mark.integration
def test_run_with_mismatched_predictions(self):
labels = [
"US $2.3 billion",
"Paris's cultural magnificence is symbolized by the Eiffel Tower",
"Japan was transformed into a modernized world power after the Meiji Restoration.",
]
evaluator = SASEvaluator(labels=labels)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(0.8227189)
assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)
@pytest.mark.integration
def test_run_with_ignore_case(self):
labels = [
"A construction budget of US $2.3 BILLION",
"The EIFFEL TOWER, completed in 1889, symbolizes Paris's cultural magnificence.",
"The MEIJI RESTORATION in 1868 transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels, ignore_case=True)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_ignore_punctuation(self):
labels = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower completed in 1889 symbolizes Paris's cultural magnificence",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power",
]
evaluator = SASEvaluator(labels=labels, ignore_punctuation=True)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_ignore_numbers(self):
labels = [
"A construction budget of US $10.3 billion",
"The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels, ignore_numbers=True)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_regex_to_ignore(self):
labels = [
"A construction budget of US $10.3 billion",
"The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels, regexes_to_ignore=[r"\d+"])
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_multiple_regex_to_ignore(self):
labels = [
"A construction budget of US $10.3 billion",
"The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels, regexes_to_ignore=[r"\d+", r"[^\w\s]"])
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_multiple_ignore_parameters(self):
labels = [
"A construction budget of US $10.3 billion",
"The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(
labels=labels,
ignore_numbers=True,
ignore_punctuation=True,
ignore_case=True,
regexes_to_ignore=[r"[^\w\s\d]+"],
)
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_bi_encoder_model(self):
labels = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels, model="sentence-transformers/all-mpnet-base-v2")
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(1.0)
assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
def test_run_with_cross_encoder_model(self):
labels = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
evaluator = SASEvaluator(labels=labels, model="cross-encoder/ms-marco-MiniLM-L-6-v2")
predictions = [
"A construction budget of US $2.3 billion",
"The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
"The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
]
result = evaluator.run(predictions=predictions)
assert len(result) == 2
assert result["sas"] == pytest.approx(0.999967, abs=1e-5)
assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)