import pytest

from haystack.components.eval import SASEvaluator
from haystack.utils.device import ComponentDevice


class TestSASEvaluator:
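    """Tests for the SASEvaluator component, which scores predictions against labels by Semantic Answer Similarity (SAS)."""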

    def test_init_default(self, monkeypatch):
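        """Default constructor arguments should be stored on the evaluator's private attributes."""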
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        labels = ["label1", "label2", "label3"]
        evaluator = SASEvaluator(labels=labels)

        assert evaluator._labels == labels
        assert evaluator._regexes_to_ignore is None
        assert evaluator._ignore_case is False
        assert evaluator._ignore_punctuation is False
        assert evaluator._ignore_numbers is False
        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device is None
        assert evaluator._token.resolve_value() == "fake-token"

    def test_to_dict(self, monkeypatch):
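        """to_dict() should serialize the component type and all init parameters."""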
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")

        labels = ["label1", "label2", "label3"]
        evaluator = SASEvaluator(labels=labels, device=ComponentDevice.from_str("cuda:0"))

        expected_dict = {
            "type": "haystack.components.eval.sas_evaluator.SASEvaluator",
            "init_parameters": {
                "labels": labels,
                "regexes_to_ignore": None,
                "ignore_case": False,
                "ignore_punctuation": False,
                "ignore_numbers": False,
                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                "batch_size": 32,
                "device": {"type": "single", "device": "cuda:0"},
                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
            },
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self, monkeypatch):
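        """from_dict() should restore an evaluator equivalent to the serialized one."""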
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator.from_dict(
            {
                "type": "haystack.components.eval.sas_evaluator.SASEvaluator",
                "init_parameters": {
                    "labels": ["label1", "label2", "label3"],
                    "regexes_to_ignore": None,
                    "ignore_case": False,
                    "ignore_punctuation": False,
                    "ignore_numbers": False,
                    "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                    "batch_size": 32,
                    "device": {"type": "single", "device": "cuda:0"},
                    "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN"], "strict": False},
                },
            }
        )

        assert evaluator._labels == ["label1", "label2", "label3"]
        assert evaluator._regexes_to_ignore is None
        assert evaluator._ignore_case is False
        assert evaluator._ignore_punctuation is False
        assert evaluator._ignore_numbers is False
        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device.to_torch_str() == "cuda:0"
        assert evaluator._token.resolve_value() == "fake-token"

    @pytest.mark.integration
    def test_run_with_empty_inputs(self):
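        """Running on empty predictions should return zero scores rather than raise."""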
        evaluator = SASEvaluator(labels=[])
        result = evaluator.run(predictions=[])
        assert len(result) == 2
        assert result["sas"] == 0.0
        assert result["scores"] == [0.0]

    @pytest.mark.integration
    def test_run_with_different_lengths(self):
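        """A mismatch between the number of labels and predictions should raise a ValueError."""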
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        evaluator = SASEvaluator(labels=labels)

        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(predictions)

    @pytest.mark.integration
    def test_run_with_matching_predictions(self):
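        """Predictions identical to the labels should score a SAS of 1.0."""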
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels)
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)

        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_single_prediction(self):
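        """A single prediction should be scored against its single label."""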
        labels = ["US $2.3 billion"]
        evaluator = SASEvaluator(labels=labels)

        result = evaluator.run(predictions=["A construction budget of US $2.3 billion"])
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.689089, abs=1e-5)
        assert result["scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_mismatched_predictions(self):
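        """Semantically similar but non-identical answers should score below 1.0."""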
        labels = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]
        evaluator = SASEvaluator(labels=labels)
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.8227189)
        assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    def test_run_with_ignore_case(self):
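        """With ignore_case=True, differences in letter casing should not affect the score."""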
        labels = [
            "A construction budget of US $2.3 BILLION",
            "The EIFFEL TOWER, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The MEIJI RESTORATION in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels, ignore_case=True)
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_ignore_punctuation(self):
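        """With ignore_punctuation=True, punctuation differences should not affect the score."""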
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower completed in 1889 symbolizes Paris's cultural magnificence",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power",
        ]
        evaluator = SASEvaluator(labels=labels, ignore_punctuation=True)
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_ignore_numbers(self):
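        """With ignore_numbers=True, differing numbers should not affect the score."""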
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels, ignore_numbers=True)
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_regex_to_ignore(self):
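        """Substrings matching regexes_to_ignore should be stripped before scoring."""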
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels, regexes_to_ignore=[r"\d+"])
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_multiple_regex_to_ignore(self):
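        """All patterns in regexes_to_ignore should be applied before scoring."""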
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels, regexes_to_ignore=[r"\d+", r"[^\w\s]"])
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_multiple_ignore_parameters(self):
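        """The ignore_* flags and regexes_to_ignore should compose when used together."""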
        labels = [
            "A construction budget of US $10.3 billion",
            "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(
            labels=labels,
            ignore_numbers=True,
            ignore_punctuation=True,
            ignore_case=True,
            regexes_to_ignore=[r"[^\w\s\d]+"],
        )
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_bi_encoder_model(self):
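        """A custom bi-encoder passed via `model` should be used for scoring."""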
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels, model="sentence-transformers/all-mpnet-base-v2")
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(1.0)
        assert result["scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    def test_run_with_cross_encoder_model(self):
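        """A cross-encoder model should also be supported and score identical texts near 1.0."""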
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        evaluator = SASEvaluator(labels=labels, model="cross-encoder/ms-marco-MiniLM-L-6-v2")
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        result = evaluator.run(predictions=predictions)
        assert len(result) == 2
        assert result["sas"] == pytest.approx(0.999967, abs=1e-5)
        assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)