mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00
208 lines
9.7 KiB
Python
208 lines
9.7 KiB
Python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import pytest
|
|
|
|
from haystack.components.evaluators.sas_evaluator import SASEvaluator
|
|
from haystack.utils.device import ComponentDevice
|
|
|
|
|
|
class TestSASEvaluator:
    """Unit and integration tests for ``SASEvaluator``.

    The unit tests cover construction, (de)serialization and the input
    validation performed by ``run()``. The tests marked ``integration`` and
    ``slow`` call ``warm_up()`` and compare real model scores against
    reference values.
    """

    # Reference sentences shared by several tests. Stored as a tuple so the
    # shared fixture cannot be mutated; each test passes a fresh ``list`` copy.
    SENTENCES = (
        "A construction budget of US $2.3 billion",
        "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
    )

    def test_init_default(self, monkeypatch):
        """Default construction uses the expected model, batch size, device and token."""
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        evaluator = SASEvaluator()

        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device is None
        assert evaluator._token.resolve_value() == "fake-token"

    def test_to_dict(self, monkeypatch):
        """``to_dict()`` serializes the init parameters, including device and token."""
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")

        evaluator = SASEvaluator(device=ComponentDevice.from_str("cuda:0"))

        expected_dict = {
            "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
            "init_parameters": {
                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                "batch_size": 32,
                "device": {"type": "single", "device": "cuda:0"},
                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False},
            },
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self, monkeypatch):
        """``from_dict()`` rebuilds an evaluator from its serialized form."""
        monkeypatch.setenv("HF_API_TOKEN", "fake-token")
        # The dict is built inline (not shared with test_to_dict) so that any
        # in-place changes made during deserialization cannot leak between tests.
        evaluator = SASEvaluator.from_dict(
            {
                "type": "haystack.components.evaluators.sas_evaluator.SASEvaluator",
                "init_parameters": {
                    "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                    "batch_size": 32,
                    "device": {"type": "single", "device": "cuda:0"},
                    "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False},
                },
            }
        )

        assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        assert evaluator._batch_size == 32
        assert evaluator._device.to_torch_str() == "cuda:0"
        assert evaluator._token.resolve_value() == "fake-token"

    def test_run_with_empty_inputs(self):
        """Empty inputs yield a zero score without ``warm_up()`` being called."""
        evaluator = SASEvaluator()
        result = evaluator.run(ground_truth_answers=[], predicted_answers=[])
        assert len(result) == 2
        assert result["score"] == 0.0
        assert result["individual_scores"] == [0.0]

    def test_run_with_different_lengths(self):
        """Two ground truths against three predictions raises ``ValueError``."""
        evaluator = SASEvaluator()
        ground_truths = list(self.SENTENCES[:2])
        predictions = list(self.SENTENCES)
        with pytest.raises(ValueError):
            evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)

    def test_run_with_none_in_predictions(self):
        """A ``None`` entry among the predictions raises ``ValueError``."""
        evaluator = SASEvaluator()
        ground_truths = list(self.SENTENCES)
        predictions = [self.SENTENCES[0], None, self.SENTENCES[2]]
        with pytest.raises(ValueError):
            evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)

    def test_run_not_warmed_up(self):
        """Running on non-empty inputs without ``warm_up()`` raises ``RuntimeError``."""
        evaluator = SASEvaluator()
        ground_truths = list(self.SENTENCES)
        predictions = list(self.SENTENCES)
        with pytest.raises(RuntimeError):
            evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_with_matching_predictions(self, monkeypatch):
        """Identical predictions and ground truths score approximately 1.0 with the default model."""
        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
        evaluator = SASEvaluator()
        ground_truths = list(self.SENTENCES)
        predictions = list(self.SENTENCES)
        evaluator.warm_up()
        result = evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)

        assert len(result) == 2
        assert result["score"] == pytest.approx(1.0)
        assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_with_single_prediction(self, monkeypatch):
        """A single ground truth / prediction pair produces the reference score."""
        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
        evaluator = SASEvaluator()

        ground_truths = ["US $2.3 billion"]
        evaluator.warm_up()
        result = evaluator.run(ground_truth_answers=ground_truths, predicted_answers=[self.SENTENCES[0]])
        assert len(result) == 2
        assert result["score"] == pytest.approx(0.689089, abs=1e-5)
        assert result["individual_scores"] == pytest.approx([0.689089], abs=1e-5)

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_with_mismatched_predictions(self, monkeypatch):
        """Paraphrased ground truths produce the reference per-pair and aggregate scores."""
        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
        evaluator = SASEvaluator()
        ground_truths = [
            "US $2.3 billion",
            "Paris's cultural magnificence is symbolized by the Eiffel Tower",
            "Japan was transformed into a modernized world power after the Meiji Restoration.",
        ]
        predictions = list(self.SENTENCES)
        evaluator.warm_up()
        result = evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)
        assert len(result) == 2
        # Same absolute tolerance as the individual scores: the aggregate is
        # derived from them, so it should not be held to a tighter bound.
        assert result["score"] == pytest.approx(0.8227189, abs=1e-5)
        assert result["individual_scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_with_bi_encoder_model(self, monkeypatch):
        """Identical inputs score approximately 1.0 with an explicitly chosen sentence-transformers model."""
        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
        evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
        ground_truths = list(self.SENTENCES)
        predictions = list(self.SENTENCES)
        evaluator.warm_up()
        result = evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)
        assert len(result) == 2
        assert result["score"] == pytest.approx(1.0)
        assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0])

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_with_cross_encoder_model(self, monkeypatch):
        """Identical inputs score close to 1.0 with a cross-encoder model."""
        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
        evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
        ground_truths = list(self.SENTENCES)
        predictions = list(self.SENTENCES)
        evaluator.warm_up()
        result = evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)
        assert len(result) == 2
        assert result["score"] == pytest.approx(0.999967, abs=1e-5)
        assert result["individual_scores"] == pytest.approx(
            [0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5
        )
|