haystack/test/components/eval/test_statistical_evaluator.py

import pytest

from haystack.components.eval import StatisticalEvaluator, StatisticalMetric

class TestStatisticalEvaluator:
    def test_init_default(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        assert evaluator._metric == StatisticalMetric.F1

    def test_init_with_string(self):
        evaluator = StatisticalEvaluator(metric="exact_match")
        assert evaluator._metric == StatisticalMetric.EM

    def test_to_dict(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        expected_dict = {
            "type": "haystack.components.eval.statistical_evaluator.StatisticalEvaluator",
            "init_parameters": {"metric": "f1"},
        }
        assert evaluator.to_dict() == expected_dict

    def test_from_dict(self):
        evaluator = StatisticalEvaluator.from_dict(
            {
                "type": "haystack.components.eval.statistical_evaluator.StatisticalEvaluator",
                "init_parameters": {"metric": "f1"},
            }
        )
        assert evaluator._metric == StatisticalMetric.F1


class TestStatisticalEvaluatorF1:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        result = evaluator.run(labels=[], predictions=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        result = evaluator.run(labels=["Source"], predictions=["Open Source"])
        assert len(result) == 1
        assert result["result"] == pytest.approx(2 / 3)

    def test_run_with_mismatched_predictions(self):
        labels = ["Source", "HaystackAI"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.F1)
        predictions = ["Open Source", "HaystackAI"]
        result = evaluator.run(labels=labels, predictions=predictions)
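        # Consistent with averaging the per-pair token F1 scores: (2/3 + 1.0) / 2 = 5/6.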
        assert len(result) == 1
        assert result["result"] == pytest.approx(5 / 6)


class TestStatisticalEvaluatorExactMatch:
    def test_run_with_empty_inputs(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(predictions=[], labels=[])
        assert len(result) == 1
        assert result["result"] == 0.0

    def test_run_with_different_lengths(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        ]
        predictions = [
            "A construction budget of US $2.3 billion",
            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
            "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
        ]
        with pytest.raises(ValueError):
            evaluator.run(labels=labels, predictions=predictions)

    def test_run_with_matching_predictions(self):
        labels = ["OpenSource", "HaystackAI", "LLMs"]
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_single_prediction(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        result = evaluator.run(labels=["OpenSource"], predictions=["OpenSource"])
        assert len(result) == 1
        assert result["result"] == 1.0

    def test_run_with_mismatched_predictions(self):
        evaluator = StatisticalEvaluator(metric=StatisticalMetric.EM)
        labels = ["Source", "HaystackAI", "LLMs"]
        predictions = ["OpenSource", "HaystackAI", "LLMs"]
        result = evaluator.run(labels=labels, predictions=predictions)
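        # Only 2 of the 3 predictions match their labels exactly: 2 / 3.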
        assert len(result) == 1
        assert result["result"] == 2 / 3