Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-08 21:37:12 +00:00
Add AnswerExactMatchEvaluator (#7050)
* Add AnswerExactMatchEvaluator
* Add release notes
* Fix linting
* Update docstrings
This commit is contained in:
parent bc8a48cc3c
commit b4011af8e9
haystack/components/evaluators/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .answer_exact_match import AnswerExactMatchEvaluator

__all__ = ["AnswerExactMatchEvaluator"]
haystack/components/evaluators/answer_exact_match.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from typing import Any, Dict, List

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


@component
class AnswerExactMatchEvaluator:
    """
    Evaluator that checks if the predicted answers match any of the ground truth answers exactly.
    The result is a number from 0.0 to 1.0; it represents the proportion of questions where any predicted answer
    matched one of the ground truth answers.
    Each question can have multiple ground truth answers and multiple predicted answers.
    """

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AnswerExactMatchEvaluator":
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(
        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
    ) -> Dict[str, float]:
        """
        Run the AnswerExactMatchEvaluator on the given inputs.
        All lists must have the same length.

        :param questions: A list of questions.
        :param ground_truth_answers: A list of expected answers for each question.
        :param predicted_answers: A list of predicted answers for each question.
        :returns: A dictionary with the following outputs:
            * `result` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
              answer matched one of the ground truth answers.
        """
        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")

        matches = 0
        for truths, extracted in zip(ground_truth_answers, predicted_answers):
            if set(truths) & set(extracted):
                matches += 1

        # The proportion of questions where any predicted answer matched one of the ground truth answers
        result = matches / len(questions)

        return {"result": result}
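As a minimal sketch (not part of the commit) of how the metric and the to_dict/from_dict hooks behave when the component is used standalone; the questions and answers below are invented for illustration:

from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Two of the three questions have a predicted answer that exactly matches
# one of their ground truth answers, so the returned proportion is 2 / 3.
result = evaluator.run(
    questions=["Capital of Germany?", "Capital of France?", "Capital of Italy?"],
    ground_truth_answers=[["Berlin"], ["Paris"], ["Rome", "Roma"]],
    predicted_answers=[["Berlin"], ["Lyon"], ["Rome"]],
)
assert result["result"] == 2 / 3

# The component takes no init parameters, so serialization is a plain
# round trip through default_to_dict / default_from_dict.
data = evaluator.to_dict()
restored = AnswerExactMatchEvaluator.from_dict(data)
assert isinstance(restored, AnswerExactMatchEvaluator)

Note that matching is done with set intersection, so it is case- and whitespace-sensitive: "berlin" would not count as a match for "Berlin".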
Release note (new file, 6 lines)
@@ -0,0 +1,6 @@
---
features:
  - |
    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
    given a list of questions, a list of expected answers for each question, and a list of predicted
    answers for each question.
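Because it is a regular Component, the evaluator can also be dropped into a pipeline. The following sketch (not part of the commit) assumes the standard Haystack 2.x Pipeline API of add_component / run; the component name "exact_match" is chosen arbitrarily:

from haystack import Pipeline
from haystack.components.evaluators import AnswerExactMatchEvaluator

# Wrap the evaluator in a pipeline; inputs are passed per component, keyed by its name.
pipeline = Pipeline()
pipeline.add_component("exact_match", AnswerExactMatchEvaluator())

outputs = pipeline.run(
    {
        "exact_match": {
            "questions": ["What is the capital of Germany?"],
            "ground_truth_answers": [["Berlin"]],
            "predicted_answers": [["Berlin"]],
        }
    }
)
print(outputs["exact_match"]["result"])  # 1.0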
test/components/evaluators/test_answer_exact_match.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import pytest

from haystack.components.evaluators import AnswerExactMatchEvaluator


def test_run_with_all_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["Paris"]],
    )

    assert result["result"] == 1.0


def test_run_with_no_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Paris"], ["London"]],
    )

    assert result["result"] == 0.0


def test_run_with_partial_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["London"]],
    )

    assert result["result"] == 0.5


def test_run_with_different_lengths():
    evaluator = AnswerExactMatchEvaluator()

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"]],
        )