Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-05 03:17:31 +00:00
Add AnswerExactMatchEvaluator (#7050)
* Add AnswerExactMatchEvaluator
* Add release notes
* Fix linting
* Update docstrings
parent bc8a48cc3c, commit b4011af8e9
haystack/components/evaluators/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .answer_exact_match import AnswerExactMatchEvaluator

__all__ = ["AnswerExactMatchEvaluator"]
haystack/components/evaluators/answer_exact_match.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from typing import Any, Dict, List

from haystack import default_from_dict, default_to_dict
from haystack.core.component import component


@component
class AnswerExactMatchEvaluator:
    """
    Evaluator that checks if the predicted answers match any of the ground truth answers exactly.
    The result is a number from 0.0 to 1.0 that represents the proportion of questions where any predicted answer
    matched one of the ground truth answers.
    Each question can have multiple ground truth answers and multiple predicted answers.
    """

    def to_dict(self) -> Dict[str, Any]:
        return default_to_dict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AnswerExactMatchEvaluator":
        return default_from_dict(cls, data)

    @component.output_types(result=float)
    def run(
        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
    ) -> Dict[str, float]:
        """
        Run the AnswerExactMatchEvaluator on the given inputs.
        All lists must have the same length.

        :param questions: A list of questions.
        :param ground_truth_answers: A list of expected answers for each question.
        :param predicted_answers: A list of predicted answers for each question.
        :returns: A dictionary with the following outputs:
            * `result` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
              answer matched one of the ground truth answers.
        """
        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")

        matches = 0
        for truths, extracted in zip(ground_truth_answers, predicted_answers):
            if set(truths) & set(extracted):
                matches += 1

        # The proportion of questions where any predicted answer matched one of the ground truth answers
        result = matches / len(questions)

        return {"result": result}
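The `to_dict`/`from_dict` pair above simply delegates to Haystack's `default_to_dict`/`default_from_dict` helpers, so the component can be serialized and rebuilt without any custom state handling. A minimal round-trip sketch; the exact dictionary layout with `type` and `init_parameters` keys comes from how those helpers behave in Haystack 2.x and is not spelled out in this diff:

from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# default_to_dict() records the import path and the (empty) init parameters,
# e.g. {"type": "haystack.components.evaluators.answer_exact_match.AnswerExactMatchEvaluator",
#       "init_parameters": {}}
data = evaluator.to_dict()

# default_from_dict() reconstructs an equivalent instance from that dictionary
restored = AnswerExactMatchEvaluator.from_dict(data)
print(restored.run(
    questions=["What is the capital of Germany?"],
    ground_truth_answers=[["Berlin"]],
    predicted_answers=[["Berlin"]],
))  # {'result': 1.0}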
@@ -0,0 +1,6 @@
---
features:
  - |
    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
    given a list of questions, a list of expected answers for each question and the list of predicted
    answers for each question.
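Because the class is decorated with `@component`, it can also run inside a Haystack `Pipeline` rather than only standalone. A minimal sketch, assuming the standard Haystack 2.x `Pipeline.add_component`/`Pipeline.run` API; the pipeline and the component name "em_evaluator" are illustrative and not part of this commit:

from haystack import Pipeline
from haystack.components.evaluators import AnswerExactMatchEvaluator

pipe = Pipeline()
pipe.add_component("em_evaluator", AnswerExactMatchEvaluator())

# Inputs are passed per component name; the evaluator has no upstream connections here
outputs = pipe.run(
    {
        "em_evaluator": {
            "questions": ["What is the capital of Germany?", "What is the capital of France?"],
            "ground_truth_answers": [["Berlin"], ["Paris"]],
            "predicted_answers": [["Berlin"], ["London"]],
        }
    }
)
print(outputs["em_evaluator"]["result"])  # 0.5 (one of two questions matched)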
test/components/evaluators/test_answer_exact_match.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import pytest

from haystack.components.evaluators import AnswerExactMatchEvaluator


def test_run_with_all_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["Paris"]],
    )

    assert result["result"] == 1.0


def test_run_with_no_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Paris"], ["London"]],
    )

    assert result["result"] == 0.0


def test_run_with_partial_matching():
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=["What is the capital of Germany?", "What is the capital of France?"],
        ground_truth_answers=[["Berlin"], ["Paris"]],
        predicted_answers=[["Berlin"], ["London"]],
    )

    assert result["result"] == 0.5


def test_run_with_different_lengths():
    evaluator = AnswerExactMatchEvaluator()

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"]],
            predicted_answers=[["Berlin"], ["London"]],
        )

    with pytest.raises(ValueError):
        evaluator.run(
            questions=["What is the capital of Germany?", "What is the capital of France?"],
            ground_truth_answers=[["Berlin"], ["Paris"]],
            predicted_answers=[["Berlin"]],
        )
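The tests above cover full, zero, and partial matches, but note that the matching in `run` is a plain `set` intersection over the answer strings, so it is case- and whitespace-sensitive. The snippet below is illustrative and not part of the committed tests; it shows how a casing difference counts as a miss:

from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
    questions=["What is the capital of Germany?"],
    ground_truth_answers=[["Berlin"]],
    predicted_answers=[["berlin"]],  # lowercase: not an exact string match
)
print(result)  # {'result': 0.0}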