Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-08 13:06:29 +00:00
refactor: AnswerExactMatchEvaluator component inputs (#7536)
* refactor component inputs
* release notes
* Update class docstring
* pylint
* update existing note instead of creating a new one

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
parent e90ffafb47
commit 2bad5bcb96
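In short: `AnswerExactMatchEvaluator.run` now takes flat `List[str]` inputs instead of `List[List[str]]`, and scoring changes from "any candidate in the ground-truth set matches" to pairwise string equality. A minimal before/after sketch of the call site, adapted from the docstring example in the diff below:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Before this commit: one list of acceptable answers per question.
# result = evaluator.run(
#     ground_truth_answers=[["Berlin"], ["Paris"]],
#     predicted_answers=[["Berlin"], ["Lyon"]],
# )

# After this commit: one ground truth string per predicted answer, compared pairwise.
result = evaluator.run(
    ground_truth_answers=["Berlin", "Paris"],
    predicted_answers=["Berlin", "Lyon"],
)
print(result["individual_scores"])  # [1, 0]: "Berlin" matches, "Lyon" does not
print(result["score"])              # 0.5: the proportion of exact matches
```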
````diff
@@ -6,10 +6,11 @@ from haystack.core.component import component
 @component
 class AnswerExactMatchEvaluator:
     """
-    Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion any predicted answer
-    that matched one of the ground truth answers.
-    There can be multiple ground truth answers and multiple predicted answers as input.
+    Evaluator that checks if predicted answers exactly match ground truth answers.
+
+    Each predicted answer is compared to one ground truth answer.
+    The final score is a number ranging from 0.0 to 1.0.
+    It represents the proportion of predicted answers that match their corresponding ground truth answer.
 
     Usage example:
     ```python
@@ -17,8 +18,8 @@ class AnswerExactMatchEvaluator:
 
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Lyon"]],
+        ground_truth_answers=["Berlin", "Paris"],
+        predicted_answers=["Berlin", "Lyon"],
     )
 
     print(result["individual_scores"])
@@ -29,15 +30,15 @@ class AnswerExactMatchEvaluator:
     """
 
     @component.output_types(individual_scores=List[int], score=float)
-    def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
+    def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
         """
         Run the AnswerExactMatchEvaluator on the given inputs.
         `ground_truth_answers` and `retrieved_answers` must have the same length.
 
         :param ground_truth_answers:
-            A list of expected answers for each question.
+            A list of expected answers.
         :param predicted_answers:
-            A list of predicted answers for each question.
+            A list of predicted answers.
         :returns:
             A dictionary with the following outputs:
                 - `individual_scores` - A list of 0s and 1s, where 1 means that the predicted answer matched one of the ground truth.
@@ -48,8 +49,8 @@ class AnswerExactMatchEvaluator:
             raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
 
         matches = []
-        for truths, extracted in zip(ground_truth_answers, predicted_answers):
-            if set(truths) & set(extracted):
+        for truth, extracted in zip(ground_truth_answers, predicted_answers):
+            if truth == extracted:
                 matches.append(1)
             else:
                 matches.append(0)
````
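The hunk above shows the matching loop but not the aggregation below it. As a self-contained sketch of the updated logic, here is the new pairwise comparison restated as a plain function (hypothetical name `exact_match_run`, not from the commit; the final `score` line is an assumption inferred from the tests, which expect the mean of `individual_scores`):

```python
from typing import Any, Dict, List


def exact_match_run(ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
    # Same length check as in the component.
    if len(ground_truth_answers) != len(predicted_answers):
        raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")

    # Pairwise exact match: 1 if the i-th prediction equals the i-th ground truth.
    matches = []
    for truth, extracted in zip(ground_truth_answers, predicted_answers):
        if truth == extracted:
            matches.append(1)
        else:
            matches.append(0)

    # Assumed aggregation (outside the diff context): mean of the 0/1 matches,
    # e.g. [1, 0] -> 0.5, consistent with the tests below.
    score = sum(matches) / len(matches)
    return {"individual_scores": matches, "score": score}
```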
```diff
@@ -1,6 +1,5 @@
 ---
 features:
   - |
-    Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
-    given a list of questions, a list of expected answers for each question and the list of predicted
-    answers for each question.
+    Add `AnswerExactMatchEvaluator`, a component that can be used to calculate the Exact Match metric
+    comparing a list of expected answers with a list of predicted answers.
```
```diff
@@ -5,21 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
 
 def test_run_with_all_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
+    result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "Paris"])
 
     assert result == {"individual_scores": [1, 1], "score": 1.0}
 
 
 def test_run_with_no_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
+    result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Paris", "London"])
 
     assert result == {"individual_scores": [0, 0], "score": 0.0}
 
 
 def test_run_with_partial_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
+    result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "London"])
 
     assert result == {"individual_scores": [1, 0], "score": 0.5}
 
@@ -28,30 +28,42 @@ def test_run_with_complex_data():
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
         ground_truth_answers=[
-            ["France"],
-            ["9th century", "9th"],
-            ["classical music", "classical"],
-            ["11th century", "the 11th"],
-            ["Denmark", "Iceland", "Norway"],
-            ["10th century", "10th"],
+            "France",
+            "9th century",
+            "9th",
+            "classical music",
+            "classical",
+            "11th century",
+            "the 11th",
+            "Denmark",
+            "Iceland",
+            "Norway",
+            "10th century",
+            "10th",
         ],
         predicted_answers=[
-            ["France"],
-            ["9th century", "10th century", "9th"],
-            ["classic music", "rock music", "dubstep"],
-            ["11th", "the 11th", "11th century"],
-            ["Denmark, Iceland and Norway"],
-            ["10th century", "the first half of the 10th century", "10th", "10th"],
+            "France",
+            "9th century",
+            "10th century",
+            "9th",
+            "classic music",
+            "rock music",
+            "dubstep",
+            "the 11th",
+            "11th century",
+            "Denmark, Iceland and Norway",
+            "10th century",
+            "10th",
         ],
     )
-    assert result == {"individual_scores": [1, 1, 0, 1, 0, 1], "score": 0.6666666666666666}
+    assert result == {"individual_scores": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], "score": 0.3333333333333333}
 
 
 def test_run_with_different_lengths():
     evaluator = AnswerExactMatchEvaluator()
 
     with pytest.raises(ValueError):
-        evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
+        evaluator.run(ground_truth_answers=["Berlin"], predicted_answers=["Berlin", "London"])
 
     with pytest.raises(ValueError):
-        evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
+        evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin"])
```
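As a quick sanity check on the updated complex-data expectation, recomputing the pairwise scores outside Haystack gives the same result:

```python
ground_truth = ["France", "9th century", "9th", "classical music", "classical",
                "11th century", "the 11th", "Denmark", "Iceland", "Norway",
                "10th century", "10th"]
predicted = ["France", "9th century", "10th century", "9th", "classic music",
             "rock music", "dubstep", "the 11th", "11th century",
             "Denmark, Iceland and Norway", "10th century", "10th"]

scores = [int(t == p) for t, p in zip(ground_truth, predicted)]
print(scores)                     # [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
print(sum(scores) / len(scores))  # 0.3333333333333333 -- 4 of 12 exact matches
```

Under the old set-intersection semantics the same data produced `[1, 1, 0, 1, 0, 1]` (score 2/3), so the lower score here reflects the stricter pairwise comparison, not a regression.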