refactor: AnswerExactMatchEvaluator component inputs (#7536)

* refactor component inputs

* release notes

* Update class docstring

* pylint

* update existing note instead of creating a new one

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Massimiliano Pippi 2024-04-12 08:59:16 +02:00 committed by GitHub
parent e90ffafb47
commit 2bad5bcb96
3 changed files with 44 additions and 32 deletions


@@ -6,10 +6,11 @@ from haystack.core.component import component
@component
class AnswerExactMatchEvaluator:
"""
Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
The result is a number from 0.0 to 1.0, it represents the proportion any predicted answer
that matched one of the ground truth answers.
There can be multiple ground truth answers and multiple predicted answers as input.
Evaluator that checks if predicted answers exactly match ground truth answers.
Each predicted answer is compared to one ground truth answer.
The final score is a number ranging from 0.0 to 1.0.
It represents the proportion of predicted answers that match their corresponding ground truth answer.
Usage example:
```python
@@ -17,8 +18,8 @@ class AnswerExactMatchEvaluator:
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
ground_truth_answers=[["Berlin"], ["Paris"]],
predicted_answers=[["Berlin"], ["Lyon"]],
ground_truth_answers=["Berlin", "Paris"],
predicted_answers=["Berlin", "Lyon"],
)
print(result["individual_scores"])
@@ -29,15 +30,15 @@ class AnswerExactMatchEvaluator:
"""
@component.output_types(individual_scores=List[int], score=float)
def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
"""
Run the AnswerExactMatchEvaluator on the given inputs.
`ground_truth_answers` and `predicted_answers` must have the same length.
:param ground_truth_answers:
A list of expected answers for each question.
A list of expected answers.
:param predicted_answers:
A list of predicted answers for each question.
A list of predicted answers.
:returns:
A dictionary with the following outputs:
- `individual_scores` - A list of 0s and 1s, where 1 means that the predicted answer matched its corresponding ground truth answer.
@@ -48,8 +49,8 @@ class AnswerExactMatchEvaluator:
raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
matches = []
for truths, extracted in zip(ground_truth_answers, predicted_answers):
if set(truths) & set(extracted):
for truth, extracted in zip(ground_truth_answers, predicted_answers):
if truth == extracted:
matches.append(1)
else:
matches.append(0)

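For context, the hunks above end before the method's return statement, so the aggregation of `matches` into the final `score` is not visible here. A minimal standalone sketch of the refactored one-to-one comparison follows; the `exact_match` function name is illustrative (not part of the Haystack API), and the mean-of-matches aggregation is an assumption inferred from the 0.0 to 1.0 range in the docstring and the expected values in the tests below:

```python
from typing import Any, Dict, List


def exact_match(ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
    """Illustrative re-implementation of the refactored comparison logic."""
    if len(ground_truth_answers) != len(predicted_answers):
        raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
    # One-to-one comparison: each predicted answer is checked against the
    # ground truth answer at the same position.
    matches = [1 if truth == predicted else 0 for truth, predicted in zip(ground_truth_answers, predicted_answers)]
    # Assumed aggregation: proportion of exact matches.
    return {"individual_scores": matches, "score": sum(matches) / len(matches)}


print(exact_match(["Berlin", "Paris"], ["Berlin", "Lyon"]))
# {'individual_scores': [1, 0], 'score': 0.5}
```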

@@ -1,6 +1,5 @@
---
features:
- |
Add `AnswerExactMatchEvaluator`, a Component that can be used to calculate the Exact Match metric
given a list of questions, a list of expected answers for each question and the list of predicted
answers for each question.
Add `AnswerExactMatchEvaluator`, a component that can be used to calculate the Exact Match metric
comparing a list of expected answers with a list of predicted answers.

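Concretely, the flat lists described in the note map onto the updated `run` signature as in the docstring example above; the printed values below follow from the one-to-one comparison exercised by the tests that follow:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
    ground_truth_answers=["Berlin", "Paris"],
    predicted_answers=["Berlin", "Lyon"],
)
print(result["individual_scores"])  # [1, 0]
print(result["score"])              # 0.5
```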

@@ -5,21 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
def test_run_with_all_matching():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "Paris"])
assert result == {"individual_scores": [1, 1], "score": 1.0}
def test_run_with_no_matching():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Paris", "London"])
assert result == {"individual_scores": [0, 0], "score": 0.0}
def test_run_with_partial_matching():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
result = evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin", "London"])
assert result == {"individual_scores": [1, 0], "score": 0.5}
@@ -28,30 +28,42 @@ def test_run_with_complex_data():
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
ground_truth_answers=[
["France"],
["9th century", "9th"],
["classical music", "classical"],
["11th century", "the 11th"],
["Denmark", "Iceland", "Norway"],
["10th century", "10th"],
"France",
"9th century",
"9th",
"classical music",
"classical",
"11th century",
"the 11th",
"Denmark",
"Iceland",
"Norway",
"10th century",
"10th",
],
predicted_answers=[
["France"],
["9th century", "10th century", "9th"],
["classic music", "rock music", "dubstep"],
["11th", "the 11th", "11th century"],
["Denmark, Iceland and Norway"],
["10th century", "the first half of the 10th century", "10th", "10th"],
"France",
"9th century",
"10th century",
"9th",
"classic music",
"rock music",
"dubstep",
"the 11th",
"11th century",
"Denmark, Iceland and Norway",
"10th century",
"10th",
],
)
assert result == {"individual_scores": [1, 1, 0, 1, 0, 1], "score": 0.6666666666666666}
assert result == {"individual_scores": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], "score": 0.3333333333333333}
def test_run_with_different_lengths():
evaluator = AnswerExactMatchEvaluator()
with pytest.raises(ValueError):
evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
evaluator.run(ground_truth_answers=["Berlin"], predicted_answers=["Berlin", "London"])
with pytest.raises(ValueError):
evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
evaluator.run(ground_truth_answers=["Berlin", "Paris"], predicted_answers=["Berlin"])
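The new expected value in `test_run_with_complex_data` follows from the flattened, one-to-one comparison: of the 12 (ground truth, prediction) pairs, only "France", "9th century", "10th century", and "10th" line up exactly with their predictions, so the expected score drops from 4/6 under the old any-of-the-ground-truths semantics to 4/12. A quick check of the arithmetic:

```python
# Four of the twelve pairs match exactly, so the score is 4 / 12.
individual_scores = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
assert sum(individual_scores) / len(individual_scores) == 0.3333333333333333
```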