Mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-08 21:28:00 +00:00)
refactor: Remove questions inputs from evaluators (#7466)
* Remove questions input from AnswerExactMatchEvaluator
* Remove questions input from DocumentRecallEvaluator
parent 12acb3f12e
commit dc87f51759
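In short: both evaluators lose their `questions` parameter, and all scores are computed from the ground-truth/prediction pairs alone. A minimal sketch of the new call shapes (expected outputs follow the docstrings in this diff; `single_hit` being the default recall mode is an assumption):

```python
from haystack import Document
from haystack.components.evaluators import AnswerExactMatchEvaluator, DocumentRecallEvaluator

# No `questions` argument anymore; only the answer lists are compared.
answer_evaluator = AnswerExactMatchEvaluator()
answer_result = answer_evaluator.run(
    ground_truth_answers=[["Berlin"], ["Paris"]],
    predicted_answers=[["Berlin"], ["Lyon"]],
)
print(answer_result)  # {"individual_scores": [1, 0], "score": 0.5}

# Likewise for recall: only the document lists are compared.
recall_evaluator = DocumentRecallEvaluator()  # assumed to default to single_hit mode
recall_result = recall_evaluator.run(
    ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
    retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
print(recall_result)  # {"individual_scores": [True, False], "score": 0.5}
```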
@@ -7,9 +7,9 @@ from haystack.core.component import component
 class AnswerExactMatchEvaluator:
     """
     Evaluator that checks if the predicted answers match any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
-    matched one of the ground truth answers.
-    Each question can have multiple ground truth answers and multiple predicted answers.
+    The result is a number from 0.0 to 1.0; it represents the proportion of inputs where any
+    predicted answer matched one of the ground truth answers.
+    There can be multiple ground truth answers and multiple predicted answers as input.

     Usage example:
     ```python
@@ -17,7 +17,6 @@ class AnswerExactMatchEvaluator:

     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Berlin"], ["Lyon"]],
     )
@@ -30,15 +29,11 @@ class AnswerExactMatchEvaluator:
     """

     @component.output_types(individual_scores=List[int], score=float)
-    def run(
-        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
-    ) -> Dict[str, Any]:
+    def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
         """
         Run the AnswerExactMatchEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_answers` and `predicted_answers` must have the same length.

-        :param questions:
-            A list of questions.
         :param ground_truth_answers:
             A list of expected answers for each question.
         :param predicted_answers:
@@ -49,8 +44,8 @@ class AnswerExactMatchEvaluator:
             - `score` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
               answer matched one of the ground truth answers.
         """
-        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
-            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
+        if not len(ground_truth_answers) == len(predicted_answers):
+            raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")

         matches = []
         for truths, extracted in zip(ground_truth_answers, predicted_answers):
@@ -60,6 +55,6 @@ class AnswerExactMatchEvaluator:
                 matches.append(0)

         # The proportion of questions where any predicted answer matched one of the ground truth answers
-        average = sum(matches) / len(questions)
+        average = sum(matches) / len(predicted_answers)

         return {"individual_scores": matches, "score": average}
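Taken in isolation, the scoring logic now reads roughly as below. This is a paraphrase for illustration, not the library code; the per-pair comparison is elided by the hunk above, so the `set` intersection is an assumption:

```python
from typing import Dict, List

def exact_match_score(ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict:
    """Paraphrase of AnswerExactMatchEvaluator.run after this commit."""
    if not len(ground_truth_answers) == len(predicted_answers):
        raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")

    matches = []
    for truths, extracted in zip(ground_truth_answers, predicted_answers):
        # 1 if any predicted answer exactly matches a ground truth answer, else 0
        matches.append(1 if set(truths) & set(extracted) else 0)

    # The denominator is now the number of answer pairs rather than the number of questions.
    average = sum(matches) / len(predicted_answers)
    return {"individual_scores": matches, "score": average}

# exact_match_score([["Berlin"], ["Paris"]], [["Berlin"], ["Lyon"]])
# -> {"individual_scores": [1, 0], "score": 0.5}
```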
@@ -31,16 +31,15 @@ class RecallMode(Enum):
 @component
 class DocumentRecallEvaluator:
     """
-    Evaluator that calculates the Recall score for a list of questions.
+    Evaluator that calculates the Recall score for a list of documents.
     Returns both a list of scores for each question and the average.
-    Each question can have multiple ground truth documents and multiple predicted documents.
+    There can be multiple ground truth documents and multiple predicted documents as input.

     Usage example:
     ```python
     from haystack.components.evaluators import DocumentRecallEvaluator
     evaluator = DocumentRecallEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
     )
@@ -80,17 +79,12 @@ class DocumentRecallEvaluator:

     @component.output_types(score=float, individual_scores=List[float])
     def run(
-        self,
-        questions: List[str],
-        ground_truth_documents: List[List[Document]],
-        retrieved_documents: List[List[Document]],
+        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
     ) -> Dict[str, Any]:
         """
         Run the DocumentRecallEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_documents` and `retrieved_documents` must have the same length.

-        :param questions:
-            A list of questions.
         :param ground_truth_documents:
             A list of expected documents for each question.
         :param retrieved_documents:
@@ -100,8 +94,8 @@ class DocumentRecallEvaluator:
             - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
               If the mode is `single_hit`, the individual scores are True or False.
         """
-        if not len(questions) == len(ground_truth_documents) == len(retrieved_documents):
-            msg = "The length of questions, ground_truth_documents, and predicted_documents must be the same."
+        if len(ground_truth_documents) != len(retrieved_documents):
+            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
             raise ValueError(msg)

         scores = []
@@ -109,4 +103,4 @@ class DocumentRecallEvaluator:
             score = self.mode_function(ground_truth, retrieved)
             scores.append(score)

-        return {"score": sum(scores) / len(questions), "individual_scores": scores}
+        return {"score": sum(scores) / len(retrieved_documents), "individual_scores": scores}
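`self.mode_function` is not part of this diff; judging from the `RecallMode` enum in the hunk header and the docstring above, the two modes behave roughly as sketched below. Comparison by `Document.content` is an assumption made for illustration:

```python
from typing import List
from haystack import Document

def recall_single_hit(ground_truth: List[Document], retrieved: List[Document]) -> bool:
    # True if at least one expected document was retrieved (assumed content-based matching).
    retrieved_contents = {doc.content for doc in retrieved}
    return any(doc.content in retrieved_contents for doc in ground_truth)

def recall_multi_hit(ground_truth: List[Document], retrieved: List[Document]) -> float:
    # Fraction of expected documents that were retrieved (assumed content-based matching).
    retrieved_contents = {doc.content for doc in retrieved}
    hits = sum(1 for doc in ground_truth if doc.content in retrieved_contents)
    return hits / len(ground_truth)
```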
@@ -5,33 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator

 def test_run_with_all_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Paris"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])

     assert result == {"individual_scores": [1, 1], "score": 1.0}


 def test_run_with_no_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])

     assert result == {"individual_scores": [0, 0], "score": 0.0}


 def test_run_with_partial_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])

     assert result == {"individual_scores": [1, 0], "score": 0.5}

@@ -39,14 +27,6 @@ def test_run_with_partial_matching():
 def test_run_with_complex_data():
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=[
-            "In what country is Normandy located?",
-            "When was the Latin version of the word Norman first recorded?",
-            "What developed in Normandy during the 1100s?",
-            "In what century did important classical music developments occur in Normandy?",
-            "From which countries did the Norse originate?",
-            "What century did the Normans first gain their separate identity?",
-        ],
         ground_truth_answers=[
             ["France"],
             ["9th century", "9th"],
@@ -71,22 +51,7 @@ def test_run_with_different_lengths():
     evaluator = AnswerExactMatchEvaluator()

     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])

     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
@@ -16,7 +16,6 @@ class TestDocumentRecallEvaluatorSingleHit:

     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -25,7 +24,6 @@ class TestDocumentRecallEvaluatorSingleHit:

     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -34,7 +32,6 @@ class TestDocumentRecallEvaluatorSingleHit:

     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -43,14 +40,6 @@ class TestDocumentRecallEvaluatorSingleHit:

     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -78,21 +67,12 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )

         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
@@ -105,7 +85,6 @@ class TestDocumentRecallEvaluatorMultiHit:

     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -114,7 +93,6 @@ class TestDocumentRecallEvaluatorMultiHit:

     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -123,7 +101,6 @@ class TestDocumentRecallEvaluatorMultiHit:

     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -132,14 +109,6 @@ class TestDocumentRecallEvaluatorMultiHit:

     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -172,21 +141,12 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )

         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
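For callers, the migration implied by these test changes is mechanical: drop the `questions=` keyword from every `run()` call. A hypothetical before/after; note that after this commit passing `questions` raises a `TypeError` (unexpected keyword argument), while mismatched list lengths still raise `ValueError`:

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Before this commit:
# evaluator.run(
#     questions=["What is the capital of Germany?"],
#     ground_truth_answers=[["Berlin"]],
#     predicted_answers=[["Berlin"]],
# )

# After this commit:
result = evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"]])
```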