refactor: Remove questions inputs from evaluators (#7466)

* Remove questions input from AnswerExactMatchEvaluator

* Remove questions input from DocumentRecallEvaluator
Silvano Cerza 2024-04-04 14:14:18 +02:00 committed by GitHub
parent 12acb3f12e
commit dc87f51759
4 changed files with 20 additions and 106 deletions


@@ -7,9 +7,9 @@ from haystack.core.component import component
 class AnswerExactMatchEvaluator:
     """
     Evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
-    The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
-    matched one of the ground truth answers.
-    Each question can have multiple ground truth answers and multiple predicted answers.
+    The result is a number from 0.0 to 1.0, it represents the proportion any predicted answer
+    that matched one of the ground truth answers.
+    There can be multiple ground truth answers and multiple predicted answers as input.
 
     Usage example:
     ```python
@@ -17,7 +17,6 @@ class AnswerExactMatchEvaluator:
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Berlin"], ["Lyon"]],
     )
@@ -30,15 +29,11 @@ class AnswerExactMatchEvaluator:
     """
 
     @component.output_types(individual_scores=List[int], score=float)
-    def run(
-        self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
-    ) -> Dict[str, Any]:
+    def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
         """
         Run the AnswerExactMatchEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_answers` and `retrieved_answers` must have the same length.
 
-        :param questions:
-            A list of questions.
         :param ground_truth_answers:
            A list of expected answers for each question.
         :param predicted_answers:
@@ -49,8 +44,8 @@ class AnswerExactMatchEvaluator:
             - `score` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
               answer matched one of the ground truth answers.
         """
-        if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
-            raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
+        if not len(ground_truth_answers) == len(predicted_answers):
+            raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")
 
         matches = []
         for truths, extracted in zip(ground_truth_answers, predicted_answers):
@@ -60,6 +55,6 @@ class AnswerExactMatchEvaluator:
                 matches.append(0)
 
         # The proportion of questions where any predicted answer matched one of the ground truth answers
-        average = sum(matches) / len(questions)
+        average = sum(matches) / len(predicted_answers)
 
         return {"individual_scores": matches, "score": average}


@@ -31,16 +31,15 @@ class RecallMode(Enum):
 @component
 class DocumentRecallEvaluator:
     """
-    Evaluator that calculates the Recall score for a list of questions.
+    Evaluator that calculates the Recall score for a list of documents.
     Returns both a list of scores for each question and the average.
-    Each question can have multiple ground truth documents and multiple predicted documents.
+    There can be multiple ground truth documents and multiple predicted documents as input.
 
     Usage example:
     ```python
     from haystack.components.evaluators import DocumentRecallEvaluator
     evaluator = DocumentRecallEvaluator()
     result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
         ground_truth_answers=[["Berlin"], ["Paris"]],
         predicted_answers=[["Paris"], ["London"]],
     )
@@ -80,17 +79,12 @@ class DocumentRecallEvaluator:
     @component.output_types(score=float, individual_scores=List[float])
     def run(
-        self,
-        questions: List[str],
-        ground_truth_documents: List[List[Document]],
-        retrieved_documents: List[List[Document]],
+        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
     ) -> Dict[str, Any]:
         """
         Run the DocumentRecallEvaluator on the given inputs.
-        All lists must have the same length.
+        `ground_truth_documents` and `retrieved_documents` must have the same length.
 
-        :param questions:
-            A list of questions.
         :param ground_truth_documents:
             A list of expected documents for each question.
         :param retrieved_documents:
@@ -100,8 +94,8 @@ class DocumentRecallEvaluator:
             - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
              If the mode is `single_hit`, the individual scores are True or False.
         """
-        if not len(questions) == len(ground_truth_documents) == len(retrieved_documents):
-            msg = "The length of questions, ground_truth_documents, and predicted_documents must be the same."
+        if len(ground_truth_documents) != len(retrieved_documents):
+            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
             raise ValueError(msg)
 
         scores = []
@@ -109,4 +103,4 @@ class DocumentRecallEvaluator:
             score = self.mode_function(ground_truth, retrieved)
             scores.append(score)
 
-        return {"score": sum(scores) / len(questions), "individual_scores": scores}
+        return {"score": sum(scores) / len(retrieved_documents), "individual_scores": scores}


@@ -5,33 +5,21 @@ from haystack.components.evaluators import AnswerExactMatchEvaluator
 def test_run_with_all_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["Paris"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["Paris"]])
 
     assert result == {"individual_scores": [1, 1], "score": 1.0}
 
 
 def test_run_with_no_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Paris"], ["London"]])
 
     assert result == {"individual_scores": [0, 0], "score": 0.0}
 
 
 def test_run_with_partial_matching():
     evaluator = AnswerExactMatchEvaluator()
-    result = evaluator.run(
-        questions=["What is the capital of Germany?", "What is the capital of France?"],
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Berlin"], ["London"]],
-    )
+    result = evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"], ["London"]])
 
     assert result == {"individual_scores": [1, 0], "score": 0.5}
@@ -39,14 +27,6 @@ def test_run_with_partial_matching():
 def test_run_with_complex_data():
     evaluator = AnswerExactMatchEvaluator()
     result = evaluator.run(
-        questions=[
-            "In what country is Normandy located?",
-            "When was the Latin version of the word Norman first recorded?",
-            "What developed in Normandy during the 1100s?",
-            "In what century did important classical music developments occur in Normandy?",
-            "From which countries did the Norse originate?",
-            "What century did the Normans first gain their separate identity?",
-        ],
         ground_truth_answers=[
             ["France"],
             ["9th century", "9th"],
@@ -71,22 +51,7 @@ def test_run_with_different_lengths():
     evaluator = AnswerExactMatchEvaluator()
     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"], ["London"]])
 
     with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"]],
-            predicted_answers=[["Berlin"], ["London"]],
-        )
-
-    with pytest.raises(ValueError):
-        evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
-            ground_truth_answers=[["Berlin"], ["Paris"]],
-            predicted_answers=[["Berlin"]],
-        )
+        evaluator.run(ground_truth_answers=[["Berlin"], ["Paris"]], predicted_answers=[["Berlin"]])
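The test updates double as a migration guide for callers: the `questions` keyword is simply dropped and everything else stays the same. A minimal sketch of the call-site change, using the same toy data as the tests (the commented output reflects the dictionary format asserted above):

```python
from haystack.components.evaluators import AnswerExactMatchEvaluator

evaluator = AnswerExactMatchEvaluator()

# Before #7466 the call also carried the questions:
# result = evaluator.run(
#     questions=["What is the capital of Germany?"],
#     ground_truth_answers=[["Berlin"]],
#     predicted_answers=[["Berlin"]],
# )

# After #7466 only the two answer lists are passed:
result = evaluator.run(ground_truth_answers=[["Berlin"]], predicted_answers=[["Berlin"]])
print(result)  # {'individual_scores': [1], 'score': 1.0}
```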


@@ -16,7 +16,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -25,7 +24,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -34,7 +32,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -43,14 +40,6 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -78,21 +67,12 @@ class TestDocumentRecallEvaluatorSingleHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )
 
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
@@ -105,7 +85,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_all_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
         )
@@ -114,7 +93,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_no_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
         )
@@ -123,7 +101,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_partial_matching(self, evaluator):
         result = evaluator.run(
-            questions=["What is the capital of Germany?", "What is the capital of France?"],
             ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
             retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
         )
@@ -132,14 +109,6 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_complex_data(self, evaluator):
         result = evaluator.run(
-            questions=[
-                "In what country is Normandy located?",
-                "When was the Latin version of the word Norman first recorded?",
-                "What developed in Normandy during the 1100s?",
-                "In what century did important classical music developments occur in Normandy?",
-                "From which countries did the Norse originate?",
-                "What century did the Normans first gain their separate identity?",
-            ],
             ground_truth_documents=[
                 [Document(content="France")],
                 [Document(content="9th century"), Document(content="9th")],
@@ -172,21 +141,12 @@ class TestDocumentRecallEvaluatorMultiHit:
     def test_run_with_different_lengths(self, evaluator):
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?"],
-                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
-                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
-            )
-
-        with pytest.raises(ValueError):
-            evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")]],
                 retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
             )
 
         with pytest.raises(ValueError):
             evaluator.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"],
                 ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                 retrieved_documents=[[Document(content="Berlin")]],
             )
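The two test classes exercise the two recall modes. A short usage sketch of how a caller picks a mode under the new signature; the `RecallMode` enum comes from the hunk context above, while the default mode and the exact constructor argument are assumptions, not something this diff shows:

```python
from haystack import Document
from haystack.components.evaluators.document_recall import DocumentRecallEvaluator, RecallMode

# Assumption: single-hit is the default mode, matching the first test class.
single_hit = DocumentRecallEvaluator()
multi_hit = DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT)

ground_truth = [[Document(content="Berlin"), Document(content="Munich")]]
retrieved = [[Document(content="Berlin"), Document(content="London")]]

# Both calls use the post-refactor signature: no questions argument.
print(single_hit.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved))
print(multi_hit.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved))
```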