Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-08-17 13:07:42 +00:00
docs: Fix eval metric examples in docstrings (#7505)
* fix eval metric docstrings, change type of individual scores
* change import order
* change exactmatch docstring to single ground truth answer
* change exactmatch comment to single ground truth answer
* reverted changing docs to single ground truth
* add warm up in SASEvaluator example
* fix FaithfulnessEvaluator docstring example
* extend FaithfulnessEvaluator docstring example
* Update FaithfulnessEvaluator init docstring
* Remove outdated default from LLMEvaluator docstring
* Add examples param to LLMEvaluator docstring example
* Add import and print to LLMEvaluator docstring example
This commit is contained in: parent 932213e8db, commit e974a23fa3
@@ -15,7 +15,8 @@ class DocumentMAPEvaluator:

     Usage example:
     ```python
-    from haystack.components.evaluators import AnswerExactMatchEvaluator
+    from haystack import Document
+    from haystack.components.evaluators import DocumentMAPEvaluator

     evaluator = DocumentMAPEvaluator()
     result = evaluator.run(
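A note for readers following the numbers in these retrieval-metric docstring examples: mean average precision averages, per query, the precision at every rank where a relevant document appears, and then averages across queries. The sketch below is illustrative only, is not Haystack's implementation, and all helper names and data in it are made up.

```python
from typing import List


def average_precision(ground_truth: List[str], retrieved: List[str]) -> float:
    """Average of precision@k over the ranks k at which a relevant document appears."""
    relevant = set(ground_truth)
    hits = 0
    precisions = []
    for rank, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / len(precisions) if precisions else 0.0


def mean_average_precision(ground_truths: List[List[str]], retrievals: List[List[str]]) -> float:
    """Mean of the per-query average-precision scores."""
    per_query = [average_precision(gt, ret) for gt, ret in zip(ground_truths, retrievals)]
    return sum(per_query) / len(per_query)


# First query: perfect retrieval -> AP 1.0.
# Second query: an irrelevant document at rank 2 -> AP (1/1 + 2/3) / 2, about 0.83.
print(mean_average_precision(
    ground_truths=[["Berlin"], ["9th century", "9th"]],
    retrievals=[["Berlin"], ["9th century", "10th century", "9th"]],
))  # roughly 0.92 for this data
```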
@@ -16,7 +16,9 @@ class DocumentMRREvaluator:

     Usage example:
     ```python
-    from haystack.components.evaluators import AnswerExactMatchEvaluator
+    from haystack import Document
+    from haystack.components.evaluators import DocumentMRREvaluator
+
     evaluator = DocumentMRREvaluator()
     result = evaluator.run(
         ground_truth_documents=[
@@ -29,9 +31,9 @@ class DocumentMRREvaluator:
         ],
     )
     print(result["individual_scores"])
-    # [1.0, 0.8333333333333333]
+    # [1.0, 1.0]
     print(result["score"])
-    # 0.9166666666666666
+    # 1.0
     ```
     """

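The corrected MRR output above is [1.0, 1.0] with an overall score of 1.0, which is what mean reciprocal rank gives whenever each query's first relevant document sits at rank 1. Below is a minimal, standalone sketch of reciprocal rank; it is not Haystack's code, and the sample data is borrowed from the recall example later in this commit and only assumed to resemble the MRR docstring's data.

```python
from typing import List


def reciprocal_rank(ground_truth: List[str], retrieved: List[str]) -> float:
    """1 / rank of the first relevant document, or 0.0 if none was retrieved."""
    relevant = set(ground_truth)
    for rank, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            return 1.0 / rank
    return 0.0


# Both queries have a relevant document at rank 1, so each reciprocal rank is 1.0
# and the mean is 1.0 -- consistent with the corrected docstring output above.
individual_scores = [
    reciprocal_rank(["France"], ["France"]),
    reciprocal_rank(["9th century", "9th"], ["9th century", "10th century", "9th"]),
]
print(individual_scores)                                # [1.0, 1.0]
print(sum(individual_scores) / len(individual_scores))  # 1.0
```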
@@ -37,16 +37,24 @@ class DocumentRecallEvaluator:

     Usage example:
     ```python
+    from haystack import Document
     from haystack.components.evaluators import DocumentRecallEvaluator

     evaluator = DocumentRecallEvaluator()
     result = evaluator.run(
-        ground_truth_answers=[["Berlin"], ["Paris"]],
-        predicted_answers=[["Paris"], ["London"]],
+        ground_truth_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="9th")],
+        ],
+        retrieved_documents=[
+            [Document(content="France")],
+            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
+        ],
     )
     print(result["individual_scores"])
-    # [0.0, 0.0]
+    # [1.0, 1.0]
     print(result["score"])
-    # 0.0
+    # 1.0
     ```
     """

@@ -63,12 +71,12 @@ class DocumentRecallEvaluator:
         mode_functions = {RecallMode.SINGLE_HIT: self._recall_single_hit, RecallMode.MULTI_HIT: self._recall_multi_hit}
         self.mode_function = mode_functions[mode]

-    def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> bool:
+    def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
         unique_truths = {g.content for g in ground_truth_documents}
         unique_retrievals = {p.content for p in retrieved_documents}
         retrieved_ground_truths = unique_truths.intersection(unique_retrievals)

-        return len(retrieved_ground_truths) > 0
+        return float(len(retrieved_ground_truths) > 0)

     def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
         unique_truths = {g.content for g in ground_truth_documents}
@@ -92,7 +100,7 @@ class DocumentRecallEvaluator:
         A dictionary with the following outputs:
         - `score` - The average of calculated scores.
         - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
-            If the mode is `single_hit`, the individual scores are True or False.
+            If the mode is `single_hit`, the individual scores are 0 or 1.
         """
         if len(ground_truth_documents) != len(retrieved_documents):
             msg = "The length of ground_truth_documents and retrieved_documents must be the same."
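The hunks above change `_recall_single_hit` to return a float and document single-hit scores as 0 or 1. Here is a self-contained sketch of the two recall modes: the single-hit function mirrors the logic visible in the diff, while the multi-hit function is only a sketch of the documented "proportion of matching documents retrieved" behaviour, not the actual source.

```python
from typing import List


def recall_single_hit(ground_truths: List[str], retrieved: List[str]) -> float:
    """1.0 if at least one ground-truth document was retrieved, else 0.0 (mirrors the diff above)."""
    unique_truths = set(ground_truths)
    unique_retrievals = set(retrieved)
    return float(len(unique_truths & unique_retrievals) > 0)


def recall_multi_hit(ground_truths: List[str], retrieved: List[str]) -> float:
    """Proportion of ground-truth documents retrieved (sketch of the documented behaviour)."""
    unique_truths = set(ground_truths)
    unique_retrievals = set(retrieved)
    return len(unique_truths & unique_retrievals) / len(unique_truths)


print(recall_single_hit(["9th century", "9th"], ["9th century", "10th century"]))  # 1.0
print(recall_multi_hit(["9th century", "9th"], ["9th century", "10th century"]))   # 0.5
```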
@@ -29,11 +29,14 @@ class FaithfulnessEvaluator(LLMEvaluator):
     responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
     evaluator = FaithfulnessEvaluator()
     result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
-    print(results["evaluator"])
-    # {'results': [{'statements': ['Python is a high-level general-purpose programming language.',
-    # 'Python was created by George Lucas.'], 'statement_scores':
-    # [1, 0], 'score': 0.5}], 'score': 0.5, 'individual_scores': [0.5]}

+    print(result["individual_scores"])
+    # [0.5]
+    print(result["score"])
+    # 0.5
+    print(result["results"])
+    # [{'statements': ['Python is a high-level general-purpose programming language.',
+    'Python was created by George Lucas.'], 'statement_scores': [1, 0], 'score': 0.5}]
     ```
     """

@@ -44,7 +47,7 @@ class FaithfulnessEvaluator(LLMEvaluator):
         api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
     ):
         """
-        Creates an instance of LLMEvaluator.
+        Creates an instance of FaithfulnessEvaluator.

         :param examples:
             Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
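Judging from the corrected example output above, each response's faithfulness score is the mean of its statement scores, and the overall score is the mean of the per-response scores. The tiny aggregation sketch below only reproduces that arithmetic; the real evaluator extracts and scores statements with an LLM (hence the OPENAI_API_KEY default), and nothing here calls it.

```python
# One response in the example produced two extracted statements scored [1, 0].
statement_scores = [1, 0]

# Per-response score: mean of its statement scores.
individual_score = sum(statement_scores) / len(statement_scores)
print([individual_score])  # [0.5]  -> result["individual_scores"]

# Overall score: mean over all responses (only one response here).
individual_scores = [individual_score]
print(sum(individual_scores) / len(individual_scores))  # 0.5  -> result["score"]
```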
@@ -19,17 +19,24 @@ class LLMEvaluator:

     Usage example:
     ```python
+    from typing import List
     from haystack.components.evaluators import LLMEvaluator
     evaluator = LLMEvaluator(
         instructions="Is this answer problematic for children?",
         inputs=[("responses", List[str])],
         outputs=["score"],
+        examples=[
+            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+        ],
     )
     RESPONSES = [
         "Football is the most popular sport with around 4 billion followers worldwide",
         "Python language was created by Guido van Rossum.",
     ]
     results = evaluator.run(responses=RESPONSES)
+    print(results)
+    # {'results': [{'score': 0}, {'score': 0}]}
     ```
     """

@@ -54,7 +61,6 @@ class LLMEvaluator:
             Each input is a tuple of an input name and input type. Input types must be lists.
         :param outputs:
             Output names of the evaluation results. They correspond to keys in the output dictionary.
-            The default is a single key "score".
         :param examples:
             Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
             `outputs` parameters.
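The usage example above now passes the required `examples` parameter. As a further illustration of the same constructor arguments (`instructions`, `inputs`, `outputs`, `examples`), here is a hedged sketch of a custom metric; the "conciseness" instruction, the example texts, and the printed output are invented, and running it needs an LLM API key (OPENAI_API_KEY by default, judging from the FaithfulnessEvaluator signature earlier in this commit).

```python
from typing import List

from haystack.components.evaluators import LLMEvaluator

# A hypothetical "conciseness" metric built from the same parameters the
# docstring example uses; instruction text and examples are made up.
evaluator = LLMEvaluator(
    instructions="Is this answer concise? Answer with 1 for yes and 0 for no.",
    inputs=[("responses", List[str])],
    outputs=["score"],
    examples=[
        {"inputs": {"responses": "Paris."}, "outputs": {"score": 1}},
        {"inputs": {"responses": "Well, that depends on what you mean by capital city..."}, "outputs": {"score": 0}},
    ],
)

# run() sends the prompt to an LLM, so an API key must be configured.
results = evaluator.run(responses=["The capital of France is Paris."])
print(results)  # e.g. {'results': [{'score': 1}]}
```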
@@ -26,6 +26,7 @@ class SASEvaluator:
     from haystack.components.evaluators.sas_evaluator import SASEvaluator

     evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+    evaluator.warm_up()
     ground_truths = [
         "A construction budget of US $2.3 billion",
         "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
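The SASEvaluator example now calls `warm_up()` before `run()`, which loads the underlying model. A short sketch of the corrected call order follows; note that the `run()` parameter names below (`ground_truth_answers`, `predicted_answers`) are an assumption borrowed from the answer evaluators and are not visible in this hunk.

```python
from haystack.components.evaluators.sas_evaluator import SASEvaluator

evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
evaluator.warm_up()  # load the model once before the first run(), as the fixed docstring shows

# Parameter names here are assumed (not shown in this hunk); the component
# scores each predicted answer against its ground-truth answer.
result = evaluator.run(
    ground_truth_answers=["A construction budget of US $2.3 billion"],
    predicted_answers=["The construction budget was about US $2.3 billion"],
)
print(result["score"])
```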
@@ -19,7 +19,7 @@ class TestDocumentRecallEvaluatorSingleHit:
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

    def test_run_with_no_matching(self, evaluator):
@@ -27,7 +27,7 @@ class TestDocumentRecallEvaluatorSingleHit:
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

    def test_run_with_partial_matching(self, evaluator):
@@ -35,7 +35,7 @@ class TestDocumentRecallEvaluatorSingleHit:
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

    def test_run_with_complex_data(self, evaluator):
@@ -62,7 +62,8 @@ class TestDocumentRecallEvaluatorSingleHit:
                ],
            ],
        )
-        assert result == {"individual_scores": [True, True, True, True, False, True], "score": 0.8333333333333334}
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
+        assert result == {"individual_scores": [1, 1, 1, 1, 0, 1], "score": 0.8333333333333334}

    def test_run_with_different_lengths(self, evaluator):
        with pytest.raises(ValueError):
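The single-hit tests above add `assert all(isinstance(individual_score, float) ...)` on top of the dictionary comparison. The extra check is needed because Python bools compare equal to 0 and 1, so the expected dictionary would match both the old bool values and the new floats. A tiny standalone demonstration:

```python
# Bools compare equal to ints and floats, so dictionary equality alone cannot
# tell the old bool scores from the new float scores.
print(True == 1 == 1.0)           # True
print(isinstance(True, float))    # False -- bool is a subclass of int, not float
print(isinstance(1.0, float))     # True
print(float(True), float(False))  # 1.0 0.0 -- what the fixed _recall_single_hit returns
```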
@@ -88,7 +89,7 @@ class TestDocumentRecallEvaluatorMultiHit:
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

    def test_run_with_no_matching(self, evaluator):
@@ -96,7 +97,7 @@ class TestDocumentRecallEvaluatorMultiHit:
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

    def test_run_with_partial_matching(self, evaluator):
@@ -104,7 +105,7 @@ class TestDocumentRecallEvaluatorMultiHit:
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

    def test_run_with_complex_data(self, evaluator):
@@ -136,6 +137,7 @@ class TestDocumentRecallEvaluatorMultiHit:
                ],
            ],
        )
+        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}

    def test_run_with_different_lengths(self, evaluator):