docs: Fix eval metric examples in docstrings (#7505)

* fix eval metric docstrings, change type of individual scores

* change import order

* change exactmatch docstring to single ground truth answer

* change exactmatch comment to single ground truth answer

* reverted changing docs to single ground truth

* add warm up in SASEvaluator example

* fix FaithfulnessEvaluator docstring example

* extend FaithfulnessEvaluator docstring example

* Update FaithfulnessEvaluator init docstring

* Remove outdated default from LLMEvaluator docstring

* Add examples param to LLMEvaluator docstring example

* Add import and print to LLMEvaluator docstring example
Julian Risch 2024-04-10 11:00:20 +02:00 committed by GitHub
parent 932213e8db
commit e974a23fa3
7 changed files with 47 additions and 24 deletions


@@ -15,7 +15,8 @@ class DocumentMAPEvaluator:
 Usage example:
 ```python
-from haystack.components.evaluators import AnswerExactMatchEvaluator
+from haystack import Document
+from haystack.components.evaluators import DocumentMAPEvaluator

 evaluator = DocumentMAPEvaluator()
 result = evaluator.run(
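For reference, here is a minimal, Haystack-free sketch of the mean-average-precision arithmetic behind the corrected `DocumentMAPEvaluator` example; the `average_precision` helper and the document contents below are illustrative assumptions, not part of this commit.

```python
# Standalone sketch of mean average precision (MAP); NOT Haystack's implementation.
def average_precision(ground_truth: list[str], retrieved: list[str]) -> float:
    """Average of precision@k taken at each rank where a relevant document appears."""
    relevant = set(ground_truth)
    hits, precisions = 0, []
    for rank, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / len(relevant) if relevant else 0.0

# One query with its relevant document at rank 1, one with it at rank 2.
individual_scores = [
    average_precision(["France"], ["France"]),
    average_precision(["9th century"], ["10th century", "9th century"]),
]
print(individual_scores)                                # [1.0, 0.5]
print(sum(individual_scores) / len(individual_scores))  # 0.75 (the aggregate MAP)
```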


@@ -16,7 +16,9 @@ class DocumentMRREvaluator:
 Usage example:
 ```python
-from haystack.components.evaluators import AnswerExactMatchEvaluator
+from haystack import Document
+from haystack.components.evaluators import DocumentMRREvaluator

 evaluator = DocumentMRREvaluator()
 result = evaluator.run(
     ground_truth_documents=[
@@ -29,9 +31,9 @@ class DocumentMRREvaluator:
     ],
 )
 print(result["individual_scores"])
-# [1.0, 0.8333333333333333]
+# [1.0, 1.0]
 print(result["score"])
-# 0.9166666666666666
+# 1.0
 ```
 """


@@ -37,16 +37,24 @@ class DocumentRecallEvaluator:
 Usage example:
 ```python
+from haystack import Document
 from haystack.components.evaluators import DocumentRecallEvaluator

 evaluator = DocumentRecallEvaluator()
 result = evaluator.run(
-    ground_truth_answers=[["Berlin"], ["Paris"]],
-    predicted_answers=[["Paris"], ["London"]],
+    ground_truth_documents=[
+        [Document(content="France")],
+        [Document(content="9th century"), Document(content="9th")],
+    ],
+    retrieved_documents=[
+        [Document(content="France")],
+        [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
+    ],
 )
 print(result["individual_scores"])
-# [0.0, 0.0]
+# [1.0, 1.0]
 print(result["score"])
-# 0.0
+# 1.0
 ```
 """
@@ -63,12 +71,12 @@ class DocumentRecallEvaluator:
 mode_functions = {RecallMode.SINGLE_HIT: self._recall_single_hit, RecallMode.MULTI_HIT: self._recall_multi_hit}
 self.mode_function = mode_functions[mode]

-def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> bool:
+def _recall_single_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
     unique_truths = {g.content for g in ground_truth_documents}
     unique_retrievals = {p.content for p in retrieved_documents}
     retrieved_ground_truths = unique_truths.intersection(unique_retrievals)
-    return len(retrieved_ground_truths) > 0
+    return float(len(retrieved_ground_truths) > 0)

 def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_documents: List[Document]) -> float:
     unique_truths = {g.content for g in ground_truth_documents}
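The return-type change above is small but visible to callers: the single-hit score is now a float, like the multi-hit score, instead of a bool. A quick illustration of the difference (not the real method):

```python
retrieved_ground_truths = {"France"}                # non-empty intersection, as in the example

before = len(retrieved_ground_truths) > 0           # bool
after = float(len(retrieved_ground_truths) > 0)     # float

print(before, type(before))  # True <class 'bool'>
print(after, type(after))    # 1.0 <class 'float'>
print(before == after)       # True -- same value, different type, hence the new type checks in the tests
```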
@@ -92,7 +100,7 @@ class DocumentRecallEvaluator:
 A dictionary with the following outputs:
 - `score` - The average of calculated scores.
 - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
-  If the mode is `single_hit`, the individual scores are True or False.
+  If the mode is `single_hit`, the individual scores are 0 or 1.
 """
 if len(ground_truth_documents) != len(retrieved_documents):
     msg = "The length of ground_truth_documents and retrieved_documents must be the same."


@@ -29,11 +29,14 @@ class FaithfulnessEvaluator(LLMEvaluator):
 responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
 evaluator = FaithfulnessEvaluator()
 result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
-print(results["evaluator"])
-# {'results': [{'statements': ['Python is a high-level general-purpose programming language.',
-# 'Python was created by George Lucas.'], 'statement_scores':
-# [1, 0], 'score': 0.5}], 'score': 0.5, 'individual_scores': [0.5]}
+print(result["individual_scores"])
+# [0.5]
+print(result["score"])
+# 0.5
+print(result["results"])
+# [{'statements': ['Python is a high-level general-purpose programming language.',
+# 'Python was created by George Lucas.'], 'statement_scores': [1, 0], 'score': 0.5}]
 ```
 """
@@ -44,7 +47,7 @@ class FaithfulnessEvaluator(LLMEvaluator):
     api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
 ):
     """
-    Creates an instance of LLMEvaluator.
+    Creates an instance of FaithfulnessEvaluator.

     :param examples:
         Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.


@@ -19,17 +19,24 @@ class LLMEvaluator:
 Usage example:
 ```python
+from typing import List
 from haystack.components.evaluators import LLMEvaluator

 evaluator = LLMEvaluator(
     instructions="Is this answer problematic for children?",
     inputs=[("responses", List[str])],
     outputs=["score"],
+    examples=[
+        {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+        {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+    ],
 )
 RESPONSES = [
     "Football is the most popular sport with around 4 billion followers worldwide",
     "Python language was created by Guido van Rossum.",
 ]
 results = evaluator.run(responses=RESPONSES)
+print(results)
+# {'results': [{'score': 0}, {'score': 0}]}
 ```
 """
@@ -54,7 +61,6 @@ class LLMEvaluator:
     Each input is a tuple of an input name and input type. Input types must be lists.
 :param outputs:
     Output names of the evaluation results. They correspond to keys in the output dictionary.
-    The default is a single key "score".
 :param examples:
     Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
     `outputs` parameters.


@@ -26,6 +26,7 @@ class SASEvaluator:
 from haystack.components.evaluators.sas_evaluator import SASEvaluator

 evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+evaluator.warm_up()
 ground_truths = [
     "A construction budget of US $2.3 billion",
     "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",


@@ -19,7 +19,7 @@ class TestDocumentRecallEvaluatorSingleHit:
     ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
     retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

 def test_run_with_no_matching(self, evaluator):
@@ -27,7 +27,7 @@ class TestDocumentRecallEvaluatorSingleHit:
     ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
     retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

 def test_run_with_partial_matching(self, evaluator):
@@ -35,7 +35,7 @@ class TestDocumentRecallEvaluatorSingleHit:
     ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
     retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

 def test_run_with_complex_data(self, evaluator):
@@ -62,7 +62,8 @@ class TestDocumentRecallEvaluatorSingleHit:
     ],
     ],
 )
-assert result == {"individual_scores": [True, True, True, True, False, True], "score": 0.8333333333333334}
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
+assert result == {"individual_scores": [1, 1, 1, 1, 0, 1], "score": 0.8333333333333334}

 def test_run_with_different_lengths(self, evaluator):
     with pytest.raises(ValueError):
@@ -88,7 +89,7 @@ class TestDocumentRecallEvaluatorMultiHit:
     ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
     retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

 def test_run_with_no_matching(self, evaluator):
@@ -96,7 +97,7 @@ class TestDocumentRecallEvaluatorMultiHit:
     ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
     retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

 def test_run_with_partial_matching(self, evaluator):
@@ -104,7 +105,7 @@ class TestDocumentRecallEvaluatorMultiHit:
     ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
     retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

 def test_run_with_complex_data(self, evaluator):
@@ -136,6 +137,7 @@ class TestDocumentRecallEvaluatorMultiHit:
     ],
     ],
 )
+assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
 assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}

 def test_run_with_different_lengths(self, evaluator):
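The new `isinstance` asserts are needed because the dict-equality asserts alone cannot distinguish a bool score from a float one: Python treats `True == 1.0` and `False == 0.0` as equal. A quick demonstration:

```python
# A bool-typed score would slip through a pure equality check...
assert {"individual_scores": [True, False], "score": 0.5} == {"individual_scores": [1.0, 0.0], "score": 0.5}

# ...so the tests now also pin down the type of each individual score.
result = {"individual_scores": [1.0, 0.0], "score": 0.5}
assert all(isinstance(score, float) for score in result["individual_scores"])
assert not all(isinstance(score, float) for score in [True, False])  # bools are not floats
```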