from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace


class FaithfulnessEvaluator(LLMEvaluator):
    """
    Evaluator that checks if a generated answer can be inferred from the provided contexts.

    An LLM separates the answer into multiple statements and checks whether each statement can be inferred from the
    context or not. The final score for the full answer is a number from 0.0 to 1.0. It represents the proportion of
    statements that can be inferred from the provided contexts.

    Usage example:
    ```python
    from haystack.components.evaluators import FaithfulnessEvaluator

    questions = ["Who created the Python language?"]
    contexts = [
        [
            "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
        ],
    ]
    responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
    evaluator = FaithfulnessEvaluator()
    result = evaluator.run(questions=questions, contexts=contexts, responses=responses)

    print(result["individual_scores"])
    # [0.5]
    print(result["score"])
    # 0.5
    print(result["results"])
    # [{'statements': ['Python is a high-level general-purpose programming language.',
    # 'Python was created by George Lucas.'], 'statement_scores': [1, 0], 'score': 0.5}]
    ```
    """

    def __init__(
        self,
        examples: Optional[List[Dict[str, Any]]] = None,
        api: str = "openai",
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
    ):
        """
        Creates an instance of FaithfulnessEvaluator.

        :param examples:
            Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
            Each example must be a dictionary with keys "inputs" and "outputs".
            "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
            "outputs" must be a dictionary with keys "statements" and "statement_scores".
            Expected format:
            [{
                "inputs": {
                    "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
                    "statement_scores": [1, 0],
                },
            }]
        :param api:
            The API to use for calling an LLM through a Generator.
            Supported APIs: "openai".
        :param api_key:
            The API key.
        """
        self.instructions = (
            "Your task is to judge the faithfulness or groundedness of statements based "
            "on context information. First, please extract statements from a provided "
            "response to a question. Second, calculate a faithfulness score for each "
            "statement made in the response. The score is 1 if the statement can be "
            "inferred from the provided context or 0 if it cannot be inferred."
        )
        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
        self.outputs = ["statements", "statement_scores"]
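        # Default few-shot examples, used when none are supplied. Note that faithfulness
        # is judged against the provided context, not against real-world facts, which is
        # why "Berlin was founded in 1244" scores 1 below.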
        self.examples = examples or [
            {
                "inputs": {
                    "questions": "What is the capital of Germany and when was it founded?",
                    "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
                },
                "outputs": {
                    "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
                    "statement_scores": [1, 1],
                },
            },
            {
                "inputs": {
                    "questions": "What is the capital of France?",
                    "contexts": ["Berlin is the capital of Germany."],
                    "responses": "Paris",
                },
                "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
            },
            {
                "inputs": {
                    "questions": "What is the capital of Italy?",
                    "contexts": ["Rome is the capital of Italy."],
                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
                    "statement_scores": [1, 0],
                },
            },
        ]
        self.api = api
        self.api_key = api_key
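
        # Delegate prompt construction, input validation, and the LLM call itself to
        # the LLMEvaluator base class.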
        super().__init__(
            instructions=self.instructions,
            inputs=self.inputs,
            outputs=self.outputs,
            examples=self.examples,
            api=self.api,
            api_key=self.api_key,
        )

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run the LLM evaluator.

        :param inputs:
            The input values to evaluate. The keys are the input names and the values are lists of input values.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean faithfulness score over all the provided input answers.
                - `individual_scores`: A list of faithfulness scores for each input answer.
                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
        """
        result = super().run(**inputs)

        # calculate average statement faithfulness score per query
        for res in result["results"]:
            res["score"] = np_mean(res["statement_scores"])

        # calculate average answer faithfulness score over all queries
        result["score"] = np_mean([res["score"] for res in result["results"]])
        result["individual_scores"] = [res["score"] for res in result["results"]]

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "FaithfulnessEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
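

# Minimal serialization round-trip sketch (illustrative, not part of the component).
# It assumes the to_dict() inherited from LLMEvaluator and that OPENAI_API_KEY is set,
# since environment-variable Secrets are serialized by reference, not by value:
#
#   evaluator = FaithfulnessEvaluator()
#   data = evaluator.to_dict()
#   restored = FaithfulnessEvaluator.from_dict(data)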