Mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-08 13:06:29 +00:00)

feat: Add ContextRelevanceEvaluator component (#7519)

* feat: Add ContextRelevanceEvaluator component
* reno
* fix expected inputs and example docstring
* remove responses parameter from tests
* specify inputs explicitly
* add new evaluator to api reference docs

This commit is contained in:
parent 5d0ccfe7d4
commit b12e0db134
@@ -4,6 +4,7 @@ loaders:
     modules:
       [
         "answer_exact_match",
+        "context_relevance",
         "document_map",
         "document_mrr",
         "document_recall",
@@ -1,4 +1,5 @@
 from .answer_exact_match import AnswerExactMatchEvaluator
+from .context_relevance import ContextRelevanceEvaluator
 from .document_map import DocumentMAPEvaluator
 from .document_mrr import DocumentMRREvaluator
 from .document_recall import DocumentRecallEvaluator
@@ -9,6 +10,7 @@ from .sas_evaluator import SASEvaluator
 
 __all__ = [
     "AnswerExactMatchEvaluator",
+    "ContextRelevanceEvaluator",
     "DocumentMAPEvaluator",
     "DocumentMRREvaluator",
     "DocumentRecallEvaluator",
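With the re-export in place, the component is importable straight from the evaluators package, which is the path the docstring example and the tests below use:

```python
# Available from the package root after this change; the default constructor
# reads the OPENAI_API_KEY environment variable at instantiation time.
from haystack.components.evaluators import ContextRelevanceEvaluator
```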
haystack/components/evaluators/context_relevance.py (new file, 154 lines)
@@ -0,0 +1,154 @@
from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
    {
        "inputs": {
            "questions": "What is the capital of Germany?",
            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
        },
        "outputs": {
            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
            "statement_scores": [1, 0],
        },
    },
    {
        "inputs": {"questions": "What is the capital of France?", "contexts": ["Berlin is the capital of Germany."]},
        "outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
    },
    {
        "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
        "outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
    },
]

class ContextRelevanceEvaluator(LLMEvaluator):
    """
    Evaluator that checks if a provided context is relevant to the question.

    An LLM breaks up the context into multiple statements and checks whether each statement is relevant for answering
    the question. The final score for the full context is a number from 0.0 to 1.0. It represents the proportion of
    statements in the context that are relevant to the provided question.

    Usage example:
    ```python
    from haystack.components.evaluators import ContextRelevanceEvaluator

    questions = ["Who created the Python language?"]
    contexts = [
        [
            "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
        ],
    ]

    evaluator = ContextRelevanceEvaluator()
    result = evaluator.run(questions=questions, contexts=contexts)
    print(result["score"])
    # 1.0
    print(result["individual_scores"])
    # [1.0]
    print(result["results"])
    # [{'statements': ['Python, created by Guido van Rossum in the late 1980s.'], 'statement_scores': [1], 'score': 1.0}]
    ```
    """

    def __init__(
        self,
        examples: Optional[List[Dict[str, Any]]] = None,
        api: str = "openai",
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
    ):
        """
        Creates an instance of ContextRelevanceEvaluator.

        :param examples:
            Optional few-shot examples conforming to the expected input and output format of
            ContextRelevanceEvaluator. Default examples will be used if none are provided.
            Each example must be a dictionary with keys "inputs" and "outputs".
            "inputs" must be a dictionary with keys "questions" and "contexts".
            "outputs" must be a dictionary with "statements" and "statement_scores".
            Expected format:
            [{
                "inputs": {
                    "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy."],
                    "statement_scores": [1],
                },
            }]
        :param api:
            The API to use for calling an LLM through a Generator.
            Supported APIs: "openai".
        :param api_key:
            The API key.
        """
        self.instructions = (
            "Your task is to judge how relevant the provided context is for answering a question. "
            "First, please extract statements from the provided context. "
            "Second, calculate a relevance score for each statement in the context. "
            "The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
        )
        self.inputs = [("questions", List[str]), ("contexts", List[List[str]])]
        self.outputs = ["statements", "statement_scores"]
        self.examples = examples or _DEFAULT_EXAMPLES
        self.api = api
        self.api_key = api_key

        super().__init__(
            instructions=self.instructions,
            inputs=self.inputs,
            outputs=self.outputs,
            examples=self.examples,
            api=self.api,
            api_key=self.api_key,
        )

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
        """
        Run the LLM evaluator.

        :param questions:
            A list of questions.
        :param contexts:
            A list of lists of contexts. Each list of contexts corresponds to one question.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean context relevance score over all the provided input questions.
                - `individual_scores`: A list of context relevance scores for each input question.
                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
        """
        result = super().run(questions=questions, contexts=contexts)

        # calculate average statement relevance score per query
        for res in result["results"]:
            res["score"] = np_mean(res["statement_scores"])

        # calculate average context relevance score over all queries
        result["score"] = np_mean([res["score"] for res in result["results"]])
        result["individual_scores"] = [res["score"] for res in result["results"]]

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
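To make the two-level aggregation in `run()` concrete, here is a minimal, self-contained sketch of the same arithmetic on hard-coded statement scores, with no LLM call involved; the numbers mirror the `test_run_calculates_mean_score` test further below.

```python
from numpy import mean as np_mean

# Hypothetical per-question results, shaped like what the LLM evaluator returns:
# statements extracted from each context, each scored 0 or 1 for relevance.
results = [
    {"statements": ["a", "b"], "statement_scores": [1, 0]},
    {"statements": ["c", "d"], "statement_scores": [1, 1]},
]

# Per-question score: mean of that question's statement scores.
for res in results:
    res["score"] = np_mean(res["statement_scores"])  # 0.5 and 1.0

# Overall score: mean of the per-question scores.
score = np_mean([res["score"] for res in results])  # 0.75
individual_scores = [res["score"] for res in results]  # [0.5, 1.0]
```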
@@ -0,0 +1,5 @@
---
features:
  - |
    Add a new ContextRelevanceEvaluator component that can be used to evaluate whether retrieved documents are relevant for answering a question in a RAG pipeline.
    Given a question and a list of retrieved document contents (contexts), an LLM scores how relevant the provided context is for answering the question. The score ranges from 0 to 1.
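The evaluator can also be dropped into a Haystack `Pipeline` like any other component. A minimal sketch, assuming the standard Haystack 2.x `Pipeline` API and an `OPENAI_API_KEY` in the environment; the component name "evaluator" and the sample inputs are illustrative only:

```python
from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator

# Illustrative evaluation pipeline with a single component.
pipeline = Pipeline()
pipeline.add_component("evaluator", ContextRelevanceEvaluator())

questions = ["Who created the Python language?"]
contexts = [["Python was created by Guido van Rossum."]]

# Inputs are routed to the component by name; outputs come back under the same key.
result = pipeline.run({"evaluator": {"questions": questions, "contexts": contexts}})

# "results" is the declared output of the component; when called standalone via
# evaluator.run(...), the returned dict also carries "score" and "individual_scores".
print(result["evaluator"]["results"])
```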
test/components/evaluators/test_context_relevance_evaluator.py (new file, 142 lines)
@@ -0,0 +1,142 @@
import os
from typing import List

import pytest

from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils.auth import Secret


class TestContextRelevanceEvaluator:
    def test_init_default(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()
        assert component.api == "openai"
        assert component.generator.client.api_key == "test-api-key"
        assert component.instructions == (
            "Your task is to judge how relevant the provided context is for answering a question. "
            "First, please extract statements from the provided context. "
            "Second, calculate a relevance score for each statement in the context. "
            "The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
        )
        assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]])]
        assert component.outputs == ["statements", "statement_scores"]
        assert component.examples == [
            {
                "inputs": {
                    "questions": "What is the capital of Germany?",
                    "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
                },
                "outputs": {
                    "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
                    "statement_scores": [1, 0],
                },
            },
            {
                "inputs": {
                    "questions": "What is the capital of France?",
                    "contexts": ["Berlin is the capital of Germany."],
                },
                "outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
            },
            {
                "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
                "outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
            },
        ]

    def test_init_fail_wo_openai_api_key(self, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
            ContextRelevanceEvaluator()

    def test_init_with_parameters(self):
        component = ContextRelevanceEvaluator(
            api_key=Secret.from_token("test-api-key"),
            api="openai",
            examples=[
                {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
                {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
            ],
        )
        assert component.generator.client.api_key == "test-api-key"
        assert component.api == "openai"
        assert component.examples == [
            {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
            {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
        ]

    def test_from_dict(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

        data = {
            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
            "init_parameters": {
                "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                "api": "openai",
                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
            },
        }
        component = ContextRelevanceEvaluator.from_dict(data)
        assert component.api == "openai"
        assert component.generator.client.api_key == "test-api-key"
        assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]

    def test_run_calculates_mean_score(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        def generator_run(self, *args, **kwargs):
            if "Football" in kwargs["prompt"]:
                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
            else:
                return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}

        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [
                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
                "programmers write clear, logical code for both small and large-scale software projects."
            ],
        ]
        results = component.run(questions=questions, contexts=contexts)
        assert results == {
            "individual_scores": [0.5, 1],
            "results": [
                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
                {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
            ],
            "score": 0.75,
        }

    def test_run_missing_parameters(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()
        with pytest.raises(TypeError, match="missing 2 required positional arguments"):
            component.run()

    @pytest.mark.skipif(
        not os.environ.get("OPENAI_API_KEY", None),
        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
    )
    @pytest.mark.integration
    def test_live_run(self):
        questions = ["Who created the Python language?"]
        contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]

        evaluator = ContextRelevanceEvaluator()
        result = evaluator.run(questions=questions, contexts=contexts)
        assert result["score"] == 1.0
        assert result["individual_scores"] == [1.0]
        assert result["results"][0]["score"] == 1.0
        assert result["results"][0]["statement_scores"] == [1.0]
        assert "Guido van Rossum" in result["results"][0]["statements"][0]
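For custom few-shot examples, the constructor expects the same "inputs"/"outputs" structure documented in the `__init__` docstring. A minimal sketch with one made-up example; the question, context, statements, and scores below are purely illustrative, and `Secret.from_token` is used only to avoid depending on an environment variable:

```python
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils import Secret

# One illustrative few-shot example in the documented format: "inputs" holds a
# question and its contexts, "outputs" the extracted statements and a 0/1
# relevance score per statement.
custom_examples = [
    {
        "inputs": {
            "questions": "What is the tallest mountain on Earth?",
            "contexts": ["Mount Everest is the tallest mountain on Earth. It lies in the Himalayas."],
        },
        "outputs": {
            "statements": ["Mount Everest is the tallest mountain on Earth.", "Mount Everest lies in the Himalayas."],
            "statement_scores": [1, 0],
        },
    },
]

evaluator = ContextRelevanceEvaluator(
    examples=custom_examples,
    api="openai",
    api_key=Secret.from_token("test-api-key"),  # placeholder token, not a real key
)
```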