feat: Add ContextRelevanceEvaluator component (#7519)

* feat: Add ContextRelevanceEvaluator component

* reno

* fix expected inputs and example docstring

* remove responses parameter from tests

* specify inputs explicitly

* add new evaluator to api reference docs
Julian Risch 2024-04-22 14:10:00 +02:00 committed by GitHub
parent 5d0ccfe7d4
commit b12e0db134
5 changed files with 304 additions and 0 deletions

@@ -4,6 +4,7 @@ loaders:
modules:
[
"answer_exact_match",
"context_relevance",
"document_map",
"document_mrr",
"document_recall",

@@ -1,4 +1,5 @@
from .answer_exact_match import AnswerExactMatchEvaluator
from .context_relevance import ContextRelevanceEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_recall import DocumentRecallEvaluator
@@ -9,6 +10,7 @@ from .sas_evaluator import SASEvaluator
__all__ = [
"AnswerExactMatchEvaluator",
"ContextRelevanceEvaluator",
"DocumentMAPEvaluator",
"DocumentMRREvaluator",
"DocumentRecallEvaluator",

@@ -0,0 +1,154 @@
from typing import Any, Dict, List, Optional
from numpy import mean as np_mean
from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace
# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
{
"inputs": {
"questions": "What is the capital of Germany?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 0],
},
},
{
"inputs": {"questions": "What is the capital of France?", "contexts": ["Berlin is the capital of Germany."]},
"outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
},
{
"inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
"outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
},
]
class ContextRelevanceEvaluator(LLMEvaluator):
"""
Evaluator that checks if a provided context is relevant to the question.
An LLM extracts statements from the provided context and judges how relevant each statement is for answering the question.
The final score per question is a number from 0.0 to 1.0 and represents the proportion of extracted statements that are
relevant to the question.
Usage example:
```python
from haystack.components.evaluators import ContextRelevanceEvaluator
questions = ["Who created the Python language?"]
contexts = [
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
],
]
evaluator = ContextRelevanceEvaluator()
result = evaluator.run(questions=questions, contexts=contexts)
print(result["score"])
# 1.0
print(result["individual_scores"])
# [1.0]
print(result["results"])
# [{'statements': ['Python, created by Guido van Rossum in the late 1980s.'], 'statement_scores': [1], 'score': 1.0}]
```
"""
def __init__(
self,
examples: Optional[List[Dict[str, Any]]] = None,
api: str = "openai",
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
):
"""
Creates an instance of ContextRelevanceEvaluator.
:param examples:
Optional few-shot examples conforming to the expected input and output format of ContextRelevanceEvaluator.
Default examples will be used if none are provided.
Each example must be a dictionary with keys "inputs" and "outputs".
"inputs" must be a dictionary with keys "questions" and "contexts".
"outputs" must be a dictionary with "statements" and "statement_scores".
Expected format:
[{
"inputs": {
"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
},
"outputs": {
"statements": ["Rome is the capital of Italy."],
"statement_scores": [1],
},
}]
:param api:
The API to use for calling an LLM through a Generator.
Supported APIs: "openai".
:param api_key:
The API key.
"""
self.instructions = (
"Your task is to judge how relevant the provided context is for answering a question. "
"First, please extract statements from the provided context. "
"Second, calculate a relevance score for each statement in the context. "
"The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
)
self.inputs = [("questions", List[str]), ("contexts", List[List[str]])]
self.outputs = ["statements", "statement_scores"]
self.examples = examples or _DEFAULT_EXAMPLES
self.api = api
self.api_key = api_key
super().__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
examples=self.examples,
api=self.api,
api_key=self.api_key,
)
@component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
"""
Run the LLM evaluator.
:param questions:
A list of questions.
:param contexts:
A list of lists of contexts. Each list of contexts corresponds to one question.
:returns:
A dictionary with the following outputs:
- `score`: Mean context relevance score over all the provided input questions.
- `individual_scores`: A list of context relevance scores for each input question.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
"""
result = super().run(questions=questions, contexts=contexts)
# calculate average statement relevance score per query
for res in result["results"]:
res["score"] = np_mean(res["statement_scores"])
# calculate average context relevance score over all queries
result["score"] = np_mean([res["score"] for res in result["results"]])
result["individual_scores"] = [res["score"] for res in result["results"]]
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
"""
Deserialize this component from a dictionary.
:param data:
The dictionary representation of this component.
:returns:
The deserialized component instance.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)
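The snippet below is a minimal usage sketch, not part of the diff above, showing how the `examples` parameter documented in `__init__` could be overridden with custom few-shot examples; it assumes `OPENAI_API_KEY` is exported and the example content is purely illustrative.

```python
# Minimal sketch (not from this commit): overriding the default few-shot examples.
# Assumes OPENAI_API_KEY is set in the environment.
from haystack.components.evaluators import ContextRelevanceEvaluator

custom_examples = [
    {
        "inputs": {
            "questions": "What is the capital of Italy?",
            "contexts": ["Rome is the capital of Italy."],
        },
        "outputs": {
            "statements": ["Rome is the capital of Italy."],
            "statement_scores": [1],
        },
    },
]

evaluator = ContextRelevanceEvaluator(examples=custom_examples)
result = evaluator.run(
    questions=["What is the capital of Italy?"],
    contexts=[["Rome is the capital of Italy."]],
)

# `score` is the mean over questions of each question's mean statement score.
print(result["score"], result["individual_scores"])
```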

@@ -0,0 +1,5 @@
---
features:
- |
Add a new ContextRelevanceEvaluator component that can be used to evaluate whether documents retrieved by a RAG pipeline are relevant for answering a question.
Given a question and a list of retrieved document contents (contexts), an LLM scores how relevant the provided context is to the question. The score ranges from 0 to 1.
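As a rough illustration of this note (not part of the commit), the sketch below shows how `contexts` could be assembled from retrieved `Document` objects; the documents are hard-coded stand-ins for a retriever's output, and `OPENAI_API_KEY` is assumed to be set.

```python
# Illustrative sketch only: documents are hard-coded stand-ins for retriever output.
from haystack import Document
from haystack.components.evaluators import ContextRelevanceEvaluator

questions = ["Who created the Python language?"]
retrieved_docs_per_question = [
    [Document(content="Python, created by Guido van Rossum, is a high-level programming language.")]
]

# The evaluator expects plain strings, so pass each retrieved document's content.
contexts = [[doc.content for doc in docs] for docs in retrieved_docs_per_question]

evaluator = ContextRelevanceEvaluator()  # requires OPENAI_API_KEY in the environment
result = evaluator.run(questions=questions, contexts=contexts)
print(result["score"])              # mean relevance over all questions, between 0 and 1
print(result["individual_scores"])  # one score per question
```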

@@ -0,0 +1,142 @@
import os
from typing import List
import pytest
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils.auth import Secret
class TestContextRelevanceEvaluator:
def test_init_default(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.instructions == (
"Your task is to judge how relevant the provided context is for answering a question. "
"First, please extract statements from the provided context. "
"Second, calculate a relevance score for each statement in the context. "
"The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
)
assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]])]
assert component.outputs == ["statements", "statement_scores"]
assert component.examples == [
{
"inputs": {
"questions": "What is the capital of Germany?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 0],
},
},
{
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
},
"outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
},
{
"inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
"outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
},
]
def test_init_fail_wo_openai_api_key(self, monkeypatch):
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
with pytest.raises(ValueError, match="None of the .* environment variables are set"):
ContextRelevanceEvaluator()
def test_init_with_parameters(self):
component = ContextRelevanceEvaluator(
api_key=Secret.from_token("test-api-key"),
api="openai",
examples=[
{"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
],
)
assert component.generator.client.api_key == "test-api-key"
assert component.api == "openai"
assert component.examples == [
{"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
data = {
"type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
"init_parameters": {
"api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
"api": "openai",
"examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
},
}
component = ContextRelevanceEvaluator.from_dict(data)
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
def generator_run(self, *args, **kwargs):
if "Football" in kwargs["prompt"]:
return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
else:
return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}
monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
questions = ["Which is the most popular global sport?", "Who created the Python language?"]
contexts = [
[
"The popularity of sports can be measured in various ways, including TV viewership, social media "
"presence, number of participants, and economic impact. Football is undoubtedly the world's most "
"popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
"Messi, drawing a followership of more than 4 billion people."
],
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
"language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
"programmers write clear, logical code for both small and large-scale software projects."
],
]
results = component.run(questions=questions, contexts=contexts)
assert results == {
"individual_scores": [0.5, 1],
"results": [
{"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
{"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
],
"score": 0.75,
}
def test_run_missing_parameters(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
with pytest.raises(TypeError, match="missing 2 required positional arguments"):
component.run()
@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.integration
def test_live_run(self):
questions = ["Who created the Python language?"]
contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]
evaluator = ContextRelevanceEvaluator()
result = evaluator.run(questions=questions, contexts=contexts)
assert result["score"] == 1.0
assert result["individual_scores"] == [1.0]
assert result["results"][0]["score"] == 1.0
assert result["results"][0]["statement_scores"] == [1.0]
assert "Guido van Rossum" in result["results"][0]["statements"][0]