diff --git a/docs/pydoc/config/evaluators_api.yml b/docs/pydoc/config/evaluators_api.yml
index 9acd64efb..b24b3003e 100644
--- a/docs/pydoc/config/evaluators_api.yml
+++ b/docs/pydoc/config/evaluators_api.yml
@@ -4,6 +4,7 @@ loaders:
     modules:
       [
        "answer_exact_match",
+        "context_relevance",
        "document_map",
        "document_mrr",
        "document_recall",
diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
index f69c8257a..631691c54 100644
--- a/haystack/components/evaluators/__init__.py
+++ b/haystack/components/evaluators/__init__.py
@@ -1,4 +1,5 @@
 from .answer_exact_match import AnswerExactMatchEvaluator
+from .context_relevance import ContextRelevanceEvaluator
 from .document_map import DocumentMAPEvaluator
 from .document_mrr import DocumentMRREvaluator
 from .document_recall import DocumentRecallEvaluator
@@ -9,6 +10,7 @@ from .sas_evaluator import SASEvaluator
 __all__ = [
     "AnswerExactMatchEvaluator",
+    "ContextRelevanceEvaluator",
     "DocumentMAPEvaluator",
     "DocumentMRREvaluator",
     "DocumentRecallEvaluator",
diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
new file mode 100644
index 000000000..d78ccfc74
--- /dev/null
+++ b/haystack/components/evaluators/context_relevance.py
@@ -0,0 +1,154 @@
+from typing import Any, Dict, List, Optional
+
+from numpy import mean as np_mean
+
+from haystack import default_from_dict
+from haystack.components.evaluators.llm_evaluator import LLMEvaluator
+from haystack.core.component import component
+from haystack.utils import Secret, deserialize_secrets_inplace
+
+# Private global variable for default examples to include in the prompt if the user does not provide any examples
+_DEFAULT_EXAMPLES = [
+    {
+        "inputs": {
+            "questions": "What is the capital of Germany?",
+            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
+        },
+        "outputs": {
+            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
+            "statement_scores": [1, 0],
+        },
+    },
+    {
+        "inputs": {"questions": "What is the capital of France?", "contexts": ["Berlin is the capital of Germany."]},
+        "outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
+    },
+    {
+        "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
+        "outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
+    },
+]
+
+
+class ContextRelevanceEvaluator(LLMEvaluator):
+    """
+    Evaluator that checks if a provided context is relevant to the question.
+
+    An LLM breaks up the context into multiple statements and checks whether each statement is relevant for
+    answering the question. The final score for the full context is a number from 0.0 to 1.0. It represents the
+    proportion of statements that are relevant to the question.
+
+    Usage example:
+    ```python
+    from haystack.components.evaluators import ContextRelevanceEvaluator
+
+    questions = ["Who created the Python language?"]
+    contexts = [
+        [
+            "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
+        ],
+    ]
+
+    evaluator = ContextRelevanceEvaluator()
+    result = evaluator.run(questions=questions, contexts=contexts)
+    print(result["score"])
+    # 1.0
+    print(result["individual_scores"])
+    # [1.0]
+    print(result["results"])
+    # [{'statements': ['Python, created by Guido van Rossum in the late 1980s.'], 'statement_scores': [1], 'score': 1.0}]
+    ```
+    """
+
+    def __init__(
+        self,
+        examples: Optional[List[Dict[str, Any]]] = None,
+        api: str = "openai",
+        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
+    ):
+        """
+        Creates an instance of ContextRelevanceEvaluator.
+
+        :param examples:
+            Optional few-shot examples conforming to the expected input and output format of ContextRelevanceEvaluator.
+            Default examples will be used if none are provided.
+            Each example must be a dictionary with keys "inputs" and "outputs".
+            "inputs" must be a dictionary with keys "questions" and "contexts".
+            "outputs" must be a dictionary with "statements" and "statement_scores".
+            Expected format:
+            [{
+                "inputs": {
+                    "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
+                },
+                "outputs": {
+                    "statements": ["Rome is the capital of Italy."],
+                    "statement_scores": [1],
+                },
+            }]
+        :param api:
+            The API to use for calling an LLM through a Generator.
+            Supported APIs: "openai".
+        :param api_key:
+            The API key.
+
+        """
+        self.instructions = (
+            "Your task is to judge how relevant the provided context is for answering a question. "
+            "First, please extract statements from the provided context. "
+            "Second, calculate a relevance score for each statement in the context. "
+            "The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
+        )
+        self.inputs = [("questions", List[str]), ("contexts", List[List[str]])]
+        self.outputs = ["statements", "statement_scores"]
+        self.examples = examples or _DEFAULT_EXAMPLES
+        self.api = api
+        self.api_key = api_key
+
+        super().__init__(
+            instructions=self.instructions,
+            inputs=self.inputs,
+            outputs=self.outputs,
+            examples=self.examples,
+            api=self.api,
+            api_key=self.api_key,
+        )
+
+    @component.output_types(results=List[Dict[str, Any]])
+    def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
+        """
+        Run the LLM evaluator.
+
+        :param questions:
+            A list of questions.
+        :param contexts:
+            A list of lists of contexts. Each list of contexts corresponds to one question.
+        :returns:
+            A dictionary with the following outputs:
+                - `score`: Mean context relevance score over all the provided input questions.
+                - `individual_scores`: A list of context relevance scores for each input question.
+                - `results`: A list of dictionaries with `statements`, `statement_scores`, and `score` for each input question.
+        """
+        result = super().run(questions=questions, contexts=contexts)
+
+        # calculate average statement relevance score per query
+        for res in result["results"]:
+            res["score"] = np_mean(res["statement_scores"])
+
+        # calculate average context relevance score over all queries
+        result["score"] = np_mean([res["score"] for res in result["results"]])
+        result["individual_scores"] = [res["score"] for res in result["results"]]
+
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
+        """
+        Deserialize this component from a dictionary.
+
+        :param data:
+            The dictionary representation of this component.
+        :returns:
+            The deserialized component instance.
+        """
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+        return default_from_dict(cls, data)
diff --git a/releasenotes/notes/context-relevance-04063b9dc9fe7379.yaml b/releasenotes/notes/context-relevance-04063b9dc9fe7379.yaml
new file mode 100644
index 000000000..2ab79f87c
--- /dev/null
+++ b/releasenotes/notes/context-relevance-04063b9dc9fe7379.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    Add a new ContextRelevanceEvaluator component that can be used to evaluate whether retrieved documents are relevant for answering a question in a RAG pipeline.
+    Given a question and a list of retrieved document contents (contexts), an LLM scores the extent to which the provided context is relevant. The score ranges from 0 to 1.
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
new file mode 100644
index 000000000..8bd1a3cfd
--- /dev/null
+++ b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -0,0 +1,142 @@
+import os
+from typing import List
+
+import pytest
+
+from haystack.components.evaluators import ContextRelevanceEvaluator
+from haystack.utils.auth import Secret
+
+
+class TestContextRelevanceEvaluator:
+    def test_init_default(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+        assert component.api == "openai"
+        assert component.generator.client.api_key == "test-api-key"
+        assert component.instructions == (
+            "Your task is to judge how relevant the provided context is for answering a question. "
+            "First, please extract statements from the provided context. "
+            "Second, calculate a relevance score for each statement in the context. "
+            "The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
+        )
+        assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]])]
+        assert component.outputs == ["statements", "statement_scores"]
+        assert component.examples == [
+            {
+                "inputs": {
+                    "questions": "What is the capital of Germany?",
+                    "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
+                },
+                "outputs": {
+                    "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
+                    "statement_scores": [1, 0],
+                },
+            },
+            {
+                "inputs": {
+                    "questions": "What is the capital of France?",
+                    "contexts": ["Berlin is the capital of Germany."],
+                },
+                "outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
+            },
+            {
+                "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
+                "outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
+            },
+        ]
+
+    def test_init_fail_wo_openai_api_key(self, monkeypatch):
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
+            ContextRelevanceEvaluator()
+
+    def test_init_with_parameters(self):
+        component = ContextRelevanceEvaluator(
+            api_key=Secret.from_token("test-api-key"),
+            api="openai",
+            examples=[
+                {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+                {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            ],
+        )
+        assert component.generator.client.api_key == "test-api-key"
+        assert component.api == "openai"
+        assert component.examples == [
+            {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+        ]
+
+    def test_from_dict(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+
+        data = {
+            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+            },
+        }
+        component = ContextRelevanceEvaluator.from_dict(data)
+        assert component.api == "openai"
+        assert component.generator.client.api_key == "test-api-key"
+        assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]
+
+    def test_run_calculates_mean_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [
+                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
+                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
+                "programmers write clear, logical code for both small and large-scale software projects."
+            ],
+        ]
+        results = component.run(questions=questions, contexts=contexts)
+        assert results == {
+            "individual_scores": [0.5, 1],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
+            ],
+            "score": 0.75,
+        }
+
+    def test_run_missing_parameters(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+        with pytest.raises(TypeError, match="missing 2 required positional arguments"):
+            component.run()
+
+    @pytest.mark.skipif(
+        not os.environ.get("OPENAI_API_KEY", None),
+        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_live_run(self):
+        questions = ["Who created the Python language?"]
+        contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]
+
+        evaluator = ContextRelevanceEvaluator()
+        result = evaluator.run(questions=questions, contexts=contexts)
+        assert result["score"] == 1.0
+        assert result["individual_scores"] == [1.0]
+        assert result["results"][0]["score"] == 1.0
+        assert result["results"][0]["statement_scores"] == [1.0]
+        assert "Guido van Rossum" in result["results"][0]["statements"][0]
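Note for reviewers: `run()` takes plain strings rather than `Document` objects, so using the evaluator behind a retriever means unpacking `Document.content` first. A minimal sketch of that glue code, assuming `OPENAI_API_KEY` is exported; the question and retrieval output below are purely illustrative:

```python
from haystack import Document
from haystack.components.evaluators import ContextRelevanceEvaluator

# Hypothetical retrieval output: one list of Documents per question.
questions = ["Who created the Python language?"]
retrieved = [[Document(content="Python was created by Guido van Rossum in the late 1980s.")]]

# The evaluator expects one list of context strings per question, so unpack Document.content.
contexts = [[doc.content for doc in docs] for docs in retrieved]

evaluator = ContextRelevanceEvaluator()  # reads OPENAI_API_KEY from the environment by default
result = evaluator.run(questions=questions, contexts=contexts)
print(result["score"], result["individual_scores"])
```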
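The `examples` parameter documented in `__init__` can also be used to steer the judging prompt toward a specific domain. A sketch of such an override, following the "inputs"/"outputs" schema from the docstring; the domain example below is invented for illustration:

```python
from haystack.components.evaluators import ContextRelevanceEvaluator

# Invented domain-specific few-shot example following the documented schema:
# "inputs" -> {"questions", "contexts"}, "outputs" -> {"statements", "statement_scores"}.
domain_examples = [
    {
        "inputs": {
            "questions": "What is a common first-line treatment for hypertension?",
            "contexts": ["ACE inhibitors are often used as a first-line treatment for hypertension."],
        },
        "outputs": {
            "statements": ["ACE inhibitors are often used as a first-line treatment for hypertension."],
            "statement_scores": [1],
        },
    },
]

evaluator = ContextRelevanceEvaluator(examples=domain_examples)  # still reads OPENAI_API_KEY by default
```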