diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index d78ccfc74..6402ef3ba 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -113,7 +113,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
             api_key=self.api_key,
         )
 
-    @component.output_types(results=List[Dict[str, Any]])
+    @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
     def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 827499038..7c794345d 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -13,7 +13,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of Germany and when was it founded?",
             "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+            "predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
         },
         "outputs": {
             "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -24,7 +24,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of France?",
             "contexts": ["Berlin is the capital of Germany."],
-            "responses": "Paris",
+            "predicted_answers": "Paris",
         },
         "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
     },
@@ -32,7 +32,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of Italy?",
             "contexts": ["Rome is the capital of Italy."],
-            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+            "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
         },
         "outputs": {
             "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -60,9 +60,9 @@ class FaithfulnessEvaluator(LLMEvaluator):
             "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
         ],
     ]
-    responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
+    predicted_answers = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
     evaluator = FaithfulnessEvaluator()
-    result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+    result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
     print(result["individual_scores"])
     # [0.5]
 
@@ -87,13 +87,13 @@ class FaithfulnessEvaluator(LLMEvaluator):
             Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
             Default examples will be used if none are provided.
             Each example must be a dictionary with keys "inputs" and "outputs".
-            "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
+            "inputs" must be a dictionary with keys "questions", "contexts", and "predicted_answers".
             "outputs" must be a dictionary with "statements" and "statement_scores".
             Expected format:
             [{
                 "inputs": {
                     "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+                    "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
                 },
                 "outputs": {
                     "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -110,11 +110,11 @@ class FaithfulnessEvaluator(LLMEvaluator):
         self.instructions = (
             "Your task is to judge the faithfulness or groundedness of statements based "
             "on context information. First, please extract statements from a provided "
-            "response to a question. Second, calculate a faithfulness score for each "
-            "statement made in the response. The score is 1 if the statement can be "
+            "predicted answer to a question. Second, calculate a faithfulness score for each "
+            "statement made in the predicted answer. The score is 1 if the statement can be "
             "inferred from the provided context or 0 if it cannot be inferred."
         )
-        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("predicted_answers", List[str])]
         self.outputs = ["statements", "statement_scores"]
         self.examples = examples or _DEFAULT_EXAMPLES
         self.api = api
@@ -129,8 +129,8 @@ class FaithfulnessEvaluator(LLMEvaluator):
             api_key=self.api_key,
         )
 
-    @component.output_types(results=List[Dict[str, Any]])
-    def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
+    @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
+    def run(self, questions: List[str], contexts: List[List[str]], predicted_answers: List[str]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.
 
@@ -138,15 +138,15 @@ class FaithfulnessEvaluator(LLMEvaluator):
             A list of questions.
         :param contexts:
             A nested list of contexts that correspond to the questions.
-        :param responses:
-            A list of responses.
+        :param predicted_answers:
+            A list of predicted answers.
         :returns:
             A dictionary with the following outputs:
                 - `score`: Mean faithfulness score over all the provided input answers.
                 - `individual_scores`: A list of faithfulness scores for each input answer.
                 - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
         """
-        result = super().run(questions=questions, contexts=contexts, responses=responses)
+        result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
         # calculate average statement faithfulness score per query
         for res in result["results"]:
diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py
index 4191dfc92..1e06e5ff4 100644
--- a/haystack/components/evaluators/llm_evaluator.py
+++ b/haystack/components/evaluators/llm_evaluator.py
@@ -23,18 +23,18 @@ class LLMEvaluator:
     from haystack.components.evaluators import LLMEvaluator
     evaluator = LLMEvaluator(
         instructions="Is this answer problematic for children?",
-        inputs=[("responses", List[str])],
+        inputs=[("predicted_answers", List[str])],
         outputs=["score"],
         examples=[
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
         ],
     )
-    RESPONSES = [
+    predicted_answers = [
         "Football is the most popular sport with around 4 billion followers worldwide",
         "Python language was created by Guido van Rossum.",
     ]
-    results = evaluator.run(responses=RESPONSES)
+    results = evaluator.run(predicted_answers=predicted_answers)
     print(results)
     # {'results': [{'score': 0}, {'score': 0}]}
     ```
@@ -199,7 +199,7 @@ class LLMEvaluator:
             The prompt template.
         """
         inputs_section = (
-            "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
+            "{" + ", ".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
         )
 
         examples_section = "\n".join(
diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py
index d7ab26a4d..6af8dd335 100644
--- a/haystack/components/evaluators/sas_evaluator.py
+++ b/haystack/components/evaluators/sas_evaluator.py
@@ -183,7 +183,7 @@ class SASEvaluator:
 
             # Compute cosine-similarities
             similarity_scores = [
-                util.cos_sim(p, l).cpu().numpy() for p, l in zip(predictions_embeddings, label_embeddings)
+                float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
             ]
             sas_score = np_mean(similarity_scores)
 
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index 7219c85d5..cacc53cd7 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -15,19 +15,23 @@ class TestFaithfulnessEvaluator:
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == (
             "Your task is to judge the faithfulness or groundedness of statements based "
-            "on context information. First, please extract statements from a provided "
-            "response to a question. Second, calculate a faithfulness score for each "
-            "statement made in the response. The score is 1 if the statement can be "
+            "on context information. First, please extract statements from a provided predicted "
+            "answer to a question. Second, calculate a faithfulness score for each "
+            "statement made in the predicted answer. The score is 1 if the statement can be "
             "inferred from the provided context or 0 if it cannot be inferred."
         )
-        assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+        assert component.inputs == [
+            ("questions", List[str]),
+            ("contexts", List[List[str]]),
+            ("predicted_answers", List[str]),
+        ]
         assert component.outputs == ["statements", "statement_scores"]
         assert component.examples == [
             {
                 "inputs": {
                     "questions": "What is the capital of Germany and when was it founded?",
                     "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+                    "predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
                 },
                 "outputs": {
                     "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -38,7 +42,7 @@ class TestFaithfulnessEvaluator:
                 "inputs": {
                     "questions": "What is the capital of France?",
                     "contexts": ["Berlin is the capital of Germany."],
-                    "responses": "Paris",
+                    "predicted_answers": "Paris",
                 },
                 "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
             },
@@ -46,7 +50,7 @@ class TestFaithfulnessEvaluator:
                 "inputs": {
                     "questions": "What is the capital of Italy?",
                     "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+                    "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
                 },
                 "outputs": {
                     "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -65,15 +69,21 @@ class TestFaithfulnessEvaluator:
             api_key=Secret.from_token("test-api-key"),
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         assert component.generator.client.api_key == "test-api-key"
         assert component.api == "openai"
         assert component.examples == [
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
 
     def test_from_dict(self, monkeypatch):
@@ -84,14 +94,16 @@ class TestFaithfulnessEvaluator:
             "init_parameters": {
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
         component = FaithfulnessEvaluator.from_dict(data)
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
     def test_run_calculates_mean_score(self, monkeypatch):
@@ -120,11 +132,11 @@ class TestFaithfulnessEvaluator:
                 "programmers write clear, logical code for both small and large-scale software projects."
             ],
         ]
-        responses = [
+        predicted_answers = [
            "Football is the most popular sport with around 4 billion followers worldwide.",
            "Python is a high-level general-purpose programming language that was created by George Lucas.",
        ]
-        results = component.run(questions=questions, contexts=contexts, responses=responses)
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
         assert results == {
             "individual_scores": [0.5, 1],
             "results": [
@@ -148,9 +160,9 @@ class TestFaithfulnessEvaluator:
     def test_live_run(self):
         questions = ["What is Python and who created it?"]
         contexts = [["Python is a programming language created by Guido van Rossum."]]
-        responses = ["Python is a programming language created by George Lucas."]
+        predicted_answers = ["Python is a programming language created by George Lucas."]
         evaluator = FaithfulnessEvaluator()
-        result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+        result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
         required_fields = {"individual_scores", "results", "score"}
         assert all(field in result for field in required_fields)
 
diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py
index 5960e32da..9755b9df0 100644
--- a/test/components/evaluators/test_llm_evaluator.py
+++ b/test/components/evaluators/test_llm_evaluator.py
@@ -11,17 +11,19 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["score"]
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
     def test_init_fail_wo_openai_api_key(self, monkeypatch):
@@ -30,31 +32,39 @@ class TestLLMEvaluator:
             LLMEvaluator(
                 api="openai",
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
 
     def test_init_with_parameters(self):
         component = LLMEvaluator(
             instructions="test-instruction",
             api_key=Secret.from_token("test-api-key"),
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["custom_score"],
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         assert component.generator.client.api_key == "test-api-key"
         assert component.api == "openai"
         assert component.examples == [
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["custom_score"]
 
     def test_init_with_invalid_parameters(self, monkeypatch):
@@ -63,85 +73,105 @@ class TestLLMEvaluator:
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs={("responses", List[str])},
+                inputs={("predicted_answers", List[str])},
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[(List[str], "responses")],
+                inputs=[(List[str], "predicted_answers")],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
                 inputs=[List[str]],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs={("responses", str)},
+                inputs={("predicted_answers", str)},
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
 
         # Invalid outputs
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs="score",
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=[["score"]],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
 
         # Invalid examples
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples={
-                    "inputs": {"responses": "Damn, this is straight outta hell!!!"},
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                     "outputs": {"custom_score": 1},
                 },
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
-                    [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}]
+                    [
+                        {
+                            "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                            "outputs": {"custom_score": 1},
+                        }
+                    ]
                 ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
-                outputs=["score"],
-                examples=[
-                    {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}
-                ],
-            )
-        with pytest.raises(ValueError):
-            LLMEvaluator(
-                instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
                     {
-                        "inputs": [{"responses": "Damn, this is straight outta hell!!!"}],
+                        "wrong_key": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                        "outputs": {"custom_score": 1},
+                    }
+                ],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("predicted_answers", List[str])],
+                outputs=["score"],
+                examples=[
+                    {
+                        "inputs": [{"predicted_answers": "Damn, this is straight outta hell!!!"}],
                         "outputs": [{"custom_score": 1}],
                     }
                 ],
@@ -149,7 +179,7 @@ class TestLLMEvaluator:
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
             )
@@ -158,9 +188,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         data = component.to_dict()
         assert data == {
@@ -169,9 +201,11 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["score"],
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
 
@@ -184,19 +218,21 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["score"],
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
         component = LLMEvaluator.from_dict(data)
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["score"]
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
     def test_to_dict_with_parameters(self, monkeypatch):
@@ -204,12 +240,18 @@ class TestLLMEvaluator:
         component = LLMEvaluator(
             instructions="test-instruction",
             api_key=Secret.from_env_var("ENV_VAR"),
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["custom_score"],
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         data = component.to_dict()
@@ -219,11 +261,17 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["custom_score"],
                 "examples": [
-                    {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                    {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                    {
+                        "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                        "outputs": {"custom_score": 1},
+                    },
+                    {
+                        "inputs": {"predicted_answers": "Football is the most popular sport."},
+                        "outputs": {"custom_score": 0},
+                    },
                 ],
             },
         }
@@ -232,9 +280,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
 
         def generator_run(self, *args, **kwargs):
@@ -243,20 +293,23 @@ class TestLLMEvaluator:
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
 
         with pytest.raises(ValueError):
-            component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]])
+            component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"], ["Paris"]])
 
         with pytest.raises(ValueError):
             component.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]]
+                questions=["What is the capital of Germany?", "What is the capital of France?"],
+                predicted_answers=[["Berlin"]],
             )
 
     def test_run_returns_parsed_result(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
 
         def generator_run(self, *args, **kwargs):
@@ -264,42 +317,46 @@ class TestLLMEvaluator:
 
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
 
-        results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"])
+        results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
         assert results == {"results": [{"score": 0.5}]}
 
     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+                {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
             ],
         )
         template = component.prepare_template()
         assert (
             template
-            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n'
+            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"predicted_answers": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"predicted_answers": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"predicted_answers": {{ predicted_answers }}}\nOutputs:\n'
         )
 
     def test_invalid_input_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         # None of the expected parameters are received
         with pytest.raises(ValueError):
-            component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]})
+            component.validate_input_parameters(
+                expected={"predicted_answers": List[str]}, received={"questions": List[str]}
+            )
 
         # Only one but not all the expected parameters are received
         with pytest.raises(ValueError):
             component.validate_input_parameters(
-                expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]}
+                expected={"predicted_answers": List[str], "questions": List[str]}, received={"questions": List[str]}
             )
 
         # Received inputs are not lists
@@ -310,9 +367,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         with pytest.raises(ValueError):
             component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')
@@ -325,7 +384,9 @@ class TestLLMEvaluator:
             LLMEvaluator(
                 api="unsupported_api",
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
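Appended for reference, not part of the patch: a minimal usage sketch of the renamed interface, mirroring the updated `FaithfulnessEvaluator` docstring and `test_live_run` above. It assumes a haystack-ai build that already includes these changes and an `OPENAI_API_KEY` in the environment; the question, context, and answer strings are taken from the diff for illustration.

```python
# Usage sketch for the renamed evaluator input (responses -> predicted_answers).
# Assumes haystack-ai with this patch applied and OPENAI_API_KEY exported.
from haystack.components.evaluators import FaithfulnessEvaluator

questions = ["What is Python and who created it?"]
contexts = [["Python is a programming language created by Guido van Rossum."]]
predicted_answers = ["Python is a programming language created by George Lucas."]

evaluator = FaithfulnessEvaluator()
# The keyword is now `predicted_answers`; `responses=` no longer matches an input socket.
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

# run() now declares `individual_scores`, `score`, and `results` as output types.
print(result["individual_scores"])  # e.g. [0]
print(result["score"])              # mean faithfulness score over all answers
print(result["results"])            # per-answer statements and statement_scores
```

The same rename applies to `LLMEvaluator` inputs and to the `examples` few-shot keys, as exercised in the updated tests.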