refactor: Rename FaithfulnessEvaluator input responses to predicted_answers (#7621)

Author: Julian Risch, 2024-04-30 16:30:57 +02:00 (committed by GitHub)
Parent: 5de5619abd
Commit: 2509eeea7e
6 changed files with 185 additions and 112 deletions
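For callers, the practical effect is that the `responses` keyword of `FaithfulnessEvaluator.run()` (and of the examples passed to `LLMEvaluator`) is now called `predicted_answers`, and `individual_scores` and `score` are declared as output types. A minimal before/after sketch of a caller, based on the usage example in the diff below; the import path and example strings are assumptions for illustration, and running it requires an OpenAI API key:

```python
from haystack.components.evaluators import FaithfulnessEvaluator  # import path assumed

questions = ["Who created the Python language?"]
contexts = [["Python is a programming language created by Guido van Rossum."]]
answers = ["Python was created by Guido van Rossum."]

evaluator = FaithfulnessEvaluator()  # uses the OpenAI API; requires OPENAI_API_KEY

# Before this commit:
# result = evaluator.run(questions=questions, contexts=contexts, responses=answers)

# After this commit, the input is named predicted_answers:
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=answers)
print(result["score"], result["individual_scores"], result["results"])
```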

View File

@@ -113,7 +113,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
 api_key=self.api_key,
 )
-@component.output_types(results=List[Dict[str, Any]])
+@component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
 def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
 """
 Run the LLM evaluator.

View File

@@ -13,7 +13,7 @@ _DEFAULT_EXAMPLES = [
 "inputs": {
 "questions": "What is the capital of Germany and when was it founded?",
 "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+"predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
 },
 "outputs": {
 "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -24,7 +24,7 @@ _DEFAULT_EXAMPLES = [
 "inputs": {
 "questions": "What is the capital of France?",
 "contexts": ["Berlin is the capital of Germany."],
-"responses": "Paris",
+"predicted_answers": "Paris",
 },
 "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
 },
@@ -32,7 +32,7 @@ _DEFAULT_EXAMPLES = [
 "inputs": {
 "questions": "What is the capital of Italy?",
 "contexts": ["Rome is the capital of Italy."],
-"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+"predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
 },
 "outputs": {
 "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -60,9 +60,9 @@ class FaithfulnessEvaluator(LLMEvaluator):
 "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
 ],
 ]
-responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
+predicted_answers = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
 evaluator = FaithfulnessEvaluator()
-result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 print(result["individual_scores"])
 # [0.5]
@@ -87,13 +87,13 @@ class FaithfulnessEvaluator(LLMEvaluator):
 Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
 Default examples will be used if none are provided.
 Each example must be a dictionary with keys "inputs" and "outputs".
-"inputs" must be a dictionary with keys "questions", "contexts", and "responses".
+"inputs" must be a dictionary with keys "questions", "contexts", and "predicted_answers".
 "outputs" must be a dictionary with "statements" and "statement_scores".
 Expected format:
 [{
 "inputs": {
 "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
-"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+"predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
 },
 "outputs": {
 "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -110,11 +110,11 @@ class FaithfulnessEvaluator(LLMEvaluator):
 self.instructions = (
 "Your task is to judge the faithfulness or groundedness of statements based "
 "on context information. First, please extract statements from a provided "
-"response to a question. Second, calculate a faithfulness score for each "
-"statement made in the response. The score is 1 if the statement can be "
+"predicted answer to a question. Second, calculate a faithfulness score for each "
+"statement made in the predicted answer. The score is 1 if the statement can be "
 "inferred from the provided context or 0 if it cannot be inferred."
 )
-self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("predicted_answers", List[str])]
 self.outputs = ["statements", "statement_scores"]
 self.examples = examples or _DEFAULT_EXAMPLES
 self.api = api
@@ -129,8 +129,8 @@ class FaithfulnessEvaluator(LLMEvaluator):
 api_key=self.api_key,
 )
-@component.output_types(results=List[Dict[str, Any]])
-def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
+@component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
+def run(self, questions: List[str], contexts: List[List[str]], predicted_answers: List[str]) -> Dict[str, Any]:
 """
 Run the LLM evaluator.
@@ -138,15 +138,15 @@ class FaithfulnessEvaluator(LLMEvaluator):
 A list of questions.
 :param contexts:
 A nested list of contexts that correspond to the questions.
-:param responses:
-A list of responses.
+:param predicted_answers:
+A list of predicted answers.
 :returns:
 A dictionary with the following outputs:
 - `score`: Mean faithfulness score over all the provided input answers.
 - `individual_scores`: A list of faithfulness scores for each input answer.
 - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
 """
-result = super().run(questions=questions, contexts=contexts, responses=responses)
+result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 # calculate average statement faithfulness score per query
 for res in result["results"]:

View File

@@ -23,18 +23,18 @@ class LLMEvaluator:
 from haystack.components.evaluators import LLMEvaluator
 evaluator = LLMEvaluator(
 instructions="Is this answer problematic for children?",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
 examples=[
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+{"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
 ],
 )
-RESPONSES = [
+predicted_answers = [
 "Football is the most popular sport with around 4 billion followers worldwide",
 "Python language was created by Guido van Rossum.",
 ]
-results = evaluator.run(responses=RESPONSES)
+results = evaluator.run(predicted_answers=predicted_answers)
 print(results)
 # {'results': [{'score': 0}, {'score': 0}]}
 ```
@@ -199,7 +199,7 @@ class LLMEvaluator:
 The prompt template.
 """
 inputs_section = (
-"{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
+"{" + ", ".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
 )
 examples_section = "\n".join(
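The only change in this `prepare_template()` hunk is the join separator: with more than one input, the rendered inputs section of the prompt now separates keys with a comma and a space. A standalone sketch of what that expression produces (not the component code itself; the two-input list is illustrative):

```python
# Illustrative only: mirrors the inputs_section expression from the diff above.
inputs = [("questions", list), ("predicted_answers", list)]  # stand-in for self.inputs

old_section = "{" + ",".join([f'"{name}": {{{{ {name} }}}}' for name, _ in inputs]) + "}"
new_section = "{" + ", ".join([f'"{name}": {{{{ {name} }}}}' for name, _ in inputs]) + "}"

print(old_section)  # {"questions": {{ questions }},"predicted_answers": {{ predicted_answers }}}
print(new_section)  # {"questions": {{ questions }}, "predicted_answers": {{ predicted_answers }}}
```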

View File

@@ -183,7 +183,7 @@ class SASEvaluator:
 # Compute cosine-similarities
 similarity_scores = [
-util.cos_sim(p, l).cpu().numpy() for p, l in zip(predictions_embeddings, label_embeddings)
+float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
 ]
 sas_score = np_mean(similarity_scores)
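The added `float(...)` wrapper collapses each 1x1 similarity tensor returned by `util.cos_sim` into a plain Python float before averaging, so the reported scores are ordinary floats rather than arrays. A rough standalone sketch of that computation with sentence-transformers; the model name is an assumption and the surrounding SASEvaluator code is not shown in this hunk:

```python
from numpy import mean as np_mean
from sentence_transformers import SentenceTransformer, util

# Model choice is illustrative only.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

predictions = ["Berlin is the capital of Germany."]
labels = ["Germany's capital city is Berlin."]

predictions_embeddings = model.encode(predictions)
label_embeddings = model.encode(labels)

# util.cos_sim returns a 1x1 tensor per pair; float(...) turns it into a Python float.
similarity_scores = [
    float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
]
sas_score = np_mean(similarity_scores)
print(similarity_scores, float(sas_score))
```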

View File

@@ -15,19 +15,23 @@ class TestFaithfulnessEvaluator:
 assert component.generator.client.api_key == "test-api-key"
 assert component.instructions == (
 "Your task is to judge the faithfulness or groundedness of statements based "
-"on context information. First, please extract statements from a provided "
-"response to a question. Second, calculate a faithfulness score for each "
-"statement made in the response. The score is 1 if the statement can be "
+"on context information. First, please extract statements from a provided predicted "
+"answer to a question. Second, calculate a faithfulness score for each "
+"statement made in the predicted answer. The score is 1 if the statement can be "
 "inferred from the provided context or 0 if it cannot be inferred."
 )
-assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+assert component.inputs == [
+("questions", List[str]),
+("contexts", List[List[str]]),
+("predicted_answers", List[str]),
+]
 assert component.outputs == ["statements", "statement_scores"]
 assert component.examples == [
 {
 "inputs": {
 "questions": "What is the capital of Germany and when was it founded?",
 "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+"predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
 },
 "outputs": {
 "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -38,7 +42,7 @@ class TestFaithfulnessEvaluator:
 "inputs": {
 "questions": "What is the capital of France?",
 "contexts": ["Berlin is the capital of Germany."],
-"responses": "Paris",
+"predicted_answers": "Paris",
 },
 "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
 },
@@ -46,7 +50,7 @@ class TestFaithfulnessEvaluator:
 "inputs": {
 "questions": "What is the capital of Italy?",
 "contexts": ["Rome is the capital of Italy."],
-"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+"predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
 },
 "outputs": {
 "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -65,15 +69,21 @@ class TestFaithfulnessEvaluator:
 api_key=Secret.from_token("test-api-key"),
 api="openai",
 examples=[
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+{
+"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+"outputs": {"custom_score": 1},
+},
+{
+"inputs": {"predicted_answers": "Football is the most popular sport."},
+"outputs": {"custom_score": 0},
+},
 ],
 )
 assert component.generator.client.api_key == "test-api-key"
 assert component.api == "openai"
 assert component.examples == [
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+{"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
 ]
 def test_from_dict(self, monkeypatch):
@@ -84,14 +94,16 @@ class TestFaithfulnessEvaluator:
 "init_parameters": {
 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
 "api": "openai",
-"examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+"examples": [
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 },
 }
 component = FaithfulnessEvaluator.from_dict(data)
 assert component.api == "openai"
 assert component.generator.client.api_key == "test-api-key"
 assert component.examples == [
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
 ]
 def test_run_calculates_mean_score(self, monkeypatch):
@@ -120,11 +132,11 @@ class TestFaithfulnessEvaluator:
 "programmers write clear, logical code for both small and large-scale software projects."
 ],
 ]
-responses = [
+predicted_answers = [
 "Football is the most popular sport with around 4 billion followers worldwide.",
 "Python is a high-level general-purpose programming language that was created by George Lucas.",
 ]
-results = component.run(questions=questions, contexts=contexts, responses=responses)
+results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 assert results == {
 "individual_scores": [0.5, 1],
 "results": [
@@ -148,9 +160,9 @@ class TestFaithfulnessEvaluator:
 def test_live_run(self):
 questions = ["What is Python and who created it?"]
 contexts = [["Python is a programming language created by Guido van Rossum."]]
-responses = ["Python is a programming language created by George Lucas."]
+predicted_answers = ["Python is a programming language created by George Lucas."]
 evaluator = FaithfulnessEvaluator()
-result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 required_fields = {"individual_scores", "results", "score"}
 assert all(field in result for field in required_fields)

View File

@@ -11,17 +11,19 @@ class TestLLMEvaluator:
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 assert component.api == "openai"
 assert component.generator.client.api_key == "test-api-key"
 assert component.instructions == "test-instruction"
-assert component.inputs == [("responses", List[str])]
+assert component.inputs == [("predicted_answers", List[str])]
 assert component.outputs == ["score"]
 assert component.examples == [
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
 ]
 def test_init_fail_wo_openai_api_key(self, monkeypatch):
@@ -30,31 +32,39 @@ class TestLLMEvaluator:
 LLMEvaluator(
 api="openai",
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 def test_init_with_parameters(self):
 component = LLMEvaluator(
 instructions="test-instruction",
 api_key=Secret.from_token("test-api-key"),
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["custom_score"],
 api="openai",
 examples=[
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+{
+"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+"outputs": {"custom_score": 1},
+},
+{
+"inputs": {"predicted_answers": "Football is the most popular sport."},
+"outputs": {"custom_score": 0},
+},
 ],
 )
 assert component.generator.client.api_key == "test-api-key"
 assert component.api == "openai"
 assert component.examples == [
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+{"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
 ]
 assert component.instructions == "test-instruction"
-assert component.inputs == [("responses", List[str])]
+assert component.inputs == [("predicted_answers", List[str])]
 assert component.outputs == ["custom_score"]
 def test_init_with_invalid_parameters(self, monkeypatch):
@@ -63,85 +73,105 @@ class TestLLMEvaluator:
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs={("responses", List[str])},
+inputs={("predicted_answers", List[str])},
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[(List[str], "responses")],
+inputs=[(List[str], "predicted_answers")],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
 inputs=[List[str]],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs={("responses", str)},
+inputs={("predicted_answers", str)},
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 # Invalid outputs
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs="score",
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=[["score"]],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 # Invalid examples
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
 examples={
-"inputs": {"responses": "Damn, this is straight outta hell!!!"},
+"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
 "outputs": {"custom_score": 1},
 },
 )
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
 examples=[
-[{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}]
+[
+{
+"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+"outputs": {"custom_score": 1},
+}
+]
 ],
 )
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
-outputs=["score"],
-examples=[
-{"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}
-],
-)
-with pytest.raises(ValueError):
-LLMEvaluator(
-instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
 examples=[
 {
-"inputs": [{"responses": "Damn, this is straight outta hell!!!"}],
+"wrong_key": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+"outputs": {"custom_score": 1},
+}
+],
+)
+with pytest.raises(ValueError):
+LLMEvaluator(
+instructions="test-instruction",
+inputs=[("predicted_answers", List[str])],
+outputs=["score"],
+examples=[
+{
+"inputs": [{"predicted_answers": "Damn, this is straight outta hell!!!"}],
 "outputs": [{"custom_score": 1}],
 }
 ],
@@ -149,7 +179,7 @@ class TestLLMEvaluator:
 with pytest.raises(ValueError):
 LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
 examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
 )
@@ -158,9 +188,11 @@ class TestLLMEvaluator:
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 data = component.to_dict()
 assert data == {
@@ -169,9 +201,11 @@ class TestLLMEvaluator:
 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
 "api": "openai",
 "instructions": "test-instruction",
-"inputs": [("responses", List[str])],
+"inputs": [("predicted_answers", List[str])],
 "outputs": ["score"],
-"examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+"examples": [
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 },
 }
@@ -184,19 +218,21 @@ class TestLLMEvaluator:
 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
 "api": "openai",
 "instructions": "test-instruction",
-"inputs": [("responses", List[str])],
+"inputs": [("predicted_answers", List[str])],
 "outputs": ["score"],
-"examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+"examples": [
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 },
 }
 component = LLMEvaluator.from_dict(data)
 assert component.api == "openai"
 assert component.generator.client.api_key == "test-api-key"
 assert component.instructions == "test-instruction"
-assert component.inputs == [("responses", List[str])]
+assert component.inputs == [("predicted_answers", List[str])]
 assert component.outputs == ["score"]
 assert component.examples == [
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
 ]
 def test_to_dict_with_parameters(self, monkeypatch):
@@ -204,12 +240,18 @@ class TestLLMEvaluator:
 component = LLMEvaluator(
 instructions="test-instruction",
 api_key=Secret.from_env_var("ENV_VAR"),
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["custom_score"],
 api="openai",
 examples=[
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+{
+"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+"outputs": {"custom_score": 1},
+},
+{
+"inputs": {"predicted_answers": "Football is the most popular sport."},
+"outputs": {"custom_score": 0},
+},
 ],
 )
 data = component.to_dict()
@@ -219,11 +261,17 @@ class TestLLMEvaluator:
 "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
 "api": "openai",
 "instructions": "test-instruction",
-"inputs": [("responses", List[str])],
+"inputs": [("predicted_answers", List[str])],
 "outputs": ["custom_score"],
 "examples": [
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+{
+"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+"outputs": {"custom_score": 1},
+},
+{
+"inputs": {"predicted_answers": "Football is the most popular sport."},
+"outputs": {"custom_score": 0},
+},
 ],
 },
 }
@@ -232,9 +280,11 @@ class TestLLMEvaluator:
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("questions", List[str]), ("responses", List[List[str]])],
+inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 def generator_run(self, *args, **kwargs):
@@ -243,20 +293,23 @@ class TestLLMEvaluator:
 monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
 with pytest.raises(ValueError):
-component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]])
+component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"], ["Paris"]])
 with pytest.raises(ValueError):
 component.run(
-questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]]
+questions=["What is the capital of Germany?", "What is the capital of France?"],
+predicted_answers=[["Berlin"]],
 )
 def test_run_returns_parsed_result(self, monkeypatch):
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("questions", List[str]), ("responses", List[List[str]])],
+inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 def generator_run(self, *args, **kwargs):
@@ -264,42 +317,46 @@ class TestLLMEvaluator:
 monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
-results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"])
+results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
 assert results == {"results": [{"score": 0.5}]}
 def test_prepare_template(self, monkeypatch):
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
 examples=[
-{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+{"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
 ],
 )
 template = component.prepare_template()
 assert (
 template
-== 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n'
+== 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"predicted_answers": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"predicted_answers": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"predicted_answers": {{ predicted_answers }}}\nOutputs:\n'
 )
 def test_invalid_input_parameters(self, monkeypatch):
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 # None of the expected parameters are received
 with pytest.raises(ValueError):
-component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]})
+component.validate_input_parameters(
+expected={"predicted_answers": List[str]}, received={"questions": List[str]}
+)
 # Only one but not all the expected parameters are received
 with pytest.raises(ValueError):
 component.validate_input_parameters(
-expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]}
+expected={"predicted_answers": List[str], "questions": List[str]}, received={"questions": List[str]}
 )
 # Received inputs are not lists
@@ -310,9 +367,11 @@ class TestLLMEvaluator:
 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 component = LLMEvaluator(
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )
 with pytest.raises(ValueError):
 component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')
@@ -325,7 +384,9 @@ class TestLLMEvaluator:
 LLMEvaluator(
 api="unsupported_api",
 instructions="test-instruction",
-inputs=[("responses", List[str])],
+inputs=[("predicted_answers", List[str])],
 outputs=["score"],
-examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+examples=[
+{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+],
 )