Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-11-14 17:13:03 +00:00)
refactor: Rename FaithfulnessEvaluator input responses to predicted_answers (#7621)
This commit is contained in:
parent 5de5619abd
commit 2509eeea7e
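This change renames the `responses` input of `FaithfulnessEvaluator` (and the corresponding names in the `LLMEvaluator` docstrings and tests) to `predicted_answers`, so the `run()` keyword, the few-shot example keys, and the LLM instructions all use the new name. A minimal usage sketch of the renamed API, assembled from the updated docstring and `test_live_run` in this diff (the values are the ones used there, not new data):

```python
from haystack.components.evaluators import FaithfulnessEvaluator

questions = ["What is Python and who created it?"]
contexts = [["Python is a programming language created by Guido van Rossum."]]
# Formerly passed as `responses=...`; after this change the keyword is `predicted_answers=...`.
predicted_answers = ["Python is a programming language created by George Lucas."]

evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

print(result["score"])              # mean faithfulness score over all predicted answers
print(result["individual_scores"])  # one score per predicted answer
print(result["results"])            # statements and statement_scores per answer
```

As in the live test, this performs a real LLM call and expects an OpenAI API key to be configured.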
@@ -113,7 +113,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
             api_key=self.api_key,
         )

-    @component.output_types(results=List[Dict[str, Any]])
+    @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
     def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.
@@ -13,7 +13,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of Germany and when was it founded?",
             "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+            "predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
         },
         "outputs": {
             "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -24,7 +24,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of France?",
             "contexts": ["Berlin is the capital of Germany."],
-            "responses": "Paris",
+            "predicted_answers": "Paris",
         },
         "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
     },
@@ -32,7 +32,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of Italy?",
             "contexts": ["Rome is the capital of Italy."],
-            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+            "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
         },
         "outputs": {
             "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -60,9 +60,9 @@ class FaithfulnessEvaluator(LLMEvaluator):
            "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
        ],
    ]
-    responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
+    predicted_answers = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
    evaluator = FaithfulnessEvaluator()
-    result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+    result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

    print(result["individual_scores"])
    # [0.5]
@@ -87,13 +87,13 @@ class FaithfulnessEvaluator(LLMEvaluator):
             Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
             Default examples will be used if none are provided.
             Each example must be a dictionary with keys "inputs" and "outputs".
-            "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
+            "inputs" must be a dictionary with keys "questions", "contexts", and "predicted_answers".
             "outputs" must be a dictionary with "statements" and "statement_scores".
             Expected format:
             [{
                 "inputs": {
                     "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+                    "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
                 },
                 "outputs": {
                     "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -110,11 +110,11 @@ class FaithfulnessEvaluator(LLMEvaluator):
         self.instructions = (
             "Your task is to judge the faithfulness or groundedness of statements based "
             "on context information. First, please extract statements from a provided "
-            "response to a question. Second, calculate a faithfulness score for each "
-            "statement made in the response. The score is 1 if the statement can be "
+            "predicted answer to a question. Second, calculate a faithfulness score for each "
+            "statement made in the predicted answer. The score is 1 if the statement can be "
             "inferred from the provided context or 0 if it cannot be inferred."
         )
-        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("predicted_answers", List[str])]
         self.outputs = ["statements", "statement_scores"]
         self.examples = examples or _DEFAULT_EXAMPLES
         self.api = api
@@ -129,8 +129,8 @@ class FaithfulnessEvaluator(LLMEvaluator):
             api_key=self.api_key,
         )

-    @component.output_types(results=List[Dict[str, Any]])
-    def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
+    @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
+    def run(self, questions: List[str], contexts: List[List[str]], predicted_answers: List[str]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.

@@ -138,15 +138,15 @@ class FaithfulnessEvaluator(LLMEvaluator):
             A list of questions.
         :param contexts:
             A nested list of contexts that correspond to the questions.
-        :param responses:
-            A list of responses.
+        :param predicted_answers:
+            A list of predicted answers.
         :returns:
             A dictionary with the following outputs:
                 - `score`: Mean faithfulness score over all the provided input answers.
                 - `individual_scores`: A list of faithfulness scores for each input answer.
                 - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
         """
-        result = super().run(questions=questions, contexts=contexts, responses=responses)
+        result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

         # calculate average statement faithfulness score per query
         for res in result["results"]:
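The docstring above defines the aggregation contract: each predicted answer gets the mean of its statement scores (`individual_scores`), and `score` is the mean over all answers. A small illustrative sketch of that arithmetic (the `results` values below are made up for illustration, mirroring the 0.5 / 1 case used in the tests further down):

```python
from statistics import mean

# Hypothetical per-answer results in the shape documented above.
results = [
    {"statements": ["statement 1", "statement 2"], "statement_scores": [1, 0]},
    {"statements": ["statement 3"], "statement_scores": [1]},
]

individual_scores = [mean(res["statement_scores"]) for res in results]
score = mean(individual_scores)

print(individual_scores)  # [0.5, 1]
print(score)              # 0.75
```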
@@ -23,18 +23,18 @@ class LLMEvaluator:
    from haystack.components.evaluators import LLMEvaluator
    evaluator = LLMEvaluator(
        instructions="Is this answer problematic for children?",
-        inputs=[("responses", List[str])],
+        inputs=[("predicted_answers", List[str])],
        outputs=["score"],
        examples=[
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
        ],
    )
-    RESPONSES = [
+    predicted_answers = [
        "Football is the most popular sport with around 4 billion followers worldwide",
        "Python language was created by Guido van Rossum.",
    ]
-    results = evaluator.run(responses=RESPONSES)
+    results = evaluator.run(predicted_answers=predicted_answers)
    print(results)
    # {'results': [{'score': 0}, {'score': 0}]}
    ```
@@ -199,7 +199,7 @@ class LLMEvaluator:
             The prompt template.
         """
         inputs_section = (
-            "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
+            "{" + ", ".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
         )

         examples_section = "\n".join(
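The one-character change above only affects the formatting of the generated prompt: input keys in the rendered `inputs_section` are now separated by `", "` instead of `","`. A quick sketch of the rendered section, assuming two inputs named `questions` and `predicted_answers` (the surrounding `prepare_template` machinery is omitted):

```python
# Stand-in for self.inputs; only the names matter for the rendered section.
inputs = [("questions", list), ("predicted_answers", list)]

inputs_section = (
    "{" + ", ".join([f'"{name}": {{{{ {name} }}}}' for name, _ in inputs]) + "}"
)

print(inputs_section)
# {"questions": {{ questions }}, "predicted_answers": {{ predicted_answers }}}
```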
@@ -183,7 +183,7 @@ class SASEvaluator:

         # Compute cosine-similarities
         similarity_scores = [
-            util.cos_sim(p, l).cpu().numpy() for p, l in zip(predictions_embeddings, label_embeddings)
+            float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
         ]

         sas_score = np_mean(similarity_scores)
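The `float(...)` wrapper above converts each cosine similarity from a NumPy value to a plain Python float before the mean is taken, presumably so the evaluator returns ordinary scalars rather than NumPy arrays. A minimal sketch of the effect, with NumPy scalars standing in for the values returned by `util.cos_sim(...).cpu().numpy()`:

```python
import numpy as np
from numpy import mean as np_mean

# Stand-ins for the raw similarity values; the real ones come from util.cos_sim().
raw_scores = [np.float64(0.875), np.float64(0.9)]

similarity_scores = [float(s) for s in raw_scores]  # plain Python floats, not NumPy types
sas_score = float(np_mean(similarity_scores))

print(similarity_scores)  # [0.875, 0.9]
print(sas_score)          # 0.8875
```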
@@ -15,19 +15,23 @@ class TestFaithfulnessEvaluator:
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == (
             "Your task is to judge the faithfulness or groundedness of statements based "
-            "on context information. First, please extract statements from a provided "
-            "response to a question. Second, calculate a faithfulness score for each "
-            "statement made in the response. The score is 1 if the statement can be "
+            "on context information. First, please extract statements from a provided predicted "
+            "answer to a question. Second, calculate a faithfulness score for each "
+            "statement made in the predicted answer. The score is 1 if the statement can be "
             "inferred from the provided context or 0 if it cannot be inferred."
         )
-        assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+        assert component.inputs == [
+            ("questions", List[str]),
+            ("contexts", List[List[str]]),
+            ("predicted_answers", List[str]),
+        ]
         assert component.outputs == ["statements", "statement_scores"]
         assert component.examples == [
             {
                 "inputs": {
                     "questions": "What is the capital of Germany and when was it founded?",
                     "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+                    "predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
                 },
                 "outputs": {
                     "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -38,7 +42,7 @@ class TestFaithfulnessEvaluator:
                 "inputs": {
                     "questions": "What is the capital of France?",
                     "contexts": ["Berlin is the capital of Germany."],
-                    "responses": "Paris",
+                    "predicted_answers": "Paris",
                 },
                 "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
             },
@@ -46,7 +50,7 @@ class TestFaithfulnessEvaluator:
                 "inputs": {
                     "questions": "What is the capital of Italy?",
                     "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+                    "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
                 },
                 "outputs": {
                     "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -65,15 +69,21 @@ class TestFaithfulnessEvaluator:
             api_key=Secret.from_token("test-api-key"),
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         assert component.generator.client.api_key == "test-api-key"
         assert component.api == "openai"
         assert component.examples == [
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]

     def test_from_dict(self, monkeypatch):
@@ -84,14 +94,16 @@ class TestFaithfulnessEvaluator:
             "init_parameters": {
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
         component = FaithfulnessEvaluator.from_dict(data)
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]

     def test_run_calculates_mean_score(self, monkeypatch):
@@ -120,11 +132,11 @@ class TestFaithfulnessEvaluator:
                 "programmers write clear, logical code for both small and large-scale software projects."
             ],
         ]
-        responses = [
+        predicted_answers = [
             "Football is the most popular sport with around 4 billion followers worldwide.",
             "Python is a high-level general-purpose programming language that was created by George Lucas.",
         ]
-        results = component.run(questions=questions, contexts=contexts, responses=responses)
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
         assert results == {
             "individual_scores": [0.5, 1],
             "results": [
@@ -148,9 +160,9 @@ class TestFaithfulnessEvaluator:
     def test_live_run(self):
         questions = ["What is Python and who created it?"]
         contexts = [["Python is a programming language created by Guido van Rossum."]]
-        responses = ["Python is a programming language created by George Lucas."]
+        predicted_answers = ["Python is a programming language created by George Lucas."]
         evaluator = FaithfulnessEvaluator()
-        result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+        result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

         required_fields = {"individual_scores", "results", "score"}
         assert all(field in result for field in required_fields)
@@ -11,17 +11,19 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["score"]
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]

     def test_init_fail_wo_openai_api_key(self, monkeypatch):
@@ -30,31 +32,39 @@ class TestLLMEvaluator:
             LLMEvaluator(
                 api="openai",
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )

     def test_init_with_parameters(self):
         component = LLMEvaluator(
             instructions="test-instruction",
             api_key=Secret.from_token("test-api-key"),
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["custom_score"],
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         assert component.generator.client.api_key == "test-api-key"
         assert component.api == "openai"
         assert component.examples == [
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["custom_score"]

     def test_init_with_invalid_parameters(self, monkeypatch):
@@ -63,85 +73,105 @@ class TestLLMEvaluator:
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs={("responses", List[str])},
+                inputs={("predicted_answers", List[str])},
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[(List[str], "responses")],
+                inputs=[(List[str], "predicted_answers")],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
                 inputs=[List[str]],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs={("responses", str)},
+                inputs={("predicted_answers", str)},
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )

         # Invalid outputs
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs="score",
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=[["score"]],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )

         # Invalid examples
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples={
-                    "inputs": {"responses": "Damn, this is straight outta hell!!!"},
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                     "outputs": {"custom_score": 1},
                 },
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
-                    [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}]
+                    [
+                        {
+                            "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                            "outputs": {"custom_score": 1},
+                        }
+                    ]
                 ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
-                    {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}
+                    {
+                        "wrong_key": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                        "outputs": {"custom_score": 1},
+                    }
                 ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
                     {
-                        "inputs": [{"responses": "Damn, this is straight outta hell!!!"}],
+                        "inputs": [{"predicted_answers": "Damn, this is straight outta hell!!!"}],
                         "outputs": [{"custom_score": 1}],
                     }
                 ],
@@ -149,7 +179,7 @@ class TestLLMEvaluator:
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
             )
@@ -158,9 +188,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         data = component.to_dict()
         assert data == {
@@ -169,9 +201,11 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["score"],
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }

@@ -184,19 +218,21 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["score"],
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
         component = LLMEvaluator.from_dict(data)
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["score"]
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]

     def test_to_dict_with_parameters(self, monkeypatch):
@@ -204,12 +240,18 @@ class TestLLMEvaluator:
         component = LLMEvaluator(
             instructions="test-instruction",
             api_key=Secret.from_env_var("ENV_VAR"),
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["custom_score"],
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         data = component.to_dict()
@@ -219,11 +261,17 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["custom_score"],
                 "examples": [
-                    {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                    {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                    {
+                        "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                        "outputs": {"custom_score": 1},
+                    },
+                    {
+                        "inputs": {"predicted_answers": "Football is the most popular sport."},
+                        "outputs": {"custom_score": 0},
+                    },
                 ],
             },
         }
@@ -232,9 +280,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )

         def generator_run(self, *args, **kwargs):
@@ -243,20 +293,23 @@ class TestLLMEvaluator:
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

         with pytest.raises(ValueError):
-            component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]])
+            component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"], ["Paris"]])

         with pytest.raises(ValueError):
             component.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]]
+                questions=["What is the capital of Germany?", "What is the capital of France?"],
+                predicted_answers=[["Berlin"]],
             )

     def test_run_returns_parsed_result(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )

         def generator_run(self, *args, **kwargs):
@@ -264,42 +317,46 @@ class TestLLMEvaluator:

         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

-        results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"])
+        results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
         assert results == {"results": [{"score": 0.5}]}

     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+                {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
             ],
         )
         template = component.prepare_template()
         assert (
             template
-            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n'
+            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"predicted_answers": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"predicted_answers": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"predicted_answers": {{ predicted_answers }}}\nOutputs:\n'
         )

     def test_invalid_input_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         # None of the expected parameters are received
         with pytest.raises(ValueError):
-            component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]})
+            component.validate_input_parameters(
+                expected={"predicted_answers": List[str]}, received={"questions": List[str]}
+            )

         # Only one but not all the expected parameters are received
         with pytest.raises(ValueError):
             component.validate_input_parameters(
-                expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]}
+                expected={"predicted_answers": List[str], "questions": List[str]}, received={"questions": List[str]}
             )

         # Received inputs are not lists
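The `validate_input_parameters` calls above spell out the contract this test exercises: a ValueError when an expected input is missing from the received keyword arguments, and (in the following case) when received values are not lists. A hedged sketch of that kind of check, not the library's actual implementation (the real method lives on `LLMEvaluator`):

```python
from typing import Any, Dict


def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None:
    # Every expected input name must be present among the received keyword arguments.
    missing = [name for name in expected if name not in received]
    if missing:
        raise ValueError(f"Missing expected inputs: {missing}")
    # Each received input must be a list, since the evaluator iterates over its items.
    not_lists = [name for name, value in received.items() if not isinstance(value, list)]
    if not_lists:
        raise ValueError(f"These inputs are not lists: {not_lists}")
```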
@@ -310,9 +367,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         with pytest.raises(ValueError):
             component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')
@@ -325,7 +384,9 @@ class TestLLMEvaluator:
             LLMEvaluator(
                 api="unsupported_api",
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )