fix: Fix deserialization of pipelines that contain LLMEvaluator subclasses (#7891)
This commit is contained in: parent 7c31d5f418, commit fe60eedee9
haystack/components/evaluators/context_relevance.py

@@ -6,9 +6,8 @@ from typing import Any, Dict, List, Optional
 
 from numpy import mean as np_mean
 
-from haystack import default_from_dict
+from haystack import component, default_from_dict, default_to_dict
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
-from haystack.core.component import component
 from haystack.utils import Secret, deserialize_secrets_inplace
 
 # Private global variable for default examples to include in the prompt if the user does not provide any examples
@@ -34,6 +33,7 @@ _DEFAULT_EXAMPLES = [
 ]
 
 
+@component
 class ContextRelevanceEvaluator(LLMEvaluator):
     """
     Evaluator that checks if a provided context is relevant to the question.
@@ -121,7 +121,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
         self.api = api
         self.api_key = api_key
 
-        super().__init__(
+        super(ContextRelevanceEvaluator, self).__init__(
             instructions=self.instructions,
             inputs=self.inputs,
             outputs=self.outputs,
@@ -147,7 +147,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
             - `individual_scores`: A list of context relevance scores for each input question.
             - `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
         """
-        result = super().run(questions=questions, contexts=contexts)
+        result = super(ContextRelevanceEvaluator, self).run(questions=questions, contexts=contexts)
 
         # calculate average statement relevance score per query
         for idx, res in enumerate(result["results"]):
@@ -165,6 +165,22 @@ class ContextRelevanceEvaluator(LLMEvaluator):
 
         return result
 
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        :returns:
+            A dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api=self.api,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            examples=self.examples,
+            progress_bar=self.progress_bar,
+            raise_on_failure=self.raise_on_failure,
+        )
+
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
        """
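The from_dict body is cut off by the diff context above. As a sketch only, the imports this commit keeps (default_from_dict, deserialize_secrets_inplace) point to the usual Haystack pattern; the helper below is hypothetical and written as a standalone function rather than the verbatim classmethod:

    from typing import Any, Dict

    from haystack import default_from_dict
    from haystack.components.evaluators import ContextRelevanceEvaluator
    from haystack.utils import deserialize_secrets_inplace


    def load_context_relevance_evaluator(data: Dict[str, Any]) -> ContextRelevanceEvaluator:
        # Hypothetical helper mirroring the truncated classmethod: rebuild the Secret
        # serialized under "api_key", then let default_from_dict reconstruct the
        # evaluator from its init_parameters.
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(ContextRelevanceEvaluator, data)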
haystack/components/evaluators/faithfulness.py

@@ -6,9 +6,8 @@ from typing import Any, Dict, List, Optional
 
 from numpy import mean as np_mean
 
-from haystack import default_from_dict
+from haystack import component, default_from_dict, default_to_dict
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
-from haystack.core.component import component
 from haystack.utils import Secret, deserialize_secrets_inplace
 
 # Default examples to include in the prompt if the user does not provide any examples
@@ -46,6 +45,7 @@ _DEFAULT_EXAMPLES = [
 ]
 
 
+@component
 class FaithfulnessEvaluator(LLMEvaluator):
     """
     Evaluator that checks if a generated answer can be inferred from the provided contexts.
@@ -134,7 +134,7 @@ class FaithfulnessEvaluator(LLMEvaluator):
         self.api = api
         self.api_key = api_key
 
-        super().__init__(
+        super(FaithfulnessEvaluator, self).__init__(
             instructions=self.instructions,
             inputs=self.inputs,
             outputs=self.outputs,
@@ -162,7 +162,9 @@ class FaithfulnessEvaluator(LLMEvaluator):
             - `individual_scores`: A list of faithfulness scores for each input answer.
             - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
         """
-        result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        result = super(FaithfulnessEvaluator, self).run(
+            questions=questions, contexts=contexts, predicted_answers=predicted_answers
+        )
 
         # calculate average statement faithfulness score per query
         for idx, res in enumerate(result["results"]):
@@ -180,6 +182,22 @@ class FaithfulnessEvaluator(LLMEvaluator):
 
         return result
 
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        :returns:
+            A dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api=self.api,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            examples=self.examples,
+            progress_bar=self.progress_bar,
+            raise_on_failure=self.raise_on_failure,
+        )
+
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "FaithfulnessEvaluator":
        """
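A short usage sketch of the new hooks at the component level, mirroring what the new tests further down exercise. It assumes OPENAI_API_KEY is set in the environment, since these evaluators default to an env-var Secret for api_key:

    from haystack.components.evaluators import FaithfulnessEvaluator

    # Assumes OPENAI_API_KEY is set; the evaluator resolves it when constructed.
    evaluator = FaithfulnessEvaluator(raise_on_failure=False, progress_bar=False)

    # to_dict now records the subclass type plus its own init parameters.
    data = evaluator.to_dict()
    assert data["type"] == "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator"

    # from_dict rebuilds the evaluator from that dictionary.
    restored = FaithfulnessEvaluator.from_dict(data)
    assert isinstance(restored, FaithfulnessEvaluator)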
Release note

@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fix the deserialization of pipelines containing evaluator components that were subclasses of `LLMEvaluator`.
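In practice, a pipeline containing one of these evaluators could be dumped, but loading it back failed; the fix gives each subclass its own to_dict/from_dict and registers it with @component. A minimal round-trip sketch of what the new tests check, assuming OPENAI_API_KEY is set (no LLM call is made during the round trip):

    from haystack import Pipeline
    from haystack.components.evaluators import ContextRelevanceEvaluator

    pipeline = Pipeline()
    pipeline.add_component("evaluator", ContextRelevanceEvaluator())

    # Serialize the pipeline to YAML and load it back. Before this fix, the
    # loads() step raised for pipelines containing LLMEvaluator subclasses.
    yaml_str = pipeline.dumps()
    restored = Pipeline.loads(yaml_str)
    assert restored is not None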
Tests for ContextRelevanceEvaluator

@@ -8,6 +8,7 @@ import math
 
 import pytest
 
+from haystack import Pipeline
 from haystack.components.evaluators import ContextRelevanceEvaluator
 from haystack.utils.auth import Secret
 
@@ -71,6 +72,27 @@ class TestContextRelevanceEvaluator:
             {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
 
+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = ContextRelevanceEvaluator(
+            api="openai",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            examples=[{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+            raise_on_failure=False,
+            progress_bar=False,
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+                "progress_bar": False,
+                "raise_on_failure": False,
+            },
+        }
+
     def test_from_dict(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 
@@ -87,6 +109,10 @@ class TestContextRelevanceEvaluator:
         assert component.generator.client.api_key == "test-api-key"
         assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]
 
+        pipeline = Pipeline()
+        pipeline.add_component("evaluator", component)
+        assert pipeline.loads(pipeline.dumps())
+
     def test_run_calculates_mean_score(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator()
Tests for FaithfulnessEvaluator

@@ -8,6 +8,7 @@ from typing import List
 import numpy as np
 import pytest
 
+from haystack import Pipeline
 from haystack.components.evaluators import FaithfulnessEvaluator
 from haystack.utils.auth import Secret
 
@@ -91,6 +92,31 @@ class TestFaithfulnessEvaluator:
             {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
 
+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = FaithfulnessEvaluator(
+            api="openai",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
+            raise_on_failure=False,
+            progress_bar=False,
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
+                "progress_bar": False,
+                "raise_on_failure": False,
+            },
+        }
+
     def test_from_dict(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
 
@@ -111,6 +137,10 @@ class TestFaithfulnessEvaluator:
             {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
+        pipeline = Pipeline()
+        pipeline.add_component("evaluator", component)
+        assert pipeline.loads(pipeline.dumps())
+
     def test_run_calculates_mean_score(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator()