Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-09-21 22:23:23 +00:00)
feat: Add new LLMEvaluator component (#7401)

* draft llm evaluator
* docstrings
* flexible inputs; validate inputs and outputs
* add tests
* add release note
* remove example
* docstrings
* make outputs parameter optional. default:
* validate init parameters
* linting
* remove mention of binary scores from template
* make examples and outputs params non-optional
* removed leftover from optional outputs param
* simplify building examples section for template
* validate inputs and outputs in examples are dict with str as key
* fix pylint too-many-boolean-expressions
* increase test coverage
parent 42b587ab55
commit bfd0d3eacd
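
For orientation, here is a minimal sketch of how the component introduced by this commit could be wired into a Haystack 2.x `Pipeline`. The component name, instruction text, and sample data are illustrative only and not part of the commit; it assumes `OPENAI_API_KEY` is set in the environment.

```python
from typing import List

from haystack import Pipeline
from haystack.components.evaluators import LLMEvaluator

# Hypothetical metric: flag responses that are problematic for children.
evaluator = LLMEvaluator(
    instructions="Is this answer problematic for children?",
    inputs=[("responses", List[str])],
    outputs=["score"],
    examples=[
        {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
        {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
    ],
)

pipeline = Pipeline()
pipeline.add_component("llm_evaluator", evaluator)
result = pipeline.run(
    {
        "llm_evaluator": {
            "responses": [
                "Football is the most popular sport with around 4 billion followers worldwide",
                "Python language was created by Guido van Rossum.",
            ]
        }
    }
)
print(result["llm_evaluator"]["results"])  # e.g. [{"score": 0, "name": "llm"}, ...]
```

Because the component registers its input sockets dynamically from the `inputs` parameter (via `component.set_input_types`), `responses` behaves like a regular pipeline input here.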
haystack/components/evaluators/__init__.py
@@ -1,3 +1,4 @@
 from .answer_exact_match import AnswerExactMatchEvaluator
+from .llm_evaluator import LLMEvaluator
 
-__all__ = ["AnswerExactMatchEvaluator"]
+__all__ = ["AnswerExactMatchEvaluator", "LLMEvaluator"]
haystack/components/evaluators/llm_evaluator.py (new file, 298 lines)
@@ -0,0 +1,298 @@
import json
from typing import Any, Dict, List, Tuple, Type

from haystack import component, default_from_dict, default_to_dict
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret, deserialize_secrets_inplace


@component
class LLMEvaluator:
"""
|
||||
Uses an LLM to evaluate inputs based on a prompt containing instructions and examples.
|
||||
|
||||
The default API requires an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY".
|
||||
The inputs are lists that are user-defined depending on the desired metric.
|
||||
The output is a dictionary with a key `results` containing a list of evaluation results.
|
||||
Each result is a dictionary with user-defined keys and values of either 0 for FALSE or 1 for TRUE respectively.
|
||||
|
||||
Usage example:
|
||||
```python
|
||||
from haystack.components.evaluators import LLMEvaluator
|
||||
evaluator = LLMEvaluator(
|
||||
instructions="Is this answer problematic for children?",
|
||||
inputs=[("responses", List[str])],
|
||||
outputs=["score"],
|
||||
)
|
||||
RESPONSES = [
|
||||
"Football is the most popular sport with around 4 billion followers worldwide",
|
||||
"Python language was created by Guido van Rossum.",
|
||||
]
|
||||
results = evaluator.run(responses=RESPONSES)
|
||||
```
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        instructions: str,
        inputs: List[Tuple[str, Type[List]]],
        outputs: List[str],
        examples: List[Dict[str, Any]],
        *,
        api: str = "openai",
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
    ):
        """
        Creates an instance of LLMEvaluator.

        :param instructions:
            The prompt instructions to use for evaluation.
            Should be a question about the inputs that can be answered with yes or no.
        :param inputs:
            The inputs that the component expects as incoming connections and that it evaluates.
            Each input is a tuple of an input name and input type. Input types must be lists.
        :param outputs:
            Output names of the evaluation results. They correspond to keys in the output dictionary.
        :param examples:
            Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
            `outputs` parameters.
            Each example is a dictionary with keys "inputs" and "outputs".
            They contain the input and output as dictionaries respectively.
        :param api:
            The API to use for calling an LLM through a Generator.
            Supported APIs: "openai".
        :param api_key:
            The API key.
        """
        self.validate_init_parameters(inputs, outputs, examples)

        self.instructions = instructions
        self.inputs = inputs
        self.outputs = outputs
        self.examples = examples
        self.api = api
        self.api_key = api_key

        if api == "openai":
            self.generator = OpenAIGenerator(api_key=api_key)
        else:
            raise ValueError(f"Unsupported API: {api}")

        template = self.prepare_template()
        self.builder = PromptBuilder(template=template)

        component.set_input_types(self, **dict(inputs))

    def validate_init_parameters(
        self, inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]]
    ):
        """
        Validate the init parameters.

        :param inputs:
            The inputs to validate.
        :param outputs:
            The outputs to validate.
        :param examples:
            The examples to validate.

        :raises ValueError:
            If the inputs are not a list of tuples with a string and a type of list.
            If the outputs are not a list of strings.
            If the examples are not a list of dictionaries.
            If any example does not have keys "inputs" and "outputs" with values that are dictionaries with string keys.
        """
        # Validate inputs
        if (
            not isinstance(inputs, list)
            or not all(isinstance(input, tuple) for input in inputs)
            or not all(isinstance(input[0], str) and input[1] is not list and len(input) == 2 for input in inputs)
        ):
            msg = (
                f"LLM evaluator expects inputs to be a list of tuples. Each tuple must contain an input name and "
                f"type of list but received {inputs}."
            )
            raise ValueError(msg)

        # Validate outputs
        if not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs):
            msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}."
            raise ValueError(msg)

        # Validate examples are lists of dicts
        if not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples):
            msg = f"LLM evaluator expects examples to be a list of dictionaries but received {examples}."
            raise ValueError(msg)

        # Validate each example
        for example in examples:
            if (
                {"inputs", "outputs"} != example.keys()
                or not all(isinstance(example[param], dict) for param in ["inputs", "outputs"])
                or not all(isinstance(key, str) for param in ["inputs", "outputs"] for key in example[param])
            ):
                msg = (
                    f"LLM evaluator expects each example to have keys `inputs` and `outputs` with values that are "
                    f"dictionaries with str keys but received {example}."
                )
                raise ValueError(msg)

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run the LLM evaluator.

        :param inputs:
            The input values to evaluate. The keys are the input names and the values are lists of input values.
        :returns:
            A dictionary with a single `results` entry that contains a list of results.
            Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
            and the evaluation results as the values.
        """
        self.validate_input_parameters(dict(self.inputs), inputs)

        # inputs is a dictionary with keys being input names and values being a list of input values
        # We need to iterate through the lists in parallel for all keys of the dictionary
        input_names, values = inputs.keys(), list(zip(*inputs.values()))
        list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]

        results = []
        for input_names_to_values in list_of_input_names_to_values:
            prompt = self.builder.run(**input_names_to_values)
            result = self.generator.run(prompt=prompt["prompt"])

            self.validate_outputs(expected=self.outputs, received=result["replies"][0])
            parsed_result = json.loads(result["replies"][0])
            parsed_result["name"] = "llm"
            results.append(parsed_result)

        return {"results": results}

    def prepare_template(self) -> str:
        """
        Combine instructions, inputs, outputs, and examples into one prompt template with the following format:
        Instructions:
        <instructions>

        Generate the response in JSON format with the following keys:
        <list of output keys>
        Consider the instructions and the examples below to determine those values.

        Examples:
        <examples>

        Inputs:
        <inputs>
        Outputs:

        :returns:
            The prompt template.
        """
        inputs_section = (
            "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
        )

        examples_section = "\n".join(
            [
                "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"])
                for example in self.examples
            ]
        )
        return (
            f"Instructions:\n"
            f"{self.instructions}\n\n"
            f"Generate the response in JSON format with the following keys:\n"
            f"{json.dumps(self.outputs)}\n"
            f"Consider the instructions and the examples below to determine those values.\n\n"
            f"Examples:\n"
            f"{examples_section}\n\n"
            f"Inputs:\n"
            f"{inputs_section}\n"
            f"Outputs:\n"
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            The serialized component as a dictionary.
        """
        return default_to_dict(
            self,
            instructions=self.instructions,
            inputs=self.inputs,
            outputs=self.outputs,
            examples=self.examples,
            api=self.api,
            api_key=self.api_key.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    @staticmethod
    def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None:
        """
        Validate the input parameters.

        :param expected:
            The expected input parameters.
        :param received:
            The received input parameters.

        :raises ValueError:
            If not all expected inputs are present in the received inputs
            If the received inputs are not lists or have different lengths
        """
        # Validate that all expected inputs are present in the received inputs
        for param in expected.keys():
            if param not in received:
                msg = f"LLM evaluator expected input parameter '{param}' but received only {received.keys()}."
                raise ValueError(msg)

        # Validate that all received inputs are lists
        if not all(isinstance(input, list) for input in received.values()):
            msg = f"LLM evaluator expects all input values to be lists but received {[type(input) for input in received.values()]}."
            raise ValueError(msg)

        # Validate that all received inputs are of the same length
        inputs = received.values()
        length = len(next(iter(inputs)))
        if not all(len(input) == length for input in inputs):
            msg = (
                f"LLM evaluator expects all input lists to have the same length but received {inputs} with lengths "
                f"{[len(input) for input in inputs]}."
            )
            raise ValueError(msg)

    @staticmethod
    def validate_outputs(expected: List[str], received: str) -> None:
        """
        Validate the output.

        :param expected:
            Names of expected outputs
        :param received:
            Names of received outputs

        :raises ValueError:
            If not all expected outputs are present in the received outputs
        """
        parsed_output = json.loads(received)
        if not all(output in parsed_output for output in expected):
            msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}."
            raise ValueError(msg)
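
To make the zipping logic in `run()` above concrete, the standalone sketch below reproduces how parallel input lists are turned into one dictionary of prompt variables per row before each prompt is built. The question and answer values are illustrative only.

```python
# Standalone illustration of the zipping done in LLMEvaluator.run():
# each position across the parallel input lists becomes one prompt's variables.
inputs = {
    "questions": ["What is the capital of Germany?", "What is the capital of France?"],
    "responses": ["Berlin", "Paris"],
}

input_names, values = inputs.keys(), list(zip(*inputs.values()))
list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]

print(list_of_input_names_to_values)
# [{'questions': 'What is the capital of Germany?', 'responses': 'Berlin'},
#  {'questions': 'What is the capital of France?', 'responses': 'Paris'}]
```

Each of these per-row dictionaries is passed to the `PromptBuilder` and the resulting prompt is sent to the generator, which is why all input lists must have the same length.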
releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml (new file, 4 lines)
@@ -0,0 +1,4 @@
---
features:
  - |
    Add a new LLMEvaluator component that leverages LLMs through the OpenAI API to evaluate pipelines.
test/components/evaluators/test_llm_evaluator.py (new file, 331 lines)
@@ -0,0 +1,331 @@
from typing import List

import pytest

from haystack.components.evaluators import LLMEvaluator
from haystack.utils.auth import Secret


class TestLLMEvaluator:
    def test_init_default(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("responses", List[str])],
            outputs=["score"],
            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
        )
        assert component.api == "openai"
        assert component.generator.client.api_key == "test-api-key"
        assert component.instructions == "test-instruction"
        assert component.inputs == [("responses", List[str])]
        assert component.outputs == ["score"]
        assert component.examples == [
            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
        ]

    def test_init_fail_wo_openai_api_key(self, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
            LLMEvaluator(
                api="openai",
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )

    def test_init_with_parameters(self):
        component = LLMEvaluator(
            instructions="test-instruction",
            api_key=Secret.from_token("test-api-key"),
            inputs=[("responses", List[str])],
            outputs=["custom_score"],
            api="openai",
            examples=[
                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
            ],
        )
        assert component.generator.client.api_key == "test-api-key"
        assert component.api == "openai"
        assert component.examples == [
            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
        ]
        assert component.instructions == "test-instruction"
        assert component.inputs == [("responses", List[str])]
        assert component.outputs == ["custom_score"]

    def test_init_with_invalid_parameters(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        # Invalid inputs
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs={("responses", List[str])},
                outputs=["score"],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[(List[str], "responses")],
                outputs=["score"],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[List[str]],
                outputs=["score"],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs={("responses", str)},
                outputs=["score"],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )

        # Invalid outputs
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs="score",
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=[["score"]],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )

        # Invalid examples
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples={
                    "inputs": {"responses": "Damn, this is straight outta hell!!!"},
                    "outputs": {"custom_score": 1},
                },
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples=[
                    [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}]
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples=[
                    {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples=[
                    {
                        "inputs": [{"responses": "Damn, this is straight outta hell!!!"}],
                        "outputs": [{"custom_score": 1}],
                    }
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
            )

    def test_to_dict_default(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("responses", List[str])],
            outputs=["score"],
            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
            "init_parameters": {
                "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                "api": "openai",
                "instructions": "test-instruction",
                "inputs": [("responses", List[str])],
                "outputs": ["score"],
                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            },
        }

    def test_from_dict(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

        data = {
            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
            "init_parameters": {
                "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                "api": "openai",
                "instructions": "test-instruction",
                "inputs": [("responses", List[str])],
                "outputs": ["score"],
                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            },
        }
        component = LLMEvaluator.from_dict(data)
        assert component.api == "openai"
        assert component.generator.client.api_key == "test-api-key"
        assert component.instructions == "test-instruction"
        assert component.inputs == [("responses", List[str])]
        assert component.outputs == ["score"]
        assert component.examples == [
            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
        ]

    def test_to_dict_with_parameters(self, monkeypatch):
        monkeypatch.setenv("ENV_VAR", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            api_key=Secret.from_env_var("ENV_VAR"),
            inputs=[("responses", List[str])],
            outputs=["custom_score"],
            api="openai",
            examples=[
                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
            ],
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
            "init_parameters": {
                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
                "api": "openai",
                "instructions": "test-instruction",
                "inputs": [("responses", List[str])],
                "outputs": ["custom_score"],
                "examples": [
                    {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
                    {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
                ],
            },
        }

    def test_run_with_different_lengths(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("questions", List[str]), ("responses", List[List[str]])],
            outputs=["score"],
            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
        )

        def generator_run(self, *args, **kwargs):
            return {"replies": ['{"score": 0.5}']}

        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

        with pytest.raises(ValueError):
            component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]])

        with pytest.raises(ValueError):
            component.run(
                questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]]
            )

    def test_run_returns_parsed_result(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("questions", List[str]), ("responses", List[List[str]])],
            outputs=["score"],
            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
        )

        def generator_run(self, *args, **kwargs):
            return {"replies": ['{"score": 0.5}']}

        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

        results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"])
        assert results == {"results": [{"score": 0.5, "name": "llm"}]}

    def test_prepare_template(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("responses", List[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
            ],
        )
        template = component.prepare_template()
        assert (
            template
            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n'
        )

    def test_invalid_input_parameters(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("responses", List[str])],
            outputs=["score"],
            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
        )
        # None of the expected parameters are received
        with pytest.raises(ValueError):
            component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]})

        # Only one but not all the expected parameters are received
        with pytest.raises(ValueError):
            component.validate_input_parameters(
                expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]}
            )

        # Received inputs are not lists
        with pytest.raises(ValueError):
            component.validate_input_parameters(expected={"questions": List[str]}, received={"questions": str})

    def test_invalid_outputs(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("responses", List[str])],
            outputs=["score"],
            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
        )
        with pytest.raises(ValueError):
            component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')

        with pytest.raises(ValueError):
            component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}')

    def test_unsupported_api(self):
        with pytest.raises(ValueError):
            LLMEvaluator(
                api="unsupported_api",
                instructions="test-instruction",
                inputs=[("responses", List[str])],
                outputs=["score"],
                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
            )