diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index d78ccfc74..6402ef3ba 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -113,7 +113,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
             api_key=self.api_key,
         )
 
-    @component.output_types(results=List[Dict[str, Any]])
+    @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
     def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 827499038..7c794345d 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -13,7 +13,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of Germany and when was it founded?",
             "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+            "predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
         },
         "outputs": {
             "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -24,7 +24,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of France?",
             "contexts": ["Berlin is the capital of Germany."],
-            "responses": "Paris",
+            "predicted_answers": "Paris",
         },
         "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
     },
@@ -32,7 +32,7 @@ _DEFAULT_EXAMPLES = [
         "inputs": {
             "questions": "What is the capital of Italy?",
             "contexts": ["Rome is the capital of Italy."],
-            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+            "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
         },
         "outputs": {
             "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -60,9 +60,9 @@ class FaithfulnessEvaluator(LLMEvaluator):
             "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
         ],
     ]
-    responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
+    predicted_answers = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
     evaluator = FaithfulnessEvaluator()
-    result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+    result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
     print(result["individual_scores"])
     # [0.5]
 
@@ -87,13 +87,13 @@ class FaithfulnessEvaluator(LLMEvaluator):
             Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
             Default examples will be used if none are provided.
             Each example must be a dictionary with keys "inputs" and "outputs".
-            "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
+            "inputs" must be a dictionary with keys "questions", "contexts", and "predicted_answers".
             "outputs" must be a dictionary with "statements" and "statement_scores".
             Expected format:
             [{
                 "inputs": {
                     "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+                    "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
                 },
                 "outputs": {
                     "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -110,11 +110,11 @@ class FaithfulnessEvaluator(LLMEvaluator):
         self.instructions = (
             "Your task is to judge the faithfulness or groundedness of statements based "
             "on context information. First, please extract statements from a provided "
-            "response to a question. Second, calculate a faithfulness score for each "
-            "statement made in the response. The score is 1 if the statement can be "
+            "predicted answer to a question. Second, calculate a faithfulness score for each "
+            "statement made in the predicted answer. The score is 1 if the statement can be "
             "inferred from the provided context or 0 if it cannot be inferred."
         )
-        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("predicted_answers", List[str])]
         self.outputs = ["statements", "statement_scores"]
         self.examples = examples or _DEFAULT_EXAMPLES
         self.api = api
@@ -129,8 +129,8 @@ class FaithfulnessEvaluator(LLMEvaluator):
             api_key=self.api_key,
         )
 
-    @component.output_types(results=List[Dict[str, Any]])
-    def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
+    @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
+    def run(self, questions: List[str], contexts: List[List[str]], predicted_answers: List[str]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.
 
@@ -138,15 +138,15 @@ class FaithfulnessEvaluator(LLMEvaluator):
             A list of questions.
         :param contexts:
             A nested list of contexts that correspond to the questions.
-        :param responses:
-            A list of responses.
+        :param predicted_answers:
+            A list of predicted answers.
         :returns:
             A dictionary with the following outputs:
                 - `score`: Mean faithfulness score over all the provided input answers.
                 - `individual_scores`: A list of faithfulness scores for each input answer.
                 - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
         """
-        result = super().run(questions=questions, contexts=contexts, responses=responses)
+        result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
         # calculate average statement faithfulness score per query
         for res in result["results"]:
diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py
index 4191dfc92..1e06e5ff4 100644
--- a/haystack/components/evaluators/llm_evaluator.py
+++ b/haystack/components/evaluators/llm_evaluator.py
@@ -23,18 +23,18 @@ class LLMEvaluator:
     from haystack.components.evaluators import LLMEvaluator
     evaluator = LLMEvaluator(
         instructions="Is this answer problematic for children?",
-        inputs=[("responses", List[str])],
+        inputs=[("predicted_answers", List[str])],
         outputs=["score"],
         examples=[
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
         ],
     )
-    RESPONSES = [
+    predicted_answers = [
         "Football is the most popular sport with around 4 billion followers worldwide",
         "Python language was created by Guido van Rossum.",
     ]
-    results = evaluator.run(responses=RESPONSES)
+    results = evaluator.run(predicted_answers=predicted_answers)
     print(results)
     # {'results': [{'score': 0}, {'score': 0}]}
     ```
@@ -199,7 +199,7 @@ class LLMEvaluator:
             The prompt template.
         """
         inputs_section = (
-            "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
+            "{" + ", ".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
         )
 
         examples_section = "\n".join(
diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py
index d7ab26a4d..6af8dd335 100644
--- a/haystack/components/evaluators/sas_evaluator.py
+++ b/haystack/components/evaluators/sas_evaluator.py
@@ -183,7 +183,7 @@ class SASEvaluator:
 
             # Compute cosine-similarities
             similarity_scores = [
-                util.cos_sim(p, l).cpu().numpy() for p, l in zip(predictions_embeddings, label_embeddings)
+                float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
             ]
             sas_score = np_mean(similarity_scores)
 
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index 7219c85d5..cacc53cd7 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -15,19 +15,23 @@ class TestFaithfulnessEvaluator:
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == (
             "Your task is to judge the faithfulness or groundedness of statements based "
-            "on context information. First, please extract statements from a provided "
-            "response to a question. Second, calculate a faithfulness score for each "
-            "statement made in the response. The score is 1 if the statement can be "
+            "on context information. First, please extract statements from a provided predicted "
+            "answer to a question. Second, calculate a faithfulness score for each "
+            "statement made in the predicted answer. The score is 1 if the statement can be "
             "inferred from the provided context or 0 if it cannot be inferred."
         )
-        assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
+        assert component.inputs == [
+            ("questions", List[str]),
+            ("contexts", List[List[str]]),
+            ("predicted_answers", List[str]),
+        ]
         assert component.outputs == ["statements", "statement_scores"]
         assert component.examples == [
             {
                 "inputs": {
                     "questions": "What is the capital of Germany and when was it founded?",
                     "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+                    "predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
                 },
                 "outputs": {
                     "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -38,7 +42,7 @@ class TestFaithfulnessEvaluator:
                 "inputs": {
                     "questions": "What is the capital of France?",
                     "contexts": ["Berlin is the capital of Germany."],
-                    "responses": "Paris",
+                    "predicted_answers": "Paris",
                 },
                 "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
             },
@@ -46,7 +50,7 @@ class TestFaithfulnessEvaluator:
                 "inputs": {
                     "questions": "What is the capital of Italy?",
                     "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+                    "predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
                 },
                 "outputs": {
                     "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -65,15 +69,21 @@ class TestFaithfulnessEvaluator:
             api_key=Secret.from_token("test-api-key"),
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         assert component.generator.client.api_key == "test-api-key"
         assert component.api == "openai"
         assert component.examples == [
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
 
     def test_from_dict(self, monkeypatch):
@@ -84,14 +94,16 @@ class TestFaithfulnessEvaluator:
             "init_parameters": {
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
         component = FaithfulnessEvaluator.from_dict(data)
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
     def test_run_calculates_mean_score(self, monkeypatch):
@@ -120,11 +132,11 @@ class TestFaithfulnessEvaluator:
                 "programmers write clear, logical code for both small and large-scale software projects."
             ],
         ]
-        responses = [
+        predicted_answers = [
            "Football is the most popular sport with around 4 billion followers worldwide.",
            "Python is a high-level general-purpose programming language that was created by George Lucas.",
        ]
-        results = component.run(questions=questions, contexts=contexts, responses=responses)
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
         assert results == {
             "individual_scores": [0.5, 1],
             "results": [
@@ -148,9 +160,9 @@ class TestFaithfulnessEvaluator:
     def test_live_run(self):
         questions = ["What is Python and who created it?"]
         contexts = [["Python is a programming language created by Guido van Rossum."]]
-        responses = ["Python is a programming language created by George Lucas."]
+        predicted_answers = ["Python is a programming language created by George Lucas."]
         evaluator = FaithfulnessEvaluator()
-        result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+        result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
         required_fields = {"individual_scores", "results", "score"}
         assert all(field in result for field in required_fields)
 
diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py
index 5960e32da..9755b9df0 100644
--- a/test/components/evaluators/test_llm_evaluator.py
+++ b/test/components/evaluators/test_llm_evaluator.py
@@ -11,17 +11,19 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["score"]
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
     def test_init_fail_wo_openai_api_key(self, monkeypatch):
@@ -30,31 +32,39 @@ class TestLLMEvaluator:
             LLMEvaluator(
                 api="openai",
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
 
     def test_init_with_parameters(self):
         component = LLMEvaluator(
             instructions="test-instruction",
             api_key=Secret.from_token("test-api-key"),
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["custom_score"],
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         assert component.generator.client.api_key == "test-api-key"
         assert component.api == "openai"
         assert component.examples == [
-            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
         ]
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["custom_score"]
 
     def test_init_with_invalid_parameters(self, monkeypatch):
@@ -63,85 +73,105 @@ class TestLLMEvaluator:
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs={("responses", List[str])},
+                inputs={("predicted_answers", List[str])},
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[(List[str], "responses")],
+                inputs=[(List[str], "predicted_answers")],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
                 inputs=[List[str]],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs={("responses", str)},
+                inputs={("predicted_answers", str)},
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
 
         # Invalid outputs
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs="score",
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=[["score"]],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
 
         # Invalid examples
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples={
-                    "inputs": {"responses": "Damn, this is straight outta hell!!!"},
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                     "outputs": {"custom_score": 1},
                 },
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
-                    [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}]
+                    [
+                        {
+                            "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                            "outputs": {"custom_score": 1},
+                        }
+                    ]
                 ],
             )
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
-                outputs=["score"],
-                examples=[
-                    {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}
-                ],
-            )
-        with pytest.raises(ValueError):
-            LLMEvaluator(
-                instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[
                     {
-                        "inputs": [{"responses": "Damn, this is straight outta hell!!!"}],
+                        "wrong_key": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                        "outputs": {"custom_score": 1},
+                    }
+                ],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("predicted_answers", List[str])],
+                outputs=["score"],
+                examples=[
+                    {
+                        "inputs": [{"predicted_answers": "Damn, this is straight outta hell!!!"}],
                         "outputs": [{"custom_score": 1}],
                     }
                 ],
@@ -149,7 +179,7 @@ class TestLLMEvaluator:
         with pytest.raises(ValueError):
             LLMEvaluator(
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
                 examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
             )
@@ -158,9 +188,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         data = component.to_dict()
         assert data == {
@@ -169,9 +201,11 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["score"],
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
 
@@ -184,19 +218,21 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["score"],
-                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             },
         }
         component = LLMEvaluator.from_dict(data)
         assert component.api == "openai"
         assert component.generator.client.api_key == "test-api-key"
         assert component.instructions == "test-instruction"
-        assert component.inputs == [("responses", List[str])]
+        assert component.inputs == [("predicted_answers", List[str])]
         assert component.outputs == ["score"]
         assert component.examples == [
-            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
         ]
 
     def test_to_dict_with_parameters(self, monkeypatch):
@@ -204,12 +240,18 @@ class TestLLMEvaluator:
         component = LLMEvaluator(
             instructions="test-instruction",
             api_key=Secret.from_env_var("ENV_VAR"),
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["custom_score"],
             api="openai",
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                {
+                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+                {
+                    "inputs": {"predicted_answers": "Football is the most popular sport."},
+                    "outputs": {"custom_score": 0},
+                },
             ],
         )
         data = component.to_dict()
@@ -219,11 +261,17 @@ class TestLLMEvaluator:
                 "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
                 "api": "openai",
                 "instructions": "test-instruction",
-                "inputs": [("responses", List[str])],
+                "inputs": [("predicted_answers", List[str])],
                 "outputs": ["custom_score"],
                 "examples": [
-                    {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
-                    {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                    {
+                        "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
+                        "outputs": {"custom_score": 1},
+                    },
+                    {
+                        "inputs": {"predicted_answers": "Football is the most popular sport."},
+                        "outputs": {"custom_score": 0},
+                    },
                 ],
             },
         }
@@ -232,9 +280,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
 
         def generator_run(self, *args, **kwargs):
@@ -243,20 +293,23 @@ class TestLLMEvaluator:
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
 
         with pytest.raises(ValueError):
-            component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]])
+            component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"], ["Paris"]])
 
         with pytest.raises(ValueError):
             component.run(
-                questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]]
+                questions=["What is the capital of Germany?", "What is the capital of France?"],
+                predicted_answers=[["Berlin"]],
             )
 
     def test_run_returns_parsed_result(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            inputs=[("questions", List[str]), ("predicted_answers", List[List[str]])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
 
         def generator_run(self, *args, **kwargs):
@@ -264,42 +317,46 @@ class TestLLMEvaluator:
 
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
 
-        results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"])
+        results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
         assert results == {"results": [{"score": 0.5}]}
 
     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
             examples=[
-                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
-                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+                {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
             ],
         )
         template = component.prepare_template()
         assert (
             template
-            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n'
+            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"predicted_answers": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"predicted_answers": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"predicted_answers": {{ predicted_answers }}}\nOutputs:\n'
         )
 
     def test_invalid_input_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         # None of the expected parameters are received
         with pytest.raises(ValueError):
-            component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]})
+            component.validate_input_parameters(
+                expected={"predicted_answers": List[str]}, received={"questions": List[str]}
+            )
 
         # Only one but not all the expected parameters are received
         with pytest.raises(ValueError):
             component.validate_input_parameters(
-                expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]}
+                expected={"predicted_answers": List[str], "questions": List[str]}, received={"questions": List[str]}
             )
 
         # Received inputs are not lists
@@ -310,9 +367,11 @@ class TestLLMEvaluator:
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = LLMEvaluator(
             instructions="test-instruction",
-            inputs=[("responses", List[str])],
+            inputs=[("predicted_answers", List[str])],
             outputs=["score"],
-            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
         )
         with pytest.raises(ValueError):
             component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')
@@ -325,7 +384,9 @@ class TestLLMEvaluator:
             LLMEvaluator(
                 api="unsupported_api",
                 instructions="test-instruction",
-                inputs=[("responses", List[str])],
+                inputs=[("predicted_answers", List[str])],
                 outputs=["score"],
-                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+                examples=[
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
             )
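Appended for reference, not part of the patch: a minimal usage sketch of the renamed interface, mirroring the updated `FaithfulnessEvaluator` docstring and `test_live_run` above. It assumes a haystack-ai build that already includes these changes and an `OPENAI_API_KEY` in the environment; the question, context, and answer strings are taken from the diff for illustration.

```python
# Usage sketch for the renamed evaluator input (responses -> predicted_answers).
# Assumes haystack-ai with this patch applied and OPENAI_API_KEY exported.
from haystack.components.evaluators import FaithfulnessEvaluator

questions = ["What is Python and who created it?"]
contexts = [["Python is a programming language created by Guido van Rossum."]]
predicted_answers = ["Python is a programming language created by George Lucas."]

evaluator = FaithfulnessEvaluator()
# The keyword is now `predicted_answers`; `responses=` no longer matches an input socket.
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

# run() now declares `individual_scores`, `score`, and `results` as output types.
print(result["individual_scores"])  # e.g. [0]
print(result["score"])              # mean faithfulness score over all answers
print(result["results"])            # per-answer statements and statement_scores
```

The same rename applies to `LLMEvaluator` inputs and to the `examples` few-shot keys, as exercised in the updated tests.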