diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py
index 671ca57a5..22885f87e 100644
--- a/haystack/components/evaluators/llm_evaluator.py
+++ b/haystack/components/evaluators/llm_evaluator.py
@@ -171,10 +171,12 @@ class LLMEvaluator:
         :param inputs:
             The input values to evaluate. The keys are the input names and the values are lists of input values.
         :returns:
-            A dictionary with a single `results` entry that contains a list of results.
+            A dictionary with a `results` entry that contains a list of results.
             Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
             and the evaluation results as the values. If an exception occurs for a particular input value, the result
             will be `None` for that entry.
+            If the API is "openai" and the response contains a "meta" key, the metadata from OpenAI will be included
+            in the output dictionary, under the key "meta".
         :raises ValueError:
             Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have
             different lengths, or if the output is not a valid JSON or doesn't contain the expected keys.
@@ -187,6 +189,7 @@ class LLMEvaluator:
         list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]
 
         results: List[Optional[Dict[str, Any]]] = []
+        metadata = None
         errors = 0
         for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
             prompt = self.builder.run(**input_names_to_values)
@@ -208,11 +211,14 @@ class LLMEvaluator:
                 results.append(None)
                 errors += 1
 
+            if self.api == "openai" and "meta" in result:
+                metadata = result["meta"]
+
         if errors > 0:
             msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs."
             warn(msg)
 
-        return {"results": results}
+        return {"results": results, "meta": metadata}
 
     def prepare_template(self) -> str:
         """
diff --git a/releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml b/releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml
new file mode 100644
index 000000000..f8de43bb1
--- /dev/null
+++ b/releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml
@@ -0,0 +1,5 @@
+---
+
+enhancements:
+  - |
+    When using "openai" for the LLM-based evaluators the metadata from OpenAI will be in the output dictionary, under the key "meta".
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
index 1214bd0cf..1a7daa0bc 100644
--- a/test/components/evaluators/test_context_relevance_evaluator.py
+++ b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -160,6 +160,7 @@ class TestContextRelevanceEvaluator:
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }
 
     def test_run_no_statements_extracted(self, monkeypatch):
@@ -192,6 +193,7 @@ class TestContextRelevanceEvaluator:
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }
 
     def test_run_missing_parameters(self, monkeypatch):
@@ -256,6 +258,11 @@ class TestContextRelevanceEvaluator:
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)
 
+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index c24df9458..7616a8849 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -179,6 +179,7 @@ class TestFaithfulnessEvaluator:
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }
 
     def test_run_no_statements_extracted(self, monkeypatch):
@@ -215,6 +216,7 @@ class TestFaithfulnessEvaluator:
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }
 
     def test_run_missing_parameters(self, monkeypatch):
@@ -282,3 +284,9 @@ class TestFaithfulnessEvaluator:
         assert all(field in result for field in required_fields)
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)
+
+        # assert that metadata is present in the result
+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py
index d3ec4d29e..edab87349 100644
--- a/test/components/evaluators/test_llm_evaluator.py
+++ b/test/components/evaluators/test_llm_evaluator.py
@@ -339,7 +339,7 @@ class TestLLMEvaluator:
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
         results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
 
-        assert results == {"results": [{"score": 0.5}]}
+        assert results == {"results": [{"score": 0.5}], "meta": None}
 
     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
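For illustration, a minimal sketch of how the new "meta" entry might be consumed, assuming the behavior added in the patch above: a FaithfulnessEvaluator running against the default "openai" API with OPENAI_API_KEY set. The questions, contexts, and predicted answers are made-up sample data, not taken from the patch.

from haystack.components.evaluators import FaithfulnessEvaluator

# Uses the default api="openai"; requires OPENAI_API_KEY in the environment.
evaluator = FaithfulnessEvaluator()

# Sample inputs for illustration only.
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[["Python was created by Guido van Rossum."]],
    predicted_answers=["Guido van Rossum created the Python language."],
)

# Existing outputs: per-input results and the aggregate score.
print(result["score"], result["results"])

# New: OpenAI metadata is returned under "meta" (None when no metadata is available);
# token usage sits in the "usage" dict of each entry, as the tests above assert.
if result["meta"] is not None:
    usage = result["meta"][0]["usage"]
    print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])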