feat: LLM-based evaluators return meta info from OpenAI (#7947)

* LLM-Evaluator returns metadata from OpenAI

* adding tests

* adding release notes

* updating test

* updating release notes

* fixing live tests

* attending PR comments

* fixing tests

* Update releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update llm_evaluator.py

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
David S. Batista 2024-07-02 11:31:51 +02:00 committed by GitHub
parent 3068ea258b
commit 186512459d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 29 additions and 3 deletions

@@ -171,10 +171,12 @@ class LLMEvaluator:
         :param inputs:
             The input values to evaluate. The keys are the input names and the values are lists of input values.
         :returns:
-            A dictionary with a single `results` entry that contains a list of results.
+            A dictionary with a `results` entry that contains a list of results.
             Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
             and the evaluation results as the values. If an exception occurs for a particular input value, the result
             will be `None` for that entry.
+            If the API is "openai" and the response contains a "meta" key, the metadata from OpenAI will be included
+            in the output dictionary, under the key "meta".
         :raises ValueError:
             Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have
             different lengths, or if the output is not a valid JSON or doesn't contain the expected keys.
@@ -187,6 +189,7 @@ class LLMEvaluator:
         list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]
 
         results: List[Optional[Dict[str, Any]]] = []
+        metadata = None
         errors = 0
         for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
             prompt = self.builder.run(**input_names_to_values)
@@ -208,11 +211,14 @@
                 results.append(None)
                 errors += 1
 
+            if self.api == "openai" and "meta" in result:
+                metadata = result["meta"]
+
         if errors > 0:
             msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs."
             warn(msg)
 
-        return {"results": results}
+        return {"results": results, "meta": metadata}
 
     def prepare_template(self) -> str:
         """

@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    When using "openai" for the LLM-based evaluators the metadata from OpenAI will be in the output dictionary, under the key "meta".
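
Roughly, the output shape this note describes looks as follows. This is only a sketch: the values are made up, and only the "usage" key names are taken from the live-test assertions added further down in this commit; any additional fields in the "meta" entries depend on the generator.

# Illustrative shape of the evaluator output when api="openai" (values invented).
expected_shape = {
    "results": [{"score": 1}],
    "meta": [{"usage": {"prompt_tokens": 120, "completion_tokens": 18, "total_tokens": 138}}],
}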

@@ -160,6 +160,7 @@ class TestContextRelevanceEvaluator:
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }
 
     def test_run_no_statements_extracted(self, monkeypatch):
@@ -192,6 +193,7 @@ class TestContextRelevanceEvaluator:
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }
 
     def test_run_missing_parameters(self, monkeypatch):
@@ -256,6 +258,11 @@ class TestContextRelevanceEvaluator:
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)
 
+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",

@@ -179,6 +179,7 @@ class TestFaithfulnessEvaluator:
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }
 
     def test_run_no_statements_extracted(self, monkeypatch):
@@ -215,6 +216,7 @@ class TestFaithfulnessEvaluator:
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }
 
     def test_run_missing_parameters(self, monkeypatch):
@@ -282,3 +284,9 @@ class TestFaithfulnessEvaluator:
         assert all(field in result for field in required_fields)
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)
+
+        # assert that metadata is present in the result
+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]

@@ -339,7 +339,7 @@ class TestLLMEvaluator:
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
         results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
-        assert results == {"results": [{"score": 0.5}]}
+        assert results == {"results": [{"score": 0.5}], "meta": None}
 
     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
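
The mocked unit tests above patch OpenAIGenerator.run with a stub that apparently returns only "replies", which is why the expected outputs contain "meta": None. As a sketch, not part of this commit, here is how a stub that also returns a "meta" key would exercise the new propagation path; the function name fake_generator_run and all values are hypothetical.

def fake_generator_run(self, *args, **kwargs):
    # Stub standing in for OpenAIGenerator.run: one JSON reply plus a "meta" list,
    # mirroring the {"replies": [...], "meta": [...]} shape the evaluator reads.
    return {
        "replies": ['{"score": 0.5}'],
        "meta": [{"usage": {"prompt_tokens": 10, "completion_tokens": 3, "total_tokens": 13}}],
    }

# Patched in the same way as above:
#   monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", fake_generator_run)
# component.run(...) would then return
#   {"results": [{"score": 0.5}], "meta": [{"usage": {"prompt_tokens": 10, "completion_tokens": 3, "total_tokens": 13}}]}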