Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-08 04:47:06 +00:00)
feat: LLM-based evaluators return meta info from OpenAI (#7947)
* LLM-Evaluator returns metadata from OpenAI
* adding tests
* adding release notes
* updating test
* updating release notes
* fixing live tests
* attending PR comments
* fixing tests
* Update releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml
  Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update llm_evaluator.py
---------
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Parent: 3068ea258b
Commit: 186512459d
llm_evaluator.py:

@@ -171,10 +171,12 @@ class LLMEvaluator:
         :param inputs:
             The input values to evaluate. The keys are the input names and the values are lists of input values.
         :returns:
-            A dictionary with a single `results` entry that contains a list of results.
+            A dictionary with a `results` entry that contains a list of results.
             Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
             and the evaluation results as the values. If an exception occurs for a particular input value, the result
             will be `None` for that entry.
+            If the API is "openai" and the response contains a "meta" key, the metadata from OpenAI will be included
+            in the output dictionary, under the key "meta".
         :raises ValueError:
             Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have
             different lengths, or if the output is not a valid JSON or doesn't contain the expected keys.
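The updated docstring describes the new "meta" entry alongside "results". A minimal, hedged sketch of how a caller might consume it, assuming the LLMEvaluator constructor arguments shown in this version of Haystack (instructions, inputs, outputs, examples) and an OPENAI_API_KEY in the environment; the instructions and example data below are illustrative only:

from typing import List
from haystack.components.evaluators import LLMEvaluator

# Illustrative evaluator: flags answers that are unsafe for children.
evaluator = LLMEvaluator(
    instructions="Is this answer problematic for children?",
    inputs=[("predicted_answers", List[str])],
    outputs=["score"],
    examples=[
        {"inputs": {"predicted_answers": "This answer contains offensive language."}, "outputs": {"score": 1}},
        {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
    ],
)

result = evaluator.run(predicted_answers=["Football is the most popular sport."])
print(result["results"])  # e.g. [{"score": 0}]
print(result["meta"])     # OpenAI metadata, or None if none was captured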
@@ -187,6 +189,7 @@ class LLMEvaluator:
         list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]

         results: List[Optional[Dict[str, Any]]] = []
+        metadata = None
         errors = 0
         for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
             prompt = self.builder.run(**input_names_to_values)
@@ -208,11 +211,14 @@ class LLMEvaluator:
                 results.append(None)
                 errors += 1

+            if self.api == "openai" and "meta" in result:
+                metadata = result["meta"]
+
         if errors > 0:
             msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs."
             warn(msg)

-        return {"results": results}
+        return {"results": results, "meta": metadata}

     def prepare_template(self) -> str:
         """
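Because `metadata` is reassigned inside the loop, the returned "meta" entry holds the metadata of the last OpenAI response in the batch (or None if the generator never returned one). A rough sketch of the resulting output shape; all field values below are illustrative, not real output:

# Approximate shape of LLMEvaluator.run() output after this change, when api == "openai".
# The exact keys under "meta" come from the OpenAIGenerator response and may vary.
example_output = {
    "results": [
        {"score": 1, "statement_scores": [1, 1], "statements": ["a", "b"]},
    ],
    "meta": [
        {
            "model": "gpt-4o-mini",  # assumed model name, for illustration only
            "usage": {"prompt_tokens": 120, "completion_tokens": 32, "total_tokens": 152},
        }
    ],
}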
releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml (new file):

@@ -0,0 +1,5 @@
+---
+
+enhancements:
+  - |
+    When using "openai" for the LLM-based evaluators the metadata from OpenAI will be in the output dictionary, under the key "meta".
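As a hedged illustration of the behaviour this note describes, using the ContextRelevanceEvaluator exercised in the tests below (assumes an OPENAI_API_KEY in the environment; the question and context are made up):

from haystack.components.evaluators import ContextRelevanceEvaluator

evaluator = ContextRelevanceEvaluator()
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[["Python was created by Guido van Rossum."]],
)

# The existing score fields are unchanged; the OpenAI metadata is now exposed as well.
usage = result["meta"][0]["usage"]
print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])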
@@ -160,6 +160,7 @@ class TestContextRelevanceEvaluator:
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }

     def test_run_no_statements_extracted(self, monkeypatch):
@@ -192,6 +193,7 @@ class TestContextRelevanceEvaluator:
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }

     def test_run_missing_parameters(self, monkeypatch):
@@ -256,6 +258,11 @@ class TestContextRelevanceEvaluator:
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)

+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
@@ -179,6 +179,7 @@ class TestFaithfulnessEvaluator:
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }

     def test_run_no_statements_extracted(self, monkeypatch):
@@ -215,6 +216,7 @@ class TestFaithfulnessEvaluator:
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }

     def test_run_missing_parameters(self, monkeypatch):
@@ -282,3 +284,9 @@ class TestFaithfulnessEvaluator:
         assert all(field in result for field in required_fields)
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)
+
+        # assert that metadata is present in the result
+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
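The faithfulness live test above checks the same usage fields. A hedged sketch of reading them outside a test, assuming FaithfulnessEvaluator.run(questions=..., contexts=..., predicted_answers=...) and a configured OPENAI_API_KEY; since "meta" defaults to None, callers should guard before indexing into it:

from haystack.components.evaluators import FaithfulnessEvaluator

evaluator = FaithfulnessEvaluator()
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[["Python was created by Guido van Rossum."]],
    predicted_answers=["Guido van Rossum."],
)

# "meta" is None when no OpenAI metadata was captured, so guard before reading it.
if result["meta"] is not None:
    usage = result["meta"][0]["usage"]
    print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])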
@@ -339,7 +339,7 @@ class TestLLMEvaluator:
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

         results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
-        assert results == {"results": [{"score": 0.5}]}
+        assert results == {"results": [{"score": 0.5}], "meta": None}

     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")