Default usage statistics for streaming responses (#6578)

## Why are these changes needed?

Enables usage statistics for streaming responses by default.

There is a similar bug in the AzureAI client. Theoretically adding the
parameter
```
model_extras={"stream_options": {"include_usage": True}}
```
should fix the problem, but I'm currently unable to test that workflow.
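
For illustration, here is a minimal sketch (not part of this PR's diff) of how the new `include_usage` flag is expected to be used; the model name and API key below are placeholders:

```
from autogen_core.models import CreateResult, UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient


async def stream_with_usage() -> None:
    # Placeholder model/key; any OpenAI-compatible configuration should work.
    client = OpenAIChatCompletionClient(model="gpt-4o", api_key="sk-...")
    async for chunk in client.create_stream(
        messages=[UserMessage(content="Hello", source="user")],
        include_usage=True,  # new flag introduced by this PR
    ):
        if isinstance(chunk, CreateResult):
            # The final item carries a RequestUsage with prompt/completion token counts.
            print(chunk.usage)
        else:
            print(chunk, end="")
```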

## Related issue number

closes https://github.com/microsoft/autogen/issues/6548

## Checks

- [ ] I've included any doc changes needed for
<https://microsoft.github.io/autogen/>. See
<https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to
build and test documentation locally.
- [ ] I've added tests (if relevant) corresponding to the changes
introduced in this PR.
- [ ] I've made sure all auto checks have passed.
peterychang 2025-05-28 14:32:04 -04:00 committed by GitHub
parent 9bbcfa03ac
commit 03394a42c0
2 changed files with 46 additions and 1 deletion


@@ -740,6 +740,7 @@ class BaseOpenAIChatCompletionClient(ChatCompletionClient):
        extra_create_args: Mapping[str, Any] = {},
        cancellation_token: Optional[CancellationToken] = None,
        max_consecutive_empty_chunk_tolerance: int = 0,
        include_usage: Optional[bool] = None,
    ) -> AsyncGenerator[Union[str, CreateResult], None]:
        """Create a stream of string chunks from the model ending with a :class:`~autogen_core.models.CreateResult`.
@@ -748,7 +749,7 @@ class BaseOpenAIChatCompletionClient(ChatCompletionClient):
        In streaming, the default behaviour is not to return token usage counts.
        See: `OpenAI API reference for possible args <https://platform.openai.com/docs/api-reference/chat/create>`_.
        You can set `extra_create_args={"stream_options": {"include_usage": True}}`
        You can set the `include_usage` flag to True or `extra_create_args={"stream_options": {"include_usage": True}}`. If both the flag and `stream_options` are set but to different values, an exception will be raised.
        (if supported by the accessed API) to
        return a final chunk with usage set to a :class:`~autogen_core.models.RequestUsage` object
        with prompt and completion token counts,
@@ -770,6 +771,17 @@ class BaseOpenAIChatCompletionClient(ChatCompletionClient):
            extra_create_args,
        )

        if include_usage is not None:
            if "stream_options" in create_params.create_args:
                stream_options = create_params.create_args["stream_options"]
                if "include_usage" in stream_options and stream_options["include_usage"] != include_usage:
                    raise ValueError(
                        "include_usage and extra_create_args['stream_options']['include_usage'] are both set, but differ in value."
                    )
            else:
                # If stream options are not present, add them.
                create_params.create_args["stream_options"] = {"include_usage": True}

        if max_consecutive_empty_chunk_tolerance != 0:
            warnings.warn(
                "The 'max_consecutive_empty_chunk_tolerance' parameter is deprecated and will be removed in future releases. All empty chunks will be skipped with a warning.",


@@ -276,6 +276,7 @@ async def test_openai_chat_completion_client_create_stream_with_usage(
    monkeypatch.setattr(AsyncCompletions, "create", _mock_create)
    client = OpenAIChatCompletionClient(model="gpt-4o", api_key="api_key")
    chunks: List[str | CreateResult] = []
    # Check that include_usage works when set via create_args
    with caplog.at_level(logging.INFO):
        async for chunk in client.create_stream(
            messages=[UserMessage(content="Hello", source="user")],
@@ -296,6 +297,38 @@ async def test_openai_chat_completion_client_create_stream_with_usage(
    assert chunks[-1].content in caplog.text
    assert chunks[-1].usage == RequestUsage(prompt_tokens=3, completion_tokens=3)

    chunks = []
    # Check that include_usage works when set via include_usage flag
    with caplog.at_level(logging.INFO):
        async for chunk in client.create_stream(
            messages=[UserMessage(content="Hello", source="user")],
            include_usage=True,
        ):
            chunks.append(chunk)

        assert "LLMStreamStart" in caplog.text
        assert "LLMStreamEnd" in caplog.text

    assert chunks[0] == "Hello"
    assert chunks[1] == " Another Hello"
    assert chunks[2] == " Yet Another Hello"
    assert isinstance(chunks[-1], CreateResult)
    assert isinstance(chunks[-1].content, str)
    assert chunks[-1].content == "Hello Another Hello Yet Another Hello"
    assert chunks[-1].content in caplog.text
    assert chunks[-1].usage == RequestUsage(prompt_tokens=3, completion_tokens=3)

    chunks = []
    # Check that setting both flags to different values raises an exception
    with pytest.raises(ValueError):
        async for chunk in client.create_stream(
            messages=[UserMessage(content="Hello", source="user")],
            extra_create_args={"stream_options": {"include_usage": False}},
            include_usage=True,
        ):
            chunks.append(chunk)

@pytest.mark.asyncio
async def test_openai_chat_completion_client_create_stream_no_usage_default(monkeypatch: pytest.MonkeyPatch) -> None: