Default usage statistics for streaming responses (#6578)

## Why are these changes needed?

Enables usage statistics for streaming responses by default.

There is a similar bug in the AzureAI client. Theoretically adding the
parameter
```
model_extras={"stream_options": {"include_usage": True}}
```
should fix the problem, but I'm currently unable to test that workflow.
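
For illustration, here is a minimal sketch (not part of this PR's diff) of how the new `include_usage` flag is expected to be used; the model name and API key below are placeholders:

```
from autogen_core.models import CreateResult, UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient


async def stream_with_usage() -> None:
    # Placeholder model/key; any OpenAI-compatible configuration should work.
    client = OpenAIChatCompletionClient(model="gpt-4o", api_key="sk-...")
    async for chunk in client.create_stream(
        messages=[UserMessage(content="Hello", source="user")],
        include_usage=True,  # new flag introduced by this PR
    ):
        if isinstance(chunk, CreateResult):
            # The final item carries a RequestUsage with prompt/completion token counts.
            print(chunk.usage)
        else:
            print(chunk, end="")
```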

## Related issue number

closes https://github.com/microsoft/autogen/issues/6548

## Checks

- [ ] I've included any doc changes needed for
<https://microsoft.github.io/autogen/>. See
<https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to
build and test documentation locally.
- [ ] I've added tests (if relevant) corresponding to the changes
introduced in this PR.
- [ ] I've made sure all auto checks have passed.
peterychang 2025-05-28 14:32:04 -04:00 committed by GitHub
parent 9bbcfa03ac
commit 03394a42c0
2 changed files with 46 additions and 1 deletion


@@ -740,6 +740,7 @@ class BaseOpenAIChatCompletionClient(ChatCompletionClient):
        extra_create_args: Mapping[str, Any] = {},
        cancellation_token: Optional[CancellationToken] = None,
        max_consecutive_empty_chunk_tolerance: int = 0,
        include_usage: Optional[bool] = None,
    ) -> AsyncGenerator[Union[str, CreateResult], None]:
        """Create a stream of string chunks from the model ending with a :class:`~autogen_core.models.CreateResult`.
@@ -748,7 +749,7 @@ class BaseOpenAIChatCompletionClient(ChatCompletionClient):
        In streaming, the default behaviour is not to return token usage counts.
        See: `OpenAI API reference for possible args <https://platform.openai.com/docs/api-reference/chat/create>`_.
        You can set `extra_create_args={"stream_options": {"include_usage": True}}`
        You can set the `include_usage` flag to True or `extra_create_args={"stream_options": {"include_usage": True}}`. If both the flag and `stream_options` are set but to different values, an exception will be raised.
        (if supported by the accessed API) to
        return a final chunk with usage set to a :class:`~autogen_core.models.RequestUsage` object
        with prompt and completion token counts,
@@ -770,6 +771,17 @@ class BaseOpenAIChatCompletionClient(ChatCompletionClient):
            extra_create_args,
        )

        if include_usage is not None:
            if "stream_options" in create_params.create_args:
                stream_options = create_params.create_args["stream_options"]
                if "include_usage" in stream_options and stream_options["include_usage"] != include_usage:
                    raise ValueError(
                        "include_usage and extra_create_args['stream_options']['include_usage'] are both set, but differ in value."
                    )
            else:
                # If stream options are not present, add them.
                create_params.create_args["stream_options"] = {"include_usage": True}

        if max_consecutive_empty_chunk_tolerance != 0:
            warnings.warn(
                "The 'max_consecutive_empty_chunk_tolerance' parameter is deprecated and will be removed in future releases. All empty chunks will be skipped with a warning.",


@@ -276,6 +276,7 @@ async def test_openai_chat_completion_client_create_stream_with_usage(
    monkeypatch.setattr(AsyncCompletions, "create", _mock_create)
    client = OpenAIChatCompletionClient(model="gpt-4o", api_key="api_key")
    chunks: List[str | CreateResult] = []
    # Check that include_usage works when set via create_args
    with caplog.at_level(logging.INFO):
        async for chunk in client.create_stream(
            messages=[UserMessage(content="Hello", source="user")],
@@ -296,6 +297,38 @@ async def test_openai_chat_completion_client_create_stream_with_usage(
    assert chunks[-1].content in caplog.text
    assert chunks[-1].usage == RequestUsage(prompt_tokens=3, completion_tokens=3)

    chunks = []
    # Check that include_usage works when set via include_usage flag
    with caplog.at_level(logging.INFO):
        async for chunk in client.create_stream(
            messages=[UserMessage(content="Hello", source="user")],
            include_usage=True,
        ):
            chunks.append(chunk)

        assert "LLMStreamStart" in caplog.text
        assert "LLMStreamEnd" in caplog.text

    assert chunks[0] == "Hello"
    assert chunks[1] == " Another Hello"
    assert chunks[2] == " Yet Another Hello"
    assert isinstance(chunks[-1], CreateResult)
    assert isinstance(chunks[-1].content, str)
    assert chunks[-1].content == "Hello Another Hello Yet Another Hello"
    assert chunks[-1].content in caplog.text
    assert chunks[-1].usage == RequestUsage(prompt_tokens=3, completion_tokens=3)

    chunks = []
    # Check that setting both flags to different values raises an exception
    with pytest.raises(ValueError):
        async for chunk in client.create_stream(
            messages=[UserMessage(content="Hello", source="user")],
            extra_create_args={"stream_options": {"include_usage": False}},
            include_usage=True,
        ):
            chunks.append(chunk)

@pytest.mark.asyncio
async def test_openai_chat_completion_client_create_stream_no_usage_default(monkeypatch: pytest.MonkeyPatch) -> None: