From a249bd32caa4cdca084cb6bc336f6a484b1e44ea Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
Date: Tue, 30 Dec 2025 10:26:01 +0100
Subject: [PATCH] Improve flaky test and actually test reasoning (#10287)

---
 .../generators/chat/test_hugging_face_api.py       | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_api.py b/test/components/generators/chat/test_hugging_face_api.py
index 421a05ac2..7dafc1615 100644
--- a/test/components/generators/chat/test_hugging_face_api.py
+++ b/test/components/generators/chat/test_hugging_face_api.py
@@ -1100,11 +1100,15 @@ class TestHuggingFaceAPIChatGenerator:
         2. When the assistant message (with reasoning) is sent back in a multi-turn conversation,
            the API call succeeds (reasoning is dropped during conversion since HF API doesn't support it)
         """
-        # Note: Using a model that supports reasoning. DeepSeek-R1-Distill models are available
-        # via serverless inference and support reasoning output.
+        # Note: Using a model that supports reasoning AND a provider that actually follows the spec defined in
+        # huggingface-hub. Reasoning content especially seems to be non-standard across providers and is either left
+        # in the main response or put in a new field that is not part of the official API.
+        # One combo that does respect the spec is together + openai/gpt-oss-20b.
+        # together + openai/gpt-oss-20b actually uses the expected reasoning field in the response
         generator = HuggingFaceAPIChatGenerator(
             api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+            # We use together + openai/gpt-oss-20b since it actually returns reasoning content in the expected field
+            api_params={"model": "openai/gpt-oss-20b", "provider": "together"},
             generation_kwargs={"max_tokens": 300},
         )
 
@@ -1116,13 +1120,14 @@ class TestHuggingFaceAPIChatGenerator:
         assert len(response["replies"]) > 0
         first_reply = response["replies"][0]
         assert first_reply.text is not None
+        assert first_reply.reasoning is not None
 
         # Second turn: send a follow-up including the assistant's previous response
         # This tests that convert_message_to_hf_format properly handles messages
         # that may contain ReasoningContent (it should skip it)
         follow_up_messages = [
             ChatMessage.from_user("What is 2 + 2? Answer briefly."),
-            first_reply,  # Include the assistant's response (may contain reasoning)
+            first_reply,  # Include the assistant's response with reasoning
             ChatMessage.from_user("Now what is 3 + 3? Answer briefly."),
         ]
         follow_up_response = generator.run(messages=follow_up_messages)
@@ -1131,6 +1136,7 @@ class TestHuggingFaceAPIChatGenerator:
         assert "replies" in follow_up_response
         assert len(follow_up_response["replies"]) > 0
         assert follow_up_response["replies"][0].text is not None
+        assert follow_up_response["replies"][0].reasoning is not None
 
     def test_hugging_face_api_generator_with_toolset_initialization(self, mock_check_valid_model, tools):
         """Test that the HuggingFaceAPIChatGenerator can be initialized with a Toolset."""