Improve flaky test and actually test reasoning (#10287)

2026-02-11 01:46:14 +00:00 · 2025-12-30 10:26:01 +01:00 · 2025-12-30 10:26:01 +01:00 · a249bd32ca
commit a249bd32ca
parent e86f016141
1 changed files with 10 additions and 4 deletions
--- a/test/components/generators/chat/test_hugging_face_api.py
+++ b/test/components/generators/chat/test_hugging_face_api.py
@@ -1100,11 +1100,15 @@ class TestHuggingFaceAPIChatGenerator:
        2. When the assistant message (with reasoning) is sent back in a multi-turn conversation,
           the API call succeeds (reasoning is dropped during conversion since HF API doesn't support it)
        """
-        # Note: Using a model that supports reasoning. DeepSeek-R1-Distill models are available
-        # via serverless inference and support reasoning output.
+        # Note: Using a model that supports reasoning AND a provider that actually follows the spec defined in
+        # huggingface-hub. Reasoning content especially seems to be non-standard across providers and is either left
+        # in the main response or put in a new field that is not part of the official API.
+        # One combo that does respect the spec is together + openai/gpt-oss-20b: it returns the reasoning in the
+        # expected reasoning field of the response.
        generator = HuggingFaceAPIChatGenerator(
            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+            # We use together + openai/gpt-oss-20b since it actually returns reasoning content in the expected field
+            api_params={"model": "openai/gpt-oss-20b", "provider": "together"},
            generation_kwargs={"max_tokens": 300},
        )

@@ -1116,13 +1120,14 @@ class TestHuggingFaceAPIChatGenerator:
        assert len(response["replies"]) > 0
        first_reply = response["replies"][0]
        assert first_reply.text is not None
+        assert first_reply.reasoning is not None

        # Second turn: send a follow-up including the assistant's previous response
        # This tests that convert_message_to_hf_format properly handles messages
        # that may contain ReasoningContent (it should skip it)
        follow_up_messages = [
            ChatMessage.from_user("What is 2 + 2? Answer briefly."),
-            first_reply,  # Include the assistant's response (may contain reasoning)
+            first_reply,  # Include the assistant's response with reasoning
            ChatMessage.from_user("Now what is 3 + 3? Answer briefly."),
        ]
        follow_up_response = generator.run(messages=follow_up_messages)
@@ -1131,6 +1136,7 @@ class TestHuggingFaceAPIChatGenerator:
        assert "replies" in follow_up_response
        assert len(follow_up_response["replies"]) > 0
        assert follow_up_response["replies"][0].text is not None
+        assert follow_up_response["replies"][0].reasoning is not None

    def test_hugging_face_api_generator_with_toolset_initialization(self, mock_check_valid_model, tools):
        """Test that the HuggingFaceAPIChatGenerator can be initialized with a Toolset."""