From a249bd32caa4cdca084cb6bc336f6a484b1e44ea Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> Date: Tue, 30 Dec 2025 10:26:01 +0100 Subject: [PATCH] Improve flaky test and actually test reasoning (#10287) --- .../generators/chat/test_hugging_face_api.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/components/generators/chat/test_hugging_face_api.py b/test/components/generators/chat/test_hugging_face_api.py index 421a05ac2..7dafc1615 100644 --- a/test/components/generators/chat/test_hugging_face_api.py +++ b/test/components/generators/chat/test_hugging_face_api.py @@ -1100,11 +1100,15 @@ class TestHuggingFaceAPIChatGenerator: 2. When the assistant message (with reasoning) is sent back in a multi-turn conversation, the API call succeeds (reasoning is dropped during conversion since HF API doesn't support it) """ - # Note: Using a model that supports reasoning. DeepSeek-R1-Distill models are available - # via serverless inference and support reasoning output. + # Note: Using a model that supports reasoning AND a provider that actually follows the spec defined in + # huggingface-hub. Reasoning content especially seems to be non-standard across providers and is either left + # in the main response or put in a new field that is not part of the official API. + # One combo that does respect the spec is together + openai/gpt-oss-20b. + # together + openai/gpt-oss-20b actually uses the expected reasoning field in the response generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"}, + # We use together + openai/gpt-oss-20b since it actually returns reasoning content in the expected field + api_params={"model": "openai/gpt-oss-20b", "provider": "together"}, generation_kwargs={"max_tokens": 300}, ) @@ -1116,13 +1120,14 @@ class TestHuggingFaceAPIChatGenerator: assert len(response["replies"]) > 0 first_reply = response["replies"][0] assert first_reply.text is not None + assert first_reply.reasoning is not None # Second turn: send a follow-up including the assistant's previous response # This tests that convert_message_to_hf_format properly handles messages # that may contain ReasoningContent (it should skip it) follow_up_messages = [ ChatMessage.from_user("What is 2 + 2? Answer briefly."), - first_reply, # Include the assistant's response (may contain reasoning) + first_reply, # Include the assistant's response with reasoning ChatMessage.from_user("Now what is 3 + 3? Answer briefly."), ] follow_up_response = generator.run(messages=follow_up_messages) @@ -1131,6 +1136,7 @@ class TestHuggingFaceAPIChatGenerator: assert "replies" in follow_up_response assert len(follow_up_response["replies"]) > 0 assert follow_up_response["replies"][0].text is not None + assert follow_up_response["replies"][0].reasoning is not None def test_hugging_face_api_generator_with_toolset_initialization(self, mock_check_valid_model, tools): """Test that the HuggingFaceAPIChatGenerator can be initialized with a Toolset."""