# SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 import os from datetime import datetime from typing import Any, Dict from unittest.mock import AsyncMock, MagicMock, Mock, patch import pytest from huggingface_hub import ( ChatCompletionInputStreamOptions, ChatCompletionOutput, ChatCompletionOutputComplete, ChatCompletionOutputFunctionDefinition, ChatCompletionOutputMessage, ChatCompletionOutputToolCall, ChatCompletionOutputUsage, ChatCompletionStreamOutput, ChatCompletionStreamOutputChoice, ChatCompletionStreamOutputDelta, ChatCompletionStreamOutputUsage, ) from huggingface_hub.errors import RepositoryNotFoundError from haystack import Pipeline from haystack.components.generators.chat.hugging_face_api import ( HuggingFaceAPIChatGenerator, _convert_chat_completion_stream_output_to_streaming_chunk, _convert_hfapi_tool_calls, _convert_tools_to_hfapi_tools, ) from haystack.dataclasses import ChatMessage, StreamingChunk, ToolCall from haystack.tools import Tool from haystack.tools.toolset import Toolset from haystack.utils.auth import Secret from haystack.utils.hf import HFGenerationAPIType @pytest.fixture def chat_messages(): return [ ChatMessage.from_system("You are a helpful assistant speaking A2 level of English"), ChatMessage.from_user("Tell me about Berlin"), ] def get_weather(city: str) -> Dict[str, Any]: weather_info = { "Berlin": {"weather": "mostly sunny", "temperature": 7, "unit": "celsius"}, "Paris": {"weather": "mostly cloudy", "temperature": 8, "unit": "celsius"}, "Rome": {"weather": "sunny", "temperature": 14, "unit": "celsius"}, } return weather_info.get(city, {"weather": "unknown", "temperature": 0, "unit": "celsius"}) @pytest.fixture def tools(): weather_tool = Tool( name="weather", description="useful to determine the weather in a given location", parameters={"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}, function=get_weather, ) return [weather_tool] @pytest.fixture def mock_check_valid_model(): with patch( "haystack.components.generators.chat.hugging_face_api.check_valid_model", MagicMock(return_value=None) ) as mock: yield mock @pytest.fixture def mock_chat_completion(): # https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.example with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion: completion = ChatCompletionOutput( choices=[ ChatCompletionOutputComplete( finish_reason="eos_token", index=0, message=ChatCompletionOutputMessage(content="The capital of France is Paris.", role="assistant"), ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", usage=ChatCompletionOutputUsage(completion_tokens=8, prompt_tokens=17, total_tokens=25), created=1710498360, ) mock_chat_completion.return_value = completion yield mock_chat_completion @pytest.fixture def mock_chat_completion_async(): with patch("huggingface_hub.AsyncInferenceClient.chat_completion", autospec=True) as mock_chat_completion: completion = ChatCompletionOutput( choices=[ ChatCompletionOutputComplete( finish_reason="eos_token", index=0, message=ChatCompletionOutputMessage(content="The capital of France is Paris.", role="assistant"), ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", usage=ChatCompletionOutputUsage(completion_tokens=8, prompt_tokens=17, total_tokens=25), created=1710498360, ) # Use AsyncMock to properly mock the async method mock_chat_completion.return_value = completion mock_chat_completion.__call__ = AsyncMock(return_value=completion) yield mock_chat_completion # used to test serialization of streaming_callback def streaming_callback_handler(x): return x class TestHuggingFaceAPIChatGenerator: def test_init_invalid_api_type(self): with pytest.raises(ValueError): HuggingFaceAPIChatGenerator(api_type="invalid_api_type", api_params={}) def test_init_serverless(self, mock_check_valid_model): model = "HuggingFaceH4/zephyr-7b-alpha" generation_kwargs = {"temperature": 0.6} stop_words = ["stop"] streaming_callback = None generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model}, token=None, generation_kwargs=generation_kwargs, stop_words=stop_words, streaming_callback=streaming_callback, ) assert generator.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API assert generator.api_params == {"model": model} assert generator.generation_kwargs == {**generation_kwargs, **{"stop": ["stop"]}, **{"max_tokens": 512}} assert generator.streaming_callback == streaming_callback assert generator.tools is None # check that client and async_client are initialized assert generator._client.model == model assert generator._async_client.model == model def test_init_serverless_with_tools(self, mock_check_valid_model, tools): model = "HuggingFaceH4/zephyr-7b-alpha" generation_kwargs = {"temperature": 0.6} stop_words = ["stop"] streaming_callback = None generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model}, token=None, generation_kwargs=generation_kwargs, stop_words=stop_words, streaming_callback=streaming_callback, tools=tools, ) assert generator.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API assert generator.api_params == {"model": model} assert generator.generation_kwargs == {**generation_kwargs, **{"stop": ["stop"]}, **{"max_tokens": 512}} assert generator.streaming_callback == streaming_callback assert generator.tools == tools assert generator._client.model == model assert generator._async_client.model == model def test_init_serverless_invalid_model(self, mock_check_valid_model): mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") with pytest.raises(RepositoryNotFoundError): HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"} ) def test_init_serverless_no_model(self): with pytest.raises(ValueError): HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"} ) def test_init_tgi(self): url = "https://some_model.com" generation_kwargs = {"temperature": 0.6} stop_words = ["stop"] streaming_callback = None generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"url": url}, token=None, generation_kwargs=generation_kwargs, stop_words=stop_words, streaming_callback=streaming_callback, ) assert generator.api_type == HFGenerationAPIType.TEXT_GENERATION_INFERENCE assert generator.api_params == {"url": url} assert generator.generation_kwargs == {**generation_kwargs, **{"stop": ["stop"]}, **{"max_tokens": 512}} assert generator.streaming_callback == streaming_callback assert generator.tools is None assert generator._client.model == url assert generator._async_client.model == url def test_init_tgi_invalid_url(self): with pytest.raises(ValueError): HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"url": "invalid_url"} ) def test_init_tgi_no_url(self): with pytest.raises(ValueError): HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"param": "irrelevant"} ) def test_init_fail_with_duplicate_tool_names(self, mock_check_valid_model, tools): duplicate_tools = [tools[0], tools[0]] with pytest.raises(ValueError): HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=duplicate_tools, ) def test_init_fail_with_tools_and_streaming(self, mock_check_valid_model, tools): with pytest.raises(ValueError): HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=tools, streaming_callback=streaming_callback_handler, ) def test_to_dict(self, mock_check_valid_model): tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], tools=[tool], ) result = generator.to_dict() init_params = result["init_parameters"] assert init_params["api_type"] == "serverless_inference_api" assert init_params["api_params"] == {"model": "HuggingFaceH4/zephyr-7b-beta"} assert init_params["token"] == {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"} assert init_params["generation_kwargs"] == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512} assert init_params["streaming_callback"] is None assert init_params["tools"] == [ { "type": "haystack.tools.tool.Tool", "data": { "description": "description", "function": "builtins.print", "inputs_from_state": None, "name": "name", "outputs_to_state": None, "outputs_to_string": None, "parameters": {"x": {"type": "string"}}, }, } ] def test_from_dict(self, mock_check_valid_model): tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], tools=[tool], ) result = generator.to_dict() # now deserialize, call from_dict generator_2 = HuggingFaceAPIChatGenerator.from_dict(result) assert generator_2.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API assert generator_2.api_params == {"model": "HuggingFaceH4/zephyr-7b-beta"} assert generator_2.token == Secret.from_env_var("ENV_VAR", strict=False) assert generator_2.generation_kwargs == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512} assert generator_2.streaming_callback is None assert generator_2.tools == [tool] def test_serde_in_pipeline(self, mock_check_valid_model): tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], tools=[tool], ) pipeline = Pipeline() pipeline.add_component("generator", generator) pipeline_dict = pipeline.to_dict() assert pipeline_dict == { "metadata": {}, "max_runs_per_component": 100, "connection_type_validation": True, "components": { "generator": { "type": "haystack.components.generators.chat.hugging_face_api.HuggingFaceAPIChatGenerator", "init_parameters": { "api_type": "serverless_inference_api", "api_params": {"model": "HuggingFaceH4/zephyr-7b-beta"}, "token": {"type": "env_var", "env_vars": ["ENV_VAR"], "strict": False}, "generation_kwargs": {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512}, "streaming_callback": None, "tools": [ { "type": "haystack.tools.tool.Tool", "data": { "inputs_from_state": None, "name": "name", "outputs_to_state": None, "outputs_to_string": None, "description": "description", "parameters": {"x": {"type": "string"}}, "function": "builtins.print", }, } ], }, } }, "connections": [], } pipeline_yaml = pipeline.dumps() new_pipeline = Pipeline.loads(pipeline_yaml) assert new_pipeline == pipeline def test_run(self, mock_check_valid_model, mock_chat_completion, chat_messages): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], streaming_callback=None, ) response = generator.run(messages=chat_messages) # check kwargs passed to chat_completion _, kwargs = mock_chat_completion.call_args hf_messages = [ {"role": "system", "content": "You are a helpful assistant speaking A2 level of English"}, {"role": "user", "content": "Tell me about Berlin"}, ] assert kwargs == { "temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512, "tools": None, "messages": hf_messages, } assert isinstance(response, dict) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) == 1 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] def test_run_with_streaming_callback(self, mock_check_valid_model, mock_chat_completion, chat_messages): streaming_call_count = 0 # Define the streaming callback function def streaming_callback_fn(chunk: StreamingChunk): nonlocal streaming_call_count streaming_call_count += 1 assert isinstance(chunk, StreamingChunk) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, streaming_callback=streaming_callback_fn, ) # Create a fake streamed response # self needed here, don't remove def mock_iter(self): yield ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"), index=0, finish_reason=None, ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", created=1710498504, ) yield ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length" ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", created=1710498504, ) mock_response = Mock(**{"__iter__": mock_iter}) mock_chat_completion.return_value = mock_response # Generate text response with streaming callback response = generator.run(chat_messages) # check kwargs passed to text_generation _, kwargs = mock_chat_completion.call_args assert kwargs == { "stop": [], "stream": True, "max_tokens": 512, "stream_options": ChatCompletionInputStreamOptions(include_usage=True), } # Assert that the streaming callback was called twice assert streaming_call_count == 2 # Assert that the response contains the generated replies assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) > 0 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] def test_run_with_streaming_callback_in_run_method( self, mock_check_valid_model, mock_chat_completion, chat_messages ): streaming_call_count = 0 # Define the streaming callback function def streaming_callback_fn(chunk: StreamingChunk): nonlocal streaming_call_count streaming_call_count += 1 assert isinstance(chunk, StreamingChunk) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, ) # Create a fake streamed response # self needed here, don't remove def mock_iter(self): yield ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"), index=0, finish_reason=None, ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", created=1710498504, ) yield ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length" ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", created=1710498504, ) mock_response = Mock(**{"__iter__": mock_iter}) mock_chat_completion.return_value = mock_response # Generate text response with streaming callback response = generator.run(chat_messages, streaming_callback=streaming_callback_fn) # check kwargs passed to text_generation _, kwargs = mock_chat_completion.call_args assert kwargs == { "stop": [], "stream": True, "max_tokens": 512, "stream_options": ChatCompletionInputStreamOptions(include_usage=True), } # Assert that the streaming callback was called twice assert streaming_call_count == 2 # Assert that the response contains the generated replies assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) > 0 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] def test_run_fail_with_tools_and_streaming(self, tools, mock_check_valid_model): component = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, streaming_callback=streaming_callback_handler, ) with pytest.raises(ValueError): message = ChatMessage.from_user("irrelevant") component.run([message], tools=tools) def test_run_with_tools(self, mock_check_valid_model, tools): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-3.1-70B-Instruct"}, tools=tools, ) with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion: completion = ChatCompletionOutput( choices=[ ChatCompletionOutputComplete( finish_reason="stop", index=0, message=ChatCompletionOutputMessage( role="assistant", content=None, tool_calls=[ ChatCompletionOutputToolCall( function=ChatCompletionOutputFunctionDefinition( arguments={"city": "Paris"}, name="weather", description=None ), id="0", type="function", ) ], ), logprobs=None, ) ], created=1729074760, id="", model="meta-llama/Llama-3.1-70B-Instruct", system_fingerprint="2.3.2-dev0-sha-28bb7ae", usage=ChatCompletionOutputUsage(completion_tokens=30, prompt_tokens=426, total_tokens=456), ) mock_chat_completion.return_value = completion messages = [ChatMessage.from_user("What is the weather in Paris?")] response = generator.run(messages=messages) assert isinstance(response, dict) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) == 1 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] assert response["replies"][0].tool_calls[0].tool_name == "weather" assert response["replies"][0].tool_calls[0].arguments == {"city": "Paris"} assert response["replies"][0].tool_calls[0].id == "0" assert response["replies"][0].meta == { "finish_reason": "stop", "index": 0, "model": "meta-llama/Llama-3.1-70B-Instruct", "usage": {"completion_tokens": 30, "prompt_tokens": 426}, } def test_convert_hfapi_tool_calls_empty(self): hfapi_tool_calls = None tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) assert len(tool_calls) == 0 hfapi_tool_calls = [] tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) assert len(tool_calls) == 0 def test_convert_hfapi_tool_calls_dict_arguments(self): hfapi_tool_calls = [ ChatCompletionOutputToolCall( function=ChatCompletionOutputFunctionDefinition( arguments={"city": "Paris"}, name="weather", description=None ), id="0", type="function", ) ] tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) assert len(tool_calls) == 1 assert tool_calls[0].tool_name == "weather" assert tool_calls[0].arguments == {"city": "Paris"} assert tool_calls[0].id == "0" def test_convert_hfapi_tool_calls_str_arguments(self): hfapi_tool_calls = [ ChatCompletionOutputToolCall( function=ChatCompletionOutputFunctionDefinition( arguments='{"city": "Paris"}', name="weather", description=None ), id="0", type="function", ) ] tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) assert len(tool_calls) == 1 assert tool_calls[0].tool_name == "weather" assert tool_calls[0].arguments == {"city": "Paris"} assert tool_calls[0].id == "0" def test_convert_hfapi_tool_calls_invalid_str_arguments(self): hfapi_tool_calls = [ ChatCompletionOutputToolCall( function=ChatCompletionOutputFunctionDefinition( arguments="not a valid JSON string", name="weather", description=None ), id="0", type="function", ) ] tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) assert len(tool_calls) == 0 def test_convert_hfapi_tool_calls_invalid_type_arguments(self): hfapi_tool_calls = [ ChatCompletionOutputToolCall( function=ChatCompletionOutputFunctionDefinition( arguments=["this", "is", "a", "list"], name="weather", description=None ), id="0", type="function", ) ] tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) assert len(tool_calls) == 0 @pytest.mark.parametrize( "hf_stream_output, expected_stream_chunk, dummy_previous_chunks", [ ( ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(role="assistant", content=" Paris"), index=0 ) ], created=1748339326, id="", model="microsoft/Phi-3.5-mini-instruct", system_fingerprint="3.2.1-sha-4d28897", ), StreamingChunk( content=" Paris", meta={ "received_at": "2025-05-27T12:14:28.228852", "model": "microsoft/Phi-3.5-mini-instruct", "finish_reason": None, }, index=0, start=True, ), [], ), ( ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(role="assistant", content=""), index=0, finish_reason="stop", ) ], created=1748339326, id="", model="microsoft/Phi-3.5-mini-instruct", system_fingerprint="3.2.1-sha-4d28897", ), StreamingChunk( content="", meta={ "received_at": "2025-05-27T12:14:28.228852", "model": "microsoft/Phi-3.5-mini-instruct", "finish_reason": "stop", }, finish_reason="stop", ), [0], ), ( ChatCompletionStreamOutput( choices=[], created=1748339326, id="", model="microsoft/Phi-3.5-mini-instruct", system_fingerprint="3.2.1-sha-4d28897", usage=ChatCompletionStreamOutputUsage(completion_tokens=2, prompt_tokens=21, total_tokens=23), ), StreamingChunk( content="", meta={ "received_at": "2025-05-27T12:14:28.228852", "model": "microsoft/Phi-3.5-mini-instruct", "usage": {"completion_tokens": 2, "prompt_tokens": 21}, }, ), [0, 1], ), ], ) def test_convert_chat_completion_stream_output_to_streaming_chunk( self, hf_stream_output, expected_stream_chunk, dummy_previous_chunks ): converted_stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk( chunk=hf_stream_output, previous_chunks=dummy_previous_chunks ) # Remove timestamp from comparison since it's always the current time converted_stream_chunk.meta.pop("received_at", None) expected_stream_chunk.meta.pop("received_at", None) assert converted_stream_chunk == expected_stream_chunk @pytest.mark.integration @pytest.mark.slow @pytest.mark.skipif( not os.environ.get("HF_API_TOKEN", None), reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", ) @pytest.mark.flaky(reruns=2, reruns_delay=10) def test_live_run_serverless(self): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "microsoft/Phi-3.5-mini-instruct", "provider": "featherless-ai"}, generation_kwargs={"max_tokens": 20}, ) # No need for instruction tokens here since we use the chat_completion endpoint which handles the chat # templating for us. messages = [ ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.") ] response = generator.run(messages=messages) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) > 0 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] assert response["replies"][0].text is not None meta = response["replies"][0].meta assert "usage" in meta assert "prompt_tokens" in meta["usage"] assert meta["usage"]["prompt_tokens"] > 0 assert "completion_tokens" in meta["usage"] assert meta["usage"]["completion_tokens"] > 0 assert meta["model"] == "microsoft/Phi-3.5-mini-instruct" assert meta["finish_reason"] is not None @pytest.mark.integration @pytest.mark.slow @pytest.mark.skipif( not os.environ.get("HF_API_TOKEN", None), reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", ) @pytest.mark.flaky(reruns=2, reruns_delay=10) def test_live_run_serverless_streaming(self): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "microsoft/Phi-3.5-mini-instruct", "provider": "featherless-ai"}, generation_kwargs={"max_tokens": 20}, streaming_callback=streaming_callback_handler, ) # No need for instruction tokens here since we use the chat_completion endpoint which handles the chat # templating for us. messages = [ ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.") ] response = generator.run(messages=messages) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) > 0 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] assert response["replies"][0].text is not None response_meta = response["replies"][0].meta assert "completion_start_time" in response_meta assert datetime.fromisoformat(response_meta["completion_start_time"]) <= datetime.now() assert "usage" in response_meta assert "prompt_tokens" in response_meta["usage"] assert response_meta["usage"]["prompt_tokens"] > 0 assert "completion_tokens" in response_meta["usage"] assert response_meta["usage"]["completion_tokens"] > 0 assert response_meta["model"] == "microsoft/Phi-3.5-mini-instruct" assert response_meta["finish_reason"] is not None @pytest.mark.integration @pytest.mark.slow @pytest.mark.skipif( not os.environ.get("HF_API_TOKEN", None), reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", ) def test_live_run_with_tools(self, tools): """ We test the round trip: generate tool call, pass tool message, generate response. The model used here (Qwen/Qwen2.5-72B-Instruct) is not gated and kept in a warm state. """ chat_messages = [ChatMessage.from_user("What's the weather like in Paris?")] generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "Qwen/Qwen2.5-72B-Instruct", "provider": "together"}, generation_kwargs={"temperature": 0.5}, ) results = generator.run(chat_messages, tools=tools) assert len(results["replies"]) == 1 message = results["replies"][0] assert message.tool_calls tool_call = message.tool_call assert isinstance(tool_call, ToolCall) assert tool_call.tool_name == "weather" assert "city" in tool_call.arguments assert "Paris" in tool_call.arguments["city"] assert message.meta["finish_reason"] == "tool_calls" new_messages = chat_messages + [message, ChatMessage.from_tool(tool_result="22° C", origin=tool_call)] # the model tends to make tool calls if provided with tools, so we don't pass them here results = generator.run(new_messages, generation_kwargs={"max_tokens": 50}) assert len(results["replies"]) == 1 final_message = results["replies"][0] assert not final_message.tool_calls assert len(final_message.text) > 0 assert "paris" in final_message.text.lower() and "22" in final_message.text @pytest.mark.asyncio async def test_run_async(self, mock_check_valid_model, mock_chat_completion_async, chat_messages): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], streaming_callback=None, ) response = await generator.run_async(messages=chat_messages) # check kwargs passed to chat_completion _, kwargs = mock_chat_completion_async.call_args hf_messages = [ {"role": "system", "content": "You are a helpful assistant speaking A2 level of English"}, {"role": "user", "content": "Tell me about Berlin"}, ] assert kwargs == { "temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512, "tools": None, "messages": hf_messages, } assert isinstance(response, dict) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) == 1 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] @pytest.mark.asyncio async def test_run_async_with_streaming(self, mock_check_valid_model, mock_chat_completion_async, chat_messages): streaming_call_count = 0 async def streaming_callback_fn(chunk: StreamingChunk): nonlocal streaming_call_count streaming_call_count += 1 assert isinstance(chunk, StreamingChunk) # Create a fake streamed response async def mock_aiter(self): yield ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"), index=0, finish_reason=None, ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", created=1710498504, ) yield ChatCompletionStreamOutput( choices=[ ChatCompletionStreamOutputChoice( delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length" ) ], id="some_id", model="some_model", system_fingerprint="some_fingerprint", created=1710498504, ) mock_response = Mock(**{"__aiter__": mock_aiter}) mock_chat_completion_async.return_value = mock_response generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, streaming_callback=streaming_callback_fn, ) response = await generator.run_async(messages=chat_messages) # check kwargs passed to chat_completion _, kwargs = mock_chat_completion_async.call_args assert kwargs == { "stop": [], "stream": True, "max_tokens": 512, "stream_options": ChatCompletionInputStreamOptions(include_usage=True), } # Assert that the streaming callback was called twice assert streaming_call_count == 2 # Assert that the response contains the generated replies assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) > 0 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] @pytest.mark.asyncio async def test_run_async_with_tools(self, tools, mock_check_valid_model): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "meta-llama/Llama-3.1-70B-Instruct"}, tools=tools, ) with patch("huggingface_hub.AsyncInferenceClient.chat_completion", autospec=True) as mock_chat_completion_async: completion = ChatCompletionOutput( choices=[ ChatCompletionOutputComplete( finish_reason="stop", index=0, message=ChatCompletionOutputMessage( role="assistant", content=None, tool_calls=[ ChatCompletionOutputToolCall( function=ChatCompletionOutputFunctionDefinition( arguments={"city": "Paris"}, name="weather", description=None ), id="0", type="function", ) ], ), logprobs=None, ) ], created=1729074760, id="", model="meta-llama/Llama-3.1-70B-Instruct", system_fingerprint="2.3.2-dev0-sha-28bb7ae", usage=ChatCompletionOutputUsage(completion_tokens=30, prompt_tokens=426, total_tokens=456), ) mock_chat_completion_async.return_value = completion messages = [ChatMessage.from_user("What is the weather in Paris?")] response = await generator.run_async(messages=messages) assert isinstance(response, dict) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) == 1 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] assert response["replies"][0].tool_calls[0].tool_name == "weather" assert response["replies"][0].tool_calls[0].arguments == {"city": "Paris"} assert response["replies"][0].tool_calls[0].id == "0" assert response["replies"][0].meta == { "finish_reason": "stop", "index": 0, "model": "meta-llama/Llama-3.1-70B-Instruct", "usage": {"completion_tokens": 30, "prompt_tokens": 426}, } @pytest.mark.integration @pytest.mark.slow @pytest.mark.skipif( not os.environ.get("HF_API_TOKEN", None), reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", ) @pytest.mark.flaky(reruns=2, reruns_delay=10) @pytest.mark.asyncio async def test_live_run_async_serverless(self): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "microsoft/Phi-3.5-mini-instruct", "provider": "featherless-ai"}, generation_kwargs={"max_tokens": 20}, ) messages = [ ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.") ] try: response = await generator.run_async(messages=messages) assert "replies" in response assert isinstance(response["replies"], list) assert len(response["replies"]) > 0 assert [isinstance(reply, ChatMessage) for reply in response["replies"]] assert response["replies"][0].text is not None meta = response["replies"][0].meta assert "usage" in meta assert "prompt_tokens" in meta["usage"] assert meta["usage"]["prompt_tokens"] > 0 assert "completion_tokens" in meta["usage"] assert meta["usage"]["completion_tokens"] > 0 assert meta["model"] == "microsoft/Phi-3.5-mini-instruct" assert meta["finish_reason"] is not None finally: await generator._async_client.close() def test_hugging_face_api_generator_with_toolset_initialization(self, mock_check_valid_model, tools): """Test that the HuggingFaceAPIChatGenerator can be initialized with a Toolset.""" toolset = Toolset(tools) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset ) assert generator.tools == toolset def test_from_dict_with_toolset(self, mock_check_valid_model, tools): """Test that the HuggingFaceAPIChatGenerator can be deserialized from a dictionary with a Toolset.""" toolset = Toolset(tools) component = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset ) data = component.to_dict() deserialized_component = HuggingFaceAPIChatGenerator.from_dict(data) assert isinstance(deserialized_component.tools, Toolset) assert len(deserialized_component.tools) == len(tools) assert all(isinstance(tool, Tool) for tool in deserialized_component.tools) def test_to_dict_with_toolset(self, mock_check_valid_model, tools): """Test that the HuggingFaceAPIChatGenerator can be serialized to a dictionary with a Toolset.""" toolset = Toolset(tools[:1]) generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset ) data = generator.to_dict() expected_tools_data = { "type": "haystack.tools.toolset.Toolset", "data": { "tools": [ { "type": "haystack.tools.tool.Tool", "data": { "name": "weather", "description": "useful to determine the weather in a given location", "parameters": { "type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"], }, "function": "generators.chat.test_hugging_face_api.get_weather", "outputs_to_string": None, "inputs_from_state": None, "outputs_to_state": None, }, } ] }, } assert data["init_parameters"]["tools"] == expected_tools_data def test_convert_tools_to_hfapi_tools(self): assert _convert_tools_to_hfapi_tools(None) is None assert _convert_tools_to_hfapi_tools([]) is None tool = Tool( name="weather", description="useful to determine the weather in a given location", parameters={"city": {"type": "string"}}, function=get_weather, ) hf_tools = _convert_tools_to_hfapi_tools([tool]) assert len(hf_tools) == 1 assert hf_tools[0].type == "function" assert hf_tools[0].function.name == "weather" assert hf_tools[0].function.description == "useful to determine the weather in a given location" assert hf_tools[0].function.parameters == {"city": {"type": "string"}} def test_convert_tools_to_hfapi_tools_legacy(self): # this satisfies the check hasattr(ChatCompletionInputFunctionDefinition, "arguments") mock_class = MagicMock() with patch( "haystack.components.generators.chat.hugging_face_api.ChatCompletionInputFunctionDefinition", mock_class ): tool = Tool( name="weather", description="useful to determine the weather in a given location", parameters={"city": {"type": "string"}}, function=get_weather, ) _convert_tools_to_hfapi_tools([tool]) mock_class.assert_called_once_with( name="weather", arguments={"city": {"type": "string"}}, description="useful to determine the weather in a given location", )