Mirror of https://github.com/deepset-ai/haystack.git
feat: Add completion start time timestamp to relevant generators (#8728)
* OpenAIChatGenerator - add completion_start_time
* HuggingFaceAPIChatGenerator - add completion_start_time
* Add tests
* Add reno note
* Relax condition for cached responses
* Add completion_start_time timestamping to non-chat generators
* Update haystack/components/generators/chat/hugging_face_api.py
* PR feedback

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
parent 62ac27c947
commit 21dd03d3e7
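With this change, streaming runs expose when the first token arrived. A minimal consumer-side sketch of measuring time-to-first-token (TTFT) with the new field — not part of this commit, assuming Haystack 2.x with `OPENAI_API_KEY` set; the timestamp is only recorded on the streaming path:

```python
from datetime import datetime

from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage

# Any streaming callback enables the streaming path; chunks are discarded here.
generator = OpenAIChatGenerator(streaming_callback=lambda chunk: None)

request_sent = datetime.now()
result = generator.run([ChatMessage.from_user("What is the capital of France?")])

# completion_start_time is an ISO-8601 string stamped when the first chunk arrived.
meta = result["replies"][0].meta
ttft = datetime.fromisoformat(meta["completion_start_time"]) - request_sent
print(f"Time to first token: {ttft.total_seconds():.3f}s")
```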
haystack/components/generators/chat/hugging_face_api.py

```diff
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from datetime import datetime
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 from haystack import component, default_from_dict, default_to_dict, logging
@@ -259,6 +260,7 @@ class HuggingFaceAPIChatGenerator:
         )
 
         generated_text = ""
+        first_chunk_time = None
 
         for chunk in api_output:
             # n is unused, so the API always returns only one choice
@@ -276,6 +278,9 @@ class HuggingFaceAPIChatGenerator:
             if finish_reason:
                 meta["finish_reason"] = finish_reason
 
+            if first_chunk_time is None:
+                first_chunk_time = datetime.now().isoformat()
+
             stream_chunk = StreamingChunk(text, meta)
             self.streaming_callback(stream_chunk)  # type: ignore # streaming_callback is not None (verified in the run method)
@@ -285,6 +290,7 @@ class HuggingFaceAPIChatGenerator:
                 "finish_reason": finish_reason,
                 "index": 0,
                 "usage": {"prompt_tokens": 0, "completion_tokens": 0},  # not available in streaming
+                "completion_start_time": first_chunk_time,
             }
         )
```
haystack/components/generators/chat/openai.py

```diff
@@ -4,6 +4,7 @@
 
 import json
 import os
+from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Union
 
 from openai import OpenAI, Stream
@@ -381,6 +382,7 @@ class OpenAIChatGenerator:
                 "model": chunk.model,
                 "index": 0,
                 "finish_reason": chunk.choices[0].finish_reason,
+                "completion_start_time": chunks[0].meta.get("received_at"),  # first chunk received
                 "usage": {},  # we don't have usage data for streaming responses
             }
@@ -444,6 +446,7 @@ class OpenAIChatGenerator:
                 "index": choice.index,
                 "tool_calls": choice.delta.tool_calls,
                 "finish_reason": choice.finish_reason,
+                "received_at": datetime.now().isoformat(),
             }
         )
         return chunk_message
```
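The two hunks above work together: every chunk converted from the OpenAI stream is stamped with `received_at`, and the first chunk's stamp is surfaced as `completion_start_time` on the final message. The same idea as a standalone runnable sketch — hypothetical names, not the Haystack API:

```python
from datetime import datetime
from typing import Any, Dict, Iterable, List


def consume_stream(chunks: Iterable[str]) -> Dict[str, Any]:
    """Hypothetical reducer: stamp each chunk on arrival, surface the first stamp."""
    stamped: List[Dict[str, str]] = []
    for text in chunks:
        # mirrors "received_at" in the diff: stamped as each chunk arrives
        stamped.append({"text": text, "received_at": datetime.now().isoformat()})
    return {
        "reply": "".join(c["text"] for c in stamped),
        # mirrors "completion_start_time": the first chunk's receipt time
        "meta": {"completion_start_time": stamped[0]["received_at"] if stamped else None},
    }


print(consume_stream(["Par", "is"]))
```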
haystack/components/generators/hugging_face_api.py

```diff
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from dataclasses import asdict
+from datetime import datetime
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 from haystack import component, default_from_dict, default_to_dict, logging
@@ -217,18 +218,26 @@ class HuggingFaceAPIGenerator:
         self, hf_output: Iterable["TextGenerationStreamOutput"], streaming_callback: Callable[[StreamingChunk], None]
     ):
         chunks: List[StreamingChunk] = []
+        first_chunk_time = None
+
         for chunk in hf_output:
             token: TextGenerationOutputToken = chunk.token
             if token.special:
                 continue
 
             chunk_metadata = {**asdict(token), **(asdict(chunk.details) if chunk.details else {})}
+            if first_chunk_time is None:
+                first_chunk_time = datetime.now().isoformat()
+
             stream_chunk = StreamingChunk(token.text, chunk_metadata)
             chunks.append(stream_chunk)
             streaming_callback(stream_chunk)
 
         metadata = {
             "finish_reason": chunks[-1].meta.get("finish_reason", None),
             "model": self._client.model,
             "usage": {"completion_tokens": chunks[-1].meta.get("generated_tokens", 0)},
+            "completion_start_time": first_chunk_time,
         }
         return {"replies": ["".join([chunk.content for chunk in chunks])], "meta": [metadata]}
```
haystack/components/generators/openai.py

```diff
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Union
 
 from openai import OpenAI, Stream
@@ -255,7 +256,7 @@ class OpenAIGenerator:
                 "model": completion_chunk.model,
                 "index": 0,
                 "finish_reason": finish_reason,
                 # Usage is available when streaming only if the user explicitly requests it
+                "completion_start_time": streamed_chunks[0].meta.get("received_at"),  # first chunk received
                 "usage": dict(completion_chunk.usage or {}),
             }
         )
@@ -296,12 +297,17 @@ class OpenAIGenerator:
         :returns:
             The StreamingChunk.
         """
         # function or tools calls are not going to happen in non-chat generation
         # as users can not send ChatMessage with function or tools calls
         choice = chunk.choices[0]
         content = choice.delta.content or ""
         chunk_message = StreamingChunk(content)
-        chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason})
+        chunk_message.meta.update(
+            {
+                "model": chunk.model,
+                "index": choice.index,
+                "finish_reason": choice.finish_reason,
+                "received_at": datetime.now().isoformat(),
+            }
+        )
         return chunk_message
 
     @staticmethod
```
releasenotes/notes/… (new file)

```diff
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Added completion_start_time metadata to track time-to-first-token (TTFT) in streaming responses from Hugging Face API and OpenAI (Azure).
```
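For the non-chat generators, metadata comes back as a list parallel to `replies` rather than on a `ChatMessage`. A hedged sketch of reading the new field there — assuming Haystack 2.x, an `HF_API_TOKEN` env var, and that `HFGenerationAPIType` is importable from `haystack.utils`; the model mirrors the integration test below:

```python
from datetime import datetime

from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import HFGenerationAPIType

generator = HuggingFaceAPIGenerator(
    api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
    api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
    streaming_callback=lambda chunk: None,  # streaming enables the timestamp
)

request_sent = datetime.now()
results = generator.run("What is the capital of France?")

# Non-chat generators return meta as a list aligned with replies.
start = datetime.fromisoformat(results["meta"][0]["completion_start_time"])
print(f"TTFT: {(start - request_sent).total_seconds():.3f}s")
```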
test/components/generators/chat/test_hugging_face_api.py

```diff
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
+from datetime import datetime
 import os
 from unittest.mock import MagicMock, Mock, patch
 
@@ -503,9 +504,13 @@ class TestHuggingFaceAPIChatGenerator:
         assert isinstance(response["replies"], list)
         assert len(response["replies"]) > 0
         assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-        assert "usage" in response["replies"][0].meta
-        assert "prompt_tokens" in response["replies"][0].meta["usage"]
-        assert "completion_tokens" in response["replies"][0].meta["usage"]
+
+        response_meta = response["replies"][0].meta
+        assert "completion_start_time" in response_meta
+        assert datetime.fromisoformat(response_meta["completion_start_time"]) <= datetime.now()
+        assert "usage" in response_meta
+        assert "prompt_tokens" in response_meta["usage"]
+        assert "completion_tokens" in response_meta["usage"]
 
     @pytest.mark.integration
     @pytest.mark.skipif(
```
test/components/generators/chat/test_openai.py

```diff
@@ -546,6 +546,10 @@ class TestOpenAIChatGenerator:
         assert callback.counter > 1
         assert "Paris" in callback.responses
 
+        # check that the completion_start_time is set and valid ISO format
+        assert "completion_start_time" in message.meta
+        assert datetime.fromisoformat(message.meta["completion_start_time"]) < datetime.now()
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
```
test/components/generators/test_hugging_face_api.py

```diff
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import os
 from unittest.mock import MagicMock, Mock, patch
+from datetime import datetime
 
 import pytest
 from huggingface_hub import (
@@ -312,3 +313,25 @@ class TestHuggingFaceAPIGenerator:
         assert isinstance(response["meta"], list)
         assert len(response["meta"]) > 0
         assert [isinstance(meta, dict) for meta in response["meta"]]
+
+    @pytest.mark.integration
+    @pytest.mark.skipif(
+        not os.environ.get("HF_API_TOKEN", None),
+        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
+    )
+    def test_live_run_streaming_check_completion_start_time(self):
+        generator = HuggingFaceAPIGenerator(
+            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
+            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
+            streaming_callback=streaming_callback_handler,
+        )
+
+        results = generator.run("What is the capital of France?")
+
+        assert len(results["replies"]) == 1
+        assert "Paris" in results["replies"][0]
+
+        # Verify completion start time in final metadata
+        assert "completion_start_time" in results["meta"][0]
+        completion_start = datetime.fromisoformat(results["meta"][0]["completion_start_time"])
+        assert completion_start <= datetime.now()
```
test/components/generators/test_openai.py

```diff
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
+from datetime import datetime
 import logging
 import os
 from typing import List
@@ -286,6 +287,9 @@ class TestOpenAIGenerator:
         assert "gpt-4o-mini" in metadata["model"]
         assert metadata["finish_reason"] == "stop"
 
+        assert "completion_start_time" in metadata
+        assert datetime.fromisoformat(metadata["completion_start_time"]) <= datetime.now()
+
         # unfortunately, the usage is not available for streaming calls
         # we keep the key in the metadata for compatibility
         assert "usage" in metadata and len(metadata["usage"]) == 0
```