feat: Add completion start time timestamp to relevant generators (#8728)

* OpenAIChatGenerator - add completion_start_time

* HuggingFaceAPIChatGenerator - add completion_start_time

* Add tests

* Add reno note

* Relax condition for cached responses

* Add completion_start_time timestamping to non-chat generators

* Update haystack/components/generators/chat/hugging_face_api.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* PR feedback

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
This commit is contained in:
Vladimir Blagojevic 2025-01-17 09:58:45 +01:00 committed by GitHub
parent 62ac27c947
commit 21dd03d3e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 71 additions and 7 deletions

View File

@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0
from datetime import datetime
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
from haystack import component, default_from_dict, default_to_dict, logging
@ -259,6 +260,7 @@ class HuggingFaceAPIChatGenerator:
)
generated_text = ""
first_chunk_time = None
for chunk in api_output:
# n is unused, so the API always returns only one choice
@ -276,6 +278,9 @@ class HuggingFaceAPIChatGenerator:
if finish_reason:
meta["finish_reason"] = finish_reason
if first_chunk_time is None:
first_chunk_time = datetime.now().isoformat()
stream_chunk = StreamingChunk(text, meta)
self.streaming_callback(stream_chunk) # type: ignore # streaming_callback is not None (verified in the run method)
@ -285,6 +290,7 @@ class HuggingFaceAPIChatGenerator:
"finish_reason": finish_reason,
"index": 0,
"usage": {"prompt_tokens": 0, "completion_tokens": 0}, # not available in streaming
"completion_start_time": first_chunk_time,
}
)

View File

@ -4,6 +4,7 @@
import json
import os
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union
from openai import OpenAI, Stream
@ -381,6 +382,7 @@ class OpenAIChatGenerator:
"model": chunk.model,
"index": 0,
"finish_reason": chunk.choices[0].finish_reason,
"completion_start_time": chunks[0].meta.get("received_at"), # first chunk received
"usage": {}, # we don't have usage data for streaming responses
}
@ -444,6 +446,7 @@ class OpenAIChatGenerator:
"index": choice.index,
"tool_calls": choice.delta.tool_calls,
"finish_reason": choice.finish_reason,
"received_at": datetime.now().isoformat(),
}
)
return chunk_message

View File

@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import asdict
from datetime import datetime
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
from haystack import component, default_from_dict, default_to_dict, logging
@ -217,18 +218,26 @@ class HuggingFaceAPIGenerator:
self, hf_output: Iterable["TextGenerationStreamOutput"], streaming_callback: Callable[[StreamingChunk], None]
):
chunks: List[StreamingChunk] = []
first_chunk_time = None
for chunk in hf_output:
token: TextGenerationOutputToken = chunk.token
if token.special:
continue
chunk_metadata = {**asdict(token), **(asdict(chunk.details) if chunk.details else {})}
if first_chunk_time is None:
first_chunk_time = datetime.now().isoformat()
stream_chunk = StreamingChunk(token.text, chunk_metadata)
chunks.append(stream_chunk)
streaming_callback(stream_chunk)
metadata = {
"finish_reason": chunks[-1].meta.get("finish_reason", None),
"model": self._client.model,
"usage": {"completion_tokens": chunks[-1].meta.get("generated_tokens", 0)},
"completion_start_time": first_chunk_time,
}
return {"replies": ["".join([chunk.content for chunk in chunks])], "meta": [metadata]}

View File

@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
import os
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union
from openai import OpenAI, Stream
@ -255,7 +256,7 @@ class OpenAIGenerator:
"model": completion_chunk.model,
"index": 0,
"finish_reason": finish_reason,
# Usage is available when streaming only if the user explicitly requests it
"completion_start_time": streamed_chunks[0].meta.get("received_at"), # first chunk received
"usage": dict(completion_chunk.usage or {}),
}
)
@ -296,12 +297,17 @@ class OpenAIGenerator:
:returns:
The StreamingChunk.
"""
# function or tools calls are not going to happen in non-chat generation
# as users can not send ChatMessage with function or tools calls
choice = chunk.choices[0]
content = choice.delta.content or ""
chunk_message = StreamingChunk(content)
chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason})
chunk_message.meta.update(
{
"model": chunk.model,
"index": choice.index,
"finish_reason": choice.finish_reason,
"received_at": datetime.now().isoformat(),
}
)
return chunk_message
@staticmethod

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Added a `completion_start_time` metadata field to streaming responses from the Hugging Face API and OpenAI (including Azure OpenAI) generators, enabling time-to-first-token (TTFT) measurement.

View File

@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from datetime import datetime
import os
from unittest.mock import MagicMock, Mock, patch
@ -503,9 +504,13 @@ class TestHuggingFaceAPIChatGenerator:
assert isinstance(response["replies"], list)
assert len(response["replies"]) > 0
assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
assert "usage" in response["replies"][0].meta
assert "prompt_tokens" in response["replies"][0].meta["usage"]
assert "completion_tokens" in response["replies"][0].meta["usage"]
response_meta = response["replies"][0].meta
assert "completion_start_time" in response_meta
assert datetime.fromisoformat(response_meta["completion_start_time"]) <= datetime.now()
assert "usage" in response_meta
assert "prompt_tokens" in response_meta["usage"]
assert "completion_tokens" in response_meta["usage"]
@pytest.mark.integration
@pytest.mark.skipif(

View File

@ -546,6 +546,10 @@ class TestOpenAIChatGenerator:
assert callback.counter > 1
assert "Paris" in callback.responses
# check that the completion_start_time is set and valid ISO format
assert "completion_start_time" in message.meta
assert datetime.fromisoformat(message.meta["completion_start_time"]) < datetime.now()
@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",

View File

@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
import os
from unittest.mock import MagicMock, Mock, patch
from datetime import datetime
import pytest
from huggingface_hub import (
@ -312,3 +313,25 @@ class TestHuggingFaceAPIGenerator:
assert isinstance(response["meta"], list)
assert len(response["meta"]) > 0
assert [isinstance(meta, dict) for meta in response["meta"]]
@pytest.mark.integration
@pytest.mark.skipif(
    not os.environ.get("HF_API_TOKEN", None),
    reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
)
def test_live_run_streaming_check_completion_start_time(self):
    """Live streaming run: the final reply metadata must carry a parseable, non-future
    ``completion_start_time`` timestamp (the time-to-first-token marker added by the generator)."""
    generator = HuggingFaceAPIGenerator(
        api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
        streaming_callback=streaming_callback_handler,
    )
    results = generator.run("What is the capital of France?")
    # One prompt in, exactly one reply out; content check confirms the model actually answered.
    assert len(results["replies"]) == 1
    assert "Paris" in results["replies"][0]
    # Verify completion start time in final metadata: it must be present, parse as ISO 8601,
    # and cannot lie in the future relative to the moment this assertion runs.
    assert "completion_start_time" in results["meta"][0]
    completion_start = datetime.fromisoformat(results["meta"][0]["completion_start_time"])
    assert completion_start <= datetime.now()

View File

@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from datetime import datetime
import logging
import os
from typing import List
@ -286,6 +287,9 @@ class TestOpenAIGenerator:
assert "gpt-4o-mini" in metadata["model"]
assert metadata["finish_reason"] == "stop"
assert "completion_start_time" in metadata
assert datetime.fromisoformat(metadata["completion_start_time"]) <= datetime.now()
# unfortunately, the usage is not available for streaming calls
# we keep the key in the metadata for compatibility
assert "usage" in metadata and len(metadata["usage"]) == 0