diff --git a/haystack/components/audio/whisper_local.py b/haystack/components/audio/whisper_local.py
index 761cf4adc..63f25a811 100644
--- a/haystack/components/audio/whisper_local.py
+++ b/haystack/components/audio/whisper_local.py
@@ -134,16 +134,16 @@ class LocalWhisperTranscriber:
             if not isinstance(source, ByteStream):
                 path = Path(source)
                 source = ByteStream.from_file_path(path)
-                source.metadata["file_path"] = path
+                source.meta["file_path"] = path
             else:
                 # If we received a ByteStream instance that doesn't have the "file_path" metadata set,
                 # we dump the bytes into a temporary file.
-                path = source.metadata.get("file_path")
+                path = source.meta.get("file_path")
                 if path is None:
                     fp = tempfile.NamedTemporaryFile(delete=False)
                     path = Path(fp.name)
                     source.to_file(path)
-                    source.metadata["file_path"] = path
+                    source.meta["file_path"] = path
 
             transcription = self._model.transcribe(str(path), **kwargs)
             if not return_segments:
diff --git a/haystack/components/audio/whisper_remote.py b/haystack/components/audio/whisper_remote.py
index 02384e2d4..1fc5c049c 100644
--- a/haystack/components/audio/whisper_remote.py
+++ b/haystack/components/audio/whisper_remote.py
@@ -129,13 +129,13 @@ class RemoteWhisperTranscriber:
             if not isinstance(source, ByteStream):
                 path = source
                 source = ByteStream.from_file_path(Path(source))
-                source.metadata["file_path"] = path
+                source.meta["file_path"] = path
 
             file = io.BytesIO(source.data)
-            file.name = str(source.metadata["file_path"]) if "file_path" in source.metadata else "__fallback__.wav"
+            file.name = str(source.meta["file_path"]) if "file_path" in source.meta else "__fallback__.wav"
 
             content = openai.Audio.transcribe(file=file, model=self.model_name, **self.whisper_params)
-            doc = Document(content=content["text"], meta=source.metadata)
+            doc = Document(content=content["text"], meta=source.meta)
             documents.append(doc)
 
         return {"documents": documents}
diff --git a/haystack/components/builders/answer_builder.py b/haystack/components/builders/answer_builder.py
index 5815add26..81dddc62b 100644
--- a/haystack/components/builders/answer_builder.py
+++ b/haystack/components/builders/answer_builder.py
@@ -42,7 +42,7 @@ class AnswerBuilder:
         self,
         query: str,
         replies: List[str],
-        metadata: Optional[List[Dict[str, Any]]] = None,
+        meta: Optional[List[Dict[str, Any]]] = None,
         documents: Optional[List[Document]] = None,
         pattern: Optional[str] = None,
         reference_pattern: Optional[str] = None,
@@ -52,7 +52,7 @@ class AnswerBuilder:
         """
         :param query: The query used in the prompts for the Generator as a string.
         :param replies: The output of the Generator. A list of strings.
-        :param metadata: The metadata returned by the Generator. An optional list of dictionaries. If not specified,
+        :param meta: The metadata returned by the Generator. An optional list of dictionaries. If not specified,
             the generated answer will contain no metadata.
         :param documents: The documents used as input to the Generator. A list of `Document` objects. If
             `documents` are specified, they are added to the `Answer` objects.
@@ -74,10 +74,10 @@ class AnswerBuilder:
             If not specified, no parsing is done, and all documents are referenced. Default: `None`.
""" - if not metadata: - metadata = [{}] * len(replies) - elif len(replies) != len(metadata): - raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(metadata)}) must match.") + if not meta: + meta = [{}] * len(replies) + elif len(replies) != len(meta): + raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(meta)}) must match.") if pattern: AnswerBuilder._check_num_groups_in_regex(pattern) @@ -86,7 +86,7 @@ class AnswerBuilder: reference_pattern = reference_pattern or self.reference_pattern all_answers = [] - for reply, meta in zip(replies, metadata): + for reply, metadata in zip(replies, meta): referenced_docs = [] if documents: reference_idxs = [] @@ -102,7 +102,7 @@ class AnswerBuilder: logger.warning("Document index '%s' referenced in Generator output is out of range. ", idx + 1) answer_string = AnswerBuilder._extract_answer_string(reply, pattern) - answer = GeneratedAnswer(data=answer_string, query=query, documents=referenced_docs, meta=meta) + answer = GeneratedAnswer(data=answer_string, query=query, documents=referenced_docs, meta=metadata) all_answers.append(answer) return {"answers": all_answers} diff --git a/haystack/components/builders/dynamic_prompt_builder.py b/haystack/components/builders/dynamic_prompt_builder.py index cb5554537..59b025009 100644 --- a/haystack/components/builders/dynamic_prompt_builder.py +++ b/haystack/components/builders/dynamic_prompt_builder.py @@ -53,7 +53,7 @@ class DynamicPromptBuilder: >> {'llm': {'replies': [ChatMessage(content="Berlin is the capital city of Germany and one of the most vibrant and diverse cities in Europe. Here are some key things to know...Enjoy your time exploring the vibrant and dynamic - capital of Germany!", role=, name=None, metadata={'model': 'gpt-3.5-turbo-0613', + capital of Germany!", role=, name=None, meta={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 27, 'completion_tokens': 681, 'total_tokens': 708}})]}} @@ -65,7 +65,7 @@ class DynamicPromptBuilder: print(res) >> {'llm': {'replies': [ChatMessage(content="Here is the weather forecast for Berlin in the next 5 days:\\n\\nDay 1: Mostly cloudy with a high of 22°C (72°F) and...so it's always a good idea to check for updates - closer to your visit.", role=, name=None, metadata={'model': 'gpt-3.5-turbo-0613', + closer to your visit.", role=, name=None, meta={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 37, 'completion_tokens': 201, 'total_tokens': 238}})]}} ``` @@ -126,7 +126,7 @@ class DynamicPromptBuilder: "template_variables":{"query": "who's making a greeting?"}}}) >> {'llm': {'replies': [ChatMessage(content='Haystack', role=, name=None, - >> metadata={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': + >> meta={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': >> {'prompt_tokens': 51, 'completion_tokens': 2, 'total_tokens': 53}})]}} ``` @@ -159,7 +159,7 @@ class DynamicPromptBuilder: "template_variables":{"query": "Where does the speaker live?"}}}) >> {'llm': {'replies': ['The speaker lives in Berlin.'], - >> 'metadata': [{'model': 'gpt-3.5-turbo-0613', + >> 'meta': [{'model': 'gpt-3.5-turbo-0613', >> 'index': 0, >> 'finish_reason': 'stop', >> 'usage': {'prompt_tokens': 28, diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index a66e05f9f..2f97dcc81 100644 --- a/haystack/components/converters/azure.py +++ 
b/haystack/components/converters/azure.py @@ -104,11 +104,11 @@ class AzureOCRDocumentConverter: azure_output.append(result.to_dict()) file_suffix = None - if "file_path" in bytestream.metadata: - file_suffix = Path(bytestream.metadata["file_path"]).suffix + if "file_path" in bytestream.meta: + file_suffix = Path(bytestream.meta["file_path"]).suffix document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix) - merged_metadata = {**bytestream.metadata, **metadata} + merged_metadata = {**bytestream.meta, **metadata} document.meta = merged_metadata documents.append(document) diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index 859f62fed..83f0acd43 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -83,7 +83,7 @@ class HTMLToDocument: logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e) continue - merged_metadata = {**bytestream.metadata, **metadata} + merged_metadata = {**bytestream.meta, **metadata} document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py index eb867bc66..c5c5a83a2 100644 --- a/haystack/components/converters/markdown.py +++ b/haystack/components/converters/markdown.py @@ -83,7 +83,7 @@ class MarkdownToDocument: logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e) continue - merged_metadata = {**bytestream.metadata, **metadata} + merged_metadata = {**bytestream.meta, **metadata} document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index ba2fb1472..735426fdf 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -111,7 +111,7 @@ class PyPDFToDocument: logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e) continue - merged_metadata = {**bytestream.metadata, **metadata} + merged_metadata = {**bytestream.meta, **metadata} document.meta = merged_metadata documents.append(document) diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py index 4543da24a..eed81fe30 100644 --- a/haystack/components/converters/tika.py +++ b/haystack/components/converters/tika.py @@ -77,7 +77,7 @@ class TikaDocumentConverter: logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e) continue - merged_metadata = {**bytestream.metadata, **metadata} + merged_metadata = {**bytestream.meta, **metadata} document = Document(content=text, meta=merged_metadata) documents.append(document) return {"documents": documents} diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py index 4e16759dd..08c48b97c 100644 --- a/haystack/components/converters/txt.py +++ b/haystack/components/converters/txt.py @@ -63,13 +63,13 @@ class TextFileToDocument: logger.warning("Could not read %s. Skipping it. Error: %s", source, e) continue try: - encoding = bytestream.metadata.get("encoding", self.encoding) + encoding = bytestream.meta.get("encoding", self.encoding) text = bytestream.data.decode(encoding) except Exception as e: logger.warning("Could not convert file %s. Skipping it. 
Error message: %s", source, e) continue - merged_metadata = {**bytestream.metadata, **metadata} + merged_metadata = {**bytestream.meta, **metadata} document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/utils.py b/haystack/components/converters/utils.py index b8871aeec..d5040635e 100644 --- a/haystack/components/converters/utils.py +++ b/haystack/components/converters/utils.py @@ -15,6 +15,6 @@ def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStre return source if isinstance(source, (str, Path)): bs = ByteStream.from_file_path(Path(source)) - bs.metadata["file_path"] = str(source) + bs.meta["file_path"] = str(source) return bs raise ValueError(f"Unsupported source type {type(source)}") diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index e1fee5f71..7d531694c 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -118,7 +118,7 @@ class LinkContentFetcher: # don't use multithreading if there's only one URL if len(urls) == 1: stream_metadata, stream = self.fetch(urls[0]) - stream.metadata.update(stream_metadata) + stream.meta.update(stream_metadata) streams.append(stream) else: with ThreadPoolExecutor() as executor: @@ -126,7 +126,7 @@ class LinkContentFetcher: for stream_metadata, stream in results: # type: ignore if stream_metadata is not None and stream is not None: - stream.metadata.update(stream_metadata) + stream.meta.update(stream_metadata) streams.append(stream) return {"streams": streams} diff --git a/haystack/components/generators/chat/hugging_face_tgi.py b/haystack/components/generators/chat/hugging_face_tgi.py index 33ad9c67b..7363497c5 100644 --- a/haystack/components/generators/chat/hugging_face_tgi.py +++ b/haystack/components/generators/chat/hugging_face_tgi.py @@ -241,7 +241,7 @@ class HuggingFaceTGIChatGenerator: self.streaming_callback(stream_chunk) # type: ignore # streaming_callback is not None (verified in the run method) message = ChatMessage.from_assistant(chunk.generated_text) - message.metadata.update( + message.meta.update( { "finish_reason": chunk.details.finish_reason.value, "index": 0, @@ -264,7 +264,7 @@ class HuggingFaceTGIChatGenerator: prepared_prompt, details=True, **generation_kwargs ) message = ChatMessage.from_assistant(tgr.generated_text) - message.metadata.update( + message.meta.update( { "finish_reason": tgr.details.finish_reason.value, "index": _i, diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py index 1ec2b541e..09cd4e58e 100644 --- a/haystack/components/generators/chat/openai.py +++ b/haystack/components/generators/chat/openai.py @@ -42,7 +42,7 @@ class GPTChatGenerator: >>{'replies': [ChatMessage(content='Natural Language Processing (NLP) is a branch of artificial intelligence >>that focuses on enabling computers to understand, interpret, and generate human language in a way that is >>meaningful and useful.', role=, name=None, - >>metadata={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', + >>meta={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', >>'usage': {'prompt_tokens': 15, 'completion_tokens': 36, 'total_tokens': 51}})]} ``` @@ -218,7 +218,7 @@ class GPTChatGenerator: :param chunks: The list of all chunks returned by the OpenAI API. 
""" complete_response = ChatMessage.from_assistant("".join([chunk.content for chunk in chunks])) - complete_response.metadata.update( + complete_response.meta.update( { "model": chunk.model, "index": 0, @@ -239,7 +239,7 @@ class GPTChatGenerator: # message.content is str but message.function_call is OpenAIObject but JSON in fact, convert to str content = str(message.function_call) if choice.finish_reason == "function_call" else message.content chat_message = ChatMessage.from_assistant(content) - chat_message.metadata.update( + chat_message.meta.update( { "model": completion.model, "index": choice.index, @@ -264,9 +264,7 @@ class GPTChatGenerator: else: content = "" chunk_message = StreamingChunk(content) - chunk_message.metadata.update( - {"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason} - ) + chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason}) return chunk_message def _check_finish_reason(self, message: ChatMessage) -> None: @@ -275,13 +273,13 @@ class GPTChatGenerator: If the `finish_reason` is `length` or `content_filter`, log a warning. :param message: The message returned by the LLM. """ - if message.metadata["finish_reason"] == "length": + if message.meta["finish_reason"] == "length": logger.warning( "The completion for index %s has been truncated before reaching a natural stopping point. " "Increase the max_tokens parameter to allow for longer completions.", - message.metadata["index"], + message.meta["index"], ) - if message.metadata["finish_reason"] == "content_filter": + if message.meta["finish_reason"] == "content_filter": logger.warning( - "The completion for index %s has been truncated due to the content filter.", message.metadata["index"] + "The completion for index %s has been truncated due to the content filter.", message.meta["index"] ) diff --git a/haystack/components/generators/hugging_face_tgi.py b/haystack/components/generators/hugging_face_tgi.py index e7d9128d8..f55a0e2da 100644 --- a/haystack/components/generators/hugging_face_tgi.py +++ b/haystack/components/generators/hugging_face_tgi.py @@ -157,7 +157,7 @@ class HuggingFaceTGIGenerator: # Don't send URL as it is sensitive information return {"model": self.model} - @component.output_types(replies=List[str], metadata=List[Dict[str, Any]]) + @component.output_types(replies=List[str], meta=List[Dict[str, Any]]) def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None): """ Invoke the text generation inference for the given prompt and generation parameters. 
@@ -204,15 +204,15 @@ class HuggingFaceTGIGenerator:
             chunks.append(stream_chunk)
             self.streaming_callback(stream_chunk)  # type: ignore # streaming_callback is not None (verified in the run method)
         metadata = {
-            "finish_reason": chunks[-1].metadata.get("finish_reason", None),
+            "finish_reason": chunks[-1].meta.get("finish_reason", None),
             "model": self.client.model,
             "usage": {
-                "completion_tokens": chunks[-1].metadata.get("generated_tokens", 0),
+                "completion_tokens": chunks[-1].meta.get("generated_tokens", 0),
                 "prompt_tokens": prompt_token_count,
-                "total_tokens": prompt_token_count + chunks[-1].metadata.get("generated_tokens", 0),
+                "total_tokens": prompt_token_count + chunks[-1].meta.get("generated_tokens", 0),
             },
         }
-        return {"replies": ["".join([chunk.content for chunk in chunks])], "metadata": [metadata]}
+        return {"replies": ["".join([chunk.content for chunk in chunks])], "meta": [metadata]}
 
     def _run_non_streaming(
         self, prompt: str, prompt_token_count: int, num_responses: int, generation_kwargs: Dict[str, Any]
@@ -234,4 +234,4 @@ class HuggingFaceTGIGenerator:
                 }
             )
             responses.append(tgr.generated_text)
-        return {"replies": responses, "metadata": all_metadata}
+        return {"replies": responses, "meta": all_metadata}
diff --git a/haystack/components/generators/openai.py b/haystack/components/generators/openai.py
index bb4f807bf..73a829830 100644
--- a/haystack/components/generators/openai.py
+++ b/haystack/components/generators/openai.py
@@ -37,7 +37,7 @@ class GPTGenerator:
    >> {'replies': ['Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on
    >> the interaction between computers and human language. It involves enabling computers to understand, interpret,
-   >> and respond to natural human language in a way that is both meaningful and useful.'], 'metadata': [{'model':
+   >> and respond to natural human language in a way that is both meaningful and useful.'], 'meta': [{'model':
    >> 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 16,
    >> 'completion_tokens': 49, 'total_tokens': 65}}]}
    ```
 
@@ -146,7 +146,7 @@ class GPTGenerator:
         data["init_parameters"]["streaming_callback"] = deserialize_callback_handler(serialized_callback_handler)
         return default_from_dict(cls, data)
 
-    @component.output_types(replies=List[str], metadata=List[Dict[str, Any]])
+    @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
     def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
         """
         Invoke the text generation inference based on the provided messages and generation parameters.
@@ -200,7 +200,7 @@ class GPTGenerator:
 
         return {
             "replies": [message.content for message in completions],
-            "metadata": [message.metadata for message in completions],
+            "meta": [message.meta for message in completions],
         }
 
     def _convert_to_openai_format(self, messages: List[ChatMessage]) -> List[Dict[str, Any]]:
@@ -222,7 +222,7 @@ class GPTGenerator:
         Connects the streaming chunks into a single ChatMessage.
""" complete_response = ChatMessage.from_assistant("".join([chunk.content for chunk in chunks])) - complete_response.metadata.update( + complete_response.meta.update( { "model": chunk.model, "index": 0, @@ -242,7 +242,7 @@ class GPTGenerator: message: OpenAIObject = choice.message content = dict(message.function_call) if choice.finish_reason == "function_call" else message.content chat_message = ChatMessage.from_assistant(content) - chat_message.metadata.update( + chat_message.meta.update( { "model": completion.model, "index": choice.index, @@ -267,9 +267,7 @@ class GPTGenerator: else: content = "" chunk_message = StreamingChunk(content) - chunk_message.metadata.update( - {"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason} - ) + chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason}) return chunk_message def _check_finish_reason(self, message: ChatMessage) -> None: @@ -278,13 +276,13 @@ class GPTGenerator: If the `finish_reason` is `length`, log a warning to the user. :param message: The message returned by the LLM. """ - if message.metadata["finish_reason"] == "length": + if message.meta["finish_reason"] == "length": logger.warning( "The completion for index %s has been truncated before reaching a natural stopping point. " "Increase the max_tokens parameter to allow for longer completions.", - message.metadata["index"], + message.meta["index"], ) - if message.metadata["finish_reason"] == "content_filter": + if message.meta["finish_reason"] == "content_filter": logger.warning( - "The completion for index %s has been truncated due to the content filter.", message.metadata["index"] + "The completion for index %s has been truncated due to the content filter.", message.meta["index"] ) diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py index 932472bf1..59f87f0ae 100644 --- a/haystack/components/routers/file_type_router.py +++ b/haystack/components/routers/file_type_router.py @@ -58,7 +58,7 @@ class FileTypeRouter: if isinstance(source, Path): mime_type = self.get_mime_type(source) elif isinstance(source, ByteStream): - mime_type = source.metadata.get("content_type") + mime_type = source.meta.get("content_type") else: raise ValueError(f"Unsupported data source type: {type(source)}") diff --git a/haystack/dataclasses/byte_stream.py b/haystack/dataclasses/byte_stream.py index dd84e1c26..6ccf32464 100644 --- a/haystack/dataclasses/byte_stream.py +++ b/haystack/dataclasses/byte_stream.py @@ -10,7 +10,7 @@ class ByteStream: """ data: bytes - metadata: Dict[str, Any] = field(default_factory=dict, hash=False) + meta: Dict[str, Any] = field(default_factory=dict, hash=False) mime_type: Optional[str] = field(default=None) def to_file(self, destination_path: Path): diff --git a/haystack/dataclasses/chat_message.py b/haystack/dataclasses/chat_message.py index 08c61d6cf..b85c13747 100644 --- a/haystack/dataclasses/chat_message.py +++ b/haystack/dataclasses/chat_message.py @@ -20,13 +20,13 @@ class ChatMessage: :param content: The text content of the message. :param role: The role of the entity sending the message. :param name: The name of the function being called (only applicable for role FUNCTION). - :param metadata: Additional metadata associated with the message. + :param meta: Additional metadata associated with the message. 
""" content: str role: ChatRole name: Optional[str] - metadata: Dict[str, Any] = field(default_factory=dict, hash=False) + meta: Dict[str, Any] = field(default_factory=dict, hash=False) def is_from(self, role: ChatRole) -> bool: """ @@ -38,15 +38,15 @@ class ChatMessage: return self.role == role @classmethod - def from_assistant(cls, content: str, metadata: Optional[Dict[str, Any]] = None) -> "ChatMessage": + def from_assistant(cls, content: str, meta: Optional[Dict[str, Any]] = None) -> "ChatMessage": """ Create a message from the assistant. :param content: The text content of the message. - :param metadata: Additional metadata associated with the message. + :param meta: Additional metadata associated with the message. :return: A new ChatMessage instance. """ - return cls(content, ChatRole.ASSISTANT, None, metadata or {}) + return cls(content, ChatRole.ASSISTANT, None, meta or {}) @classmethod def from_user(cls, content: str) -> "ChatMessage": diff --git a/haystack/dataclasses/streaming_chunk.py b/haystack/dataclasses/streaming_chunk.py index 124556043..f2f2b990f 100644 --- a/haystack/dataclasses/streaming_chunk.py +++ b/haystack/dataclasses/streaming_chunk.py @@ -10,8 +10,8 @@ class StreamingChunk: streamed data in a systematic manner. :param content: The content of the message chunk as a string. - :param metadata: A dictionary containing metadata related to the message chunk. + :param meta: A dictionary containing metadata related to the message chunk. """ content: str - metadata: Dict[str, Any] = field(default_factory=dict, hash=False) + meta: Dict[str, Any] = field(default_factory=dict, hash=False) diff --git a/haystack/pipeline_utils/rag.py b/haystack/pipeline_utils/rag.py index fdf8ca563..1c088efe8 100644 --- a/haystack/pipeline_utils/rag.py +++ b/haystack/pipeline_utils/rag.py @@ -67,7 +67,7 @@ class _RAGPipeline: self.pipeline.connect("retriever", "prompt_builder.documents") self.pipeline.connect("prompt_builder.prompt", "llm.prompt") self.pipeline.connect("llm.replies", "answer_builder.replies") - self.pipeline.connect("llm.metadata", "answer_builder.metadata") + self.pipeline.connect("llm.meta", "answer_builder.meta") self.pipeline.connect("retriever", "answer_builder.documents") def run(self, query: str) -> Answer: diff --git a/releasenotes/notes/changed-metadata-to-meta-64cceb9ed19722fe.yaml b/releasenotes/notes/changed-metadata-to-meta-64cceb9ed19722fe.yaml new file mode 100644 index 000000000..5710841ba --- /dev/null +++ b/releasenotes/notes/changed-metadata-to-meta-64cceb9ed19722fe.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Rename all metadata references to meta. 
diff --git a/test/components/audio/test_whisper_local.py b/test/components/audio/test_whisper_local.py
index 82da74ab7..0fb6fdfc1 100644
--- a/test/components/audio/test_whisper_local.py
+++ b/test/components/audio/test_whisper_local.py
@@ -125,7 +125,7 @@ class TestLocalWhisperTranscriber:
         }
         path = SAMPLES_PATH / "audio" / "this is the content of the document.wav"
         bs = ByteStream.from_file_path(path)
-        bs.metadata["file_path"] = path
+        bs.meta["file_path"] = path
         results = comp.transcribe(sources=[bs])
         expected = Document(
             content="test transcription", meta={"audio_file": path, "other_metadata": ["other", "meta", "data"]}
diff --git a/test/components/audio/test_whisper_remote.py b/test/components/audio/test_whisper_remote.py
index 2c1b45963..fb5e31ab6 100644
--- a/test/components/audio/test_whisper_remote.py
+++ b/test/components/audio/test_whisper_remote.py
@@ -210,7 +210,7 @@ class TestRemoteWhisperTranscriber:
         transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
         with open(file_path, "rb") as audio_stream:
             byte_stream = audio_stream.read()
-            audio_file = ByteStream(byte_stream, metadata={"file_path": str(file_path.absolute())})
+            audio_file = ByteStream(byte_stream, meta={"file_path": str(file_path.absolute())})
 
             result = transcriber.run(sources=[audio_file])
 
diff --git a/test/components/builders/test_answer_builder.py b/test/components/builders/test_answer_builder.py
index 10ec43ba6..4f94852df 100644
--- a/test/components/builders/test_answer_builder.py
+++ b/test/components/builders/test_answer_builder.py
@@ -10,7 +10,7 @@ class TestAnswerBuilder:
     def test_run_unmatching_input_len(self):
         component = AnswerBuilder()
         with pytest.raises(ValueError):
-            component.run(query="query", replies=["reply1"], metadata=[{"test": "meta"}, {"test": "meta2"}])
+            component.run(query="query", replies=["reply1"], meta=[{"test": "meta"}, {"test": "meta2"}])
 
     def test_run_without_meta(self):
         component = AnswerBuilder()
@@ -24,7 +24,7 @@ class TestAnswerBuilder:
 
     def test_run_meta_is_an_empty_list(self):
         component = AnswerBuilder()
-        output = component.run(query="query", replies=["reply1"], metadata=[])
+        output = component.run(query="query", replies=["reply1"], meta=[])
         answers = output["answers"]
         assert answers[0].data == "reply1"
         assert answers[0].meta == {}
@@ -34,7 +34,7 @@ class TestAnswerBuilder:
 
     def test_run_without_pattern(self):
         component = AnswerBuilder()
-        output = component.run(query="test query", replies=["Answer: AnswerString"], metadata=[{}])
+        output = component.run(query="test query", replies=["Answer: AnswerString"], meta=[{}])
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "Answer: AnswerString"
@@ -45,7 +45,7 @@ class TestAnswerBuilder:
 
     def test_run_with_pattern_with_capturing_group(self):
         component = AnswerBuilder(pattern=r"Answer: (.*)")
-        output = component.run(query="test query", replies=["Answer: AnswerString"], metadata=[{}])
+        output = component.run(query="test query", replies=["Answer: AnswerString"], meta=[{}])
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "AnswerString"
@@ -56,7 +56,7 @@ class TestAnswerBuilder:
 
     def test_run_with_pattern_without_capturing_group(self):
         component = AnswerBuilder(pattern=r"'.*'")
-        output = component.run(query="test query", replies=["Answer: 'AnswerString'"], metadata=[{}])
+        output = component.run(query="test query", replies=["Answer: 'AnswerString'"], meta=[{}])
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "'AnswerString'"
@@ -71,9 +71,7 @@ class TestAnswerBuilder:
 
     def test_run_with_pattern_set_at_runtime(self):
         component = AnswerBuilder(pattern="unused pattern")
-        output = component.run(
-            query="test query", replies=["Answer: AnswerString"], metadata=[{}], pattern=r"Answer: (.*)"
-        )
+        output = component.run(query="test query", replies=["Answer: AnswerString"], meta=[{}], pattern=r"Answer: (.*)")
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "AnswerString"
@@ -87,7 +85,7 @@ class TestAnswerBuilder:
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2")],
         )
         answers = output["answers"]
@@ -104,7 +102,7 @@ class TestAnswerBuilder:
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString[2]"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2")],
         )
         answers = output["answers"]
@@ -121,7 +119,7 @@ class TestAnswerBuilder:
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString[3]"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2")],
         )
         answers = output["answers"]
@@ -137,7 +135,7 @@ class TestAnswerBuilder:
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString[2][3]"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2"), Document(content="test doc 3")],
             reference_pattern="\\[(\\d+)\\]",
         )
diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py
index e5f434519..36f8f6a90 100644
--- a/test/components/converters/test_azure_ocr_doc_converter.py
+++ b/test/components/converters/test_azure_ocr_doc_converter.py
@@ -45,7 +45,7 @@ class TestAzureOCRDocumentConverter:
         }
 
     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
             component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py
index 1cdb47812..dfd3cfd96 100644
--- a/test/components/converters/test_html_to_document.py
+++ b/test/components/converters/test_html_to_document.py
@@ -63,7 +63,7 @@ class TestHTMLToDocument:
         converter = HTMLToDocument()
         with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
             byte_stream = file.read()
-        stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"})
+        stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url"})
 
         results = converter.run(sources=[stream])
         docs = results["documents"]
@@ -81,7 +81,7 @@ class TestHTMLToDocument:
         converter = HTMLToDocument()
         with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
             byte_stream = file.read()
-        stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"})
+        stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url"})
 
         metadata = [{"file_name": "what_is_haystack.html"}]
         results = converter.run(sources=[stream], meta=metadata)
@@ -103,7 +103,7 @@ class TestHTMLToDocument:
         with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
             byte_stream = file.read()
         # ByteStream has "url" present in metadata
-        stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url_correct"})
+        stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url_correct"})
 
         # "url" supplied by the user overwrites value present in metadata
         metadata = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}]
diff --git a/test/components/converters/test_markdown_to_document.py b/test/components/converters/test_markdown_to_document.py
index 7b47551ac..3764fd7a6 100644
--- a/test/components/converters/test_markdown_to_document.py
+++ b/test/components/converters/test_markdown_to_document.py
@@ -32,7 +32,7 @@ class TestMarkdownToDocument:
             assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content
 
     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         converter = MarkdownToDocument()
 
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
index 7da02a4cd..d403cc0e7 100644
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@@ -30,7 +30,7 @@ class TestPyPDFToDocument:
         assert "ReAct" in docs[0].content
 
     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         converter = PyPDFToDocument()
         with patch("haystack.components.converters.pypdf.PdfReader"):
diff --git a/test/components/converters/test_textfile_to_document.py b/test/components/converters/test_textfile_to_document.py
index a99c52420..5e115775a 100644
--- a/test/components/converters/test_textfile_to_document.py
+++ b/test/components/converters/test_textfile_to_document.py
@@ -14,8 +14,8 @@ class TestTextfileToDocument:
         Test if the component runs correctly.
         """
         bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt")
-        bytestream.metadata["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
-        bytestream.metadata["key"] = "value"
+        bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
+        bytestream.meta["key"] = "value"
         files = [str(test_files_path / "txt" / "doc_1.txt"), test_files_path / "txt" / "doc_2.txt", bytestream]
         converter = TextFileToDocument()
         output = converter.run(sources=files)
@@ -26,7 +26,7 @@ class TestTextfileToDocument:
         assert "That's yet another file!" in docs[2].content
         assert docs[0].meta["file_path"] == str(files[0])
         assert docs[1].meta["file_path"] == str(files[1])
-        assert docs[2].meta == bytestream.metadata
+        assert docs[2].meta == bytestream.meta
 
     def test_run_error_handling(self, test_files_path, caplog):
         """
@@ -47,18 +47,18 @@ class TestTextfileToDocument:
         Test if the encoding metadata field is used properly
         """
         bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_1.txt")
-        bytestream.metadata["key"] = "value"
+        bytestream.meta["key"] = "value"
 
         converter = TextFileToDocument(encoding="utf-16")
         output = converter.run(sources=[bytestream])
         assert "Some text for testing." not in output["documents"][0].content
 
-        bytestream.metadata["encoding"] = "utf-8"
+        bytestream.meta["encoding"] = "utf-8"
         output = converter.run(sources=[bytestream])
         assert "Some text for testing." in output["documents"][0].content
 
     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         converter = TextFileToDocument()
 
diff --git a/test/components/converters/test_tika_doc_converter.py b/test/components/converters/test_tika_doc_converter.py
index e65fb0c8b..23c1fa92f 100644
--- a/test/components/converters/test_tika_doc_converter.py
+++ b/test/components/converters/test_tika_doc_converter.py
@@ -19,7 +19,7 @@ class TestTikaDocumentConverter:
         assert documents[0].content == "Content of mock source"
 
     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         converter = TikaDocumentConverter()
         with patch("haystack.components.converters.tika.tika_parser.from_buffer"):
diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py
index 816185d4d..e3f350d5a 100644
--- a/test/components/fetchers/test_link_content_fetcher.py
+++ b/test/components/fetchers/test_link_content_fetcher.py
@@ -67,7 +67,7 @@ class TestLinkContentFetcher:
             streams = fetcher.run(urls=["https://www.example.com"])["streams"]
             first_stream = streams[0]
             assert first_stream.data == correct_response
-            assert first_stream.metadata["content_type"] == "text/plain"
+            assert first_stream.meta["content_type"] == "text/plain"
 
     def test_run_html(self):
         correct_response = b"<html>Example test response</html>"
@@ -79,7 +79,7 @@ class TestLinkContentFetcher:
             streams = fetcher.run(urls=["https://www.example.com"])["streams"]
             first_stream = streams[0]
             assert first_stream.data == correct_response
-            assert first_stream.metadata["content_type"] == "text/html"
+            assert first_stream.meta["content_type"] == "text/html"
 
     def test_run_binary(self, test_files_path):
         file_bytes = open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read()
@@ -91,7 +91,7 @@ class TestLinkContentFetcher:
            streams = fetcher.run(urls=["https://www.example.com"])["streams"]
            first_stream = streams[0]
            assert first_stream.data == file_bytes
-            assert first_stream.metadata["content_type"] == "application/pdf"
+            assert first_stream.meta["content_type"] == "application/pdf"
 
     def test_run_bad_status_code(self):
         empty_byte_stream = b""
@@ -105,7 +105,7 @@ class TestLinkContentFetcher:
         assert len(streams) == 1
         first_stream = streams[0]
         assert first_stream.data == empty_byte_stream
-        assert first_stream.metadata["content_type"] == "text/html"
+        assert first_stream.meta["content_type"] == "text/html"
 
     @pytest.mark.integration
     def test_link_content_fetcher_html(self):
@@ -113,8 +113,8 @@ class TestLinkContentFetcher:
         streams = fetcher.run([HTML_URL])["streams"]
         first_stream = streams[0]
         assert "Haystack" in first_stream.data.decode("utf-8")
-        assert first_stream.metadata["content_type"] == "text/html"
-        assert "url" in first_stream.metadata and first_stream.metadata["url"] == HTML_URL
+        assert first_stream.meta["content_type"] == "text/html"
+        assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL
 
     @pytest.mark.integration
     def test_link_content_fetcher_text(self):
@@ -122,8 +122,8 @@ class TestLinkContentFetcher:
         streams = fetcher.run([TEXT_URL])["streams"]
         first_stream = streams[0]
         assert "Haystack" in first_stream.data.decode("utf-8")
-        assert first_stream.metadata["content_type"] == "text/plain"
-        assert "url" in first_stream.metadata and first_stream.metadata["url"] == TEXT_URL
+        assert first_stream.meta["content_type"] == "text/plain"
+        assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL
 
     @pytest.mark.integration
     def test_link_content_fetcher_pdf(self):
@@ -131,8 +131,8 @@ class TestLinkContentFetcher:
         streams = fetcher.run([PDF_URL])["streams"]
         assert len(streams) == 1
         first_stream = streams[0]
-        assert first_stream.metadata["content_type"] in ("application/octet-stream", "application/pdf")
-        assert "url" in first_stream.metadata and first_stream.metadata["url"] == PDF_URL
+        assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf")
+        assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL
 
     @pytest.mark.integration
     def test_link_content_fetcher_multiple_different_content_types(self):
@@ -143,10 +143,10 @@ class TestLinkContentFetcher:
         streams = fetcher.run([PDF_URL, HTML_URL])["streams"]
         assert len(streams) == 2
         for stream in streams:
-            assert stream.metadata["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
-            if stream.metadata["content_type"] == "text/html":
+            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
+            if stream.meta["content_type"] == "text/html":
                 assert "Haystack" in stream.data.decode("utf-8")
-            elif stream.metadata["content_type"] == "application/pdf":
+            elif stream.meta["content_type"] == "application/pdf":
                 assert len(stream.data) > 0
 
     @pytest.mark.integration
@@ -160,10 +160,10 @@ class TestLinkContentFetcher:
         streams = fetcher.run([PDF_URL, HTML_URL, "https://google.com"])["streams"]
         assert len(streams) == 3
         for stream in streams:
-            assert stream.metadata["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
-            if stream.metadata["content_type"] == "text/html":
+            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
+            if stream.meta["content_type"] == "text/html":
                 assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
-            elif stream.metadata["content_type"] == "application/pdf":
+            elif stream.meta["content_type"] == "application/pdf":
                 assert len(stream.data) > 0
 
     @pytest.mark.integration
@@ -177,7 +177,7 @@ class TestLinkContentFetcher:
         result = fetcher.run(["https://non_existent_website_dot.com/", "https://www.google.com/"])
         assert len(result["streams"]) == 1
         first_stream = result["streams"][0]
-        assert first_stream.metadata["content_type"] == "text/html"
+        assert first_stream.meta["content_type"] == "text/html"
 
     @pytest.mark.integration
     def test_bad_request_exception_raised(self):
diff --git a/test/components/generators/chat/test_openai.py b/test/components/generators/chat/test_openai.py
index e62930cef..9d8aa34f4 100644
--- a/test/components/generators/chat/test_openai.py
+++ b/test/components/generators/chat/test_openai.py
@@ -241,7 +241,7 @@ class TestGPTChatGenerator:
         component = GPTChatGenerator(api_key="test-api-key")
         messages = [
             ChatMessage.from_assistant(
-                "", metadata={"finish_reason": "content_filter" if i % 2 == 0 else "length", "index": i}
+                "", meta={"finish_reason": "content_filter" if i % 2 == 0 else "length", "index": i}
             )
             for i, _ in enumerate(range(4))
         ]
diff --git a/test/components/generators/test_hugging_face_tgi.py b/test/components/generators/test_hugging_face_tgi.py
index 0d59aa781..4ab077147 100644
--- a/test/components/generators/test_hugging_face_tgi.py
+++ b/test/components/generators/test_hugging_face_tgi.py
@@ -124,11 +124,11 @@ class TestHuggingFaceTGIGenerator:
 
         assert isinstance(response, dict)
         assert "replies" in response
-        assert "metadata" in response
+        assert "meta" in response
         assert isinstance(response["replies"], list)
-        assert isinstance(response["metadata"], list)
+        assert isinstance(response["meta"], list)
         assert len(response["replies"]) == 1
-        assert len(response["metadata"]) == 1
+        assert len(response["meta"]) == 1
         assert [isinstance(reply, str) for reply in response["replies"]]
 
     def test_generate_multiple_text_responses_with_valid_prompt_and_generation_parameters(
@@ -157,14 +157,14 @@ class TestHuggingFaceTGIGenerator:
 
         assert isinstance(response, dict)
         assert "replies" in response
-        assert "metadata" in response
+        assert "meta" in response
         assert isinstance(response["replies"], list)
         assert [isinstance(reply, str) for reply in response["replies"]]
 
-        assert isinstance(response["metadata"], list)
+        assert isinstance(response["meta"], list)
         assert len(response["replies"]) == 3
-        assert len(response["metadata"]) == 3
-        assert [isinstance(reply, dict) for reply in response["metadata"]]
+        assert len(response["meta"]) == 3
+        assert [isinstance(reply, dict) for reply in response["meta"]]
 
     def test_initialize_with_invalid_model(self, mock_check_valid_model):
         model = "invalid_model"
@@ -200,9 +200,9 @@ class TestHuggingFaceTGIGenerator:
         assert [isinstance(reply, str) for reply in response["replies"]]
         # Assert that the response contains the metadata
-        assert "metadata" in response
-        assert isinstance(response["metadata"], list)
-        assert len(response["metadata"]) > 0
+        assert "meta" in response
+        assert isinstance(response["meta"], list)
+        assert len(response["meta"]) > 0
         assert [isinstance(reply, dict) for reply in response["replies"]]
 
     def test_generate_text_with_custom_generation_parameters(
         self, mock_check_valid_model
@@ -226,9 +226,9 @@ class TestHuggingFaceTGIGenerator:
         assert response["replies"][0] == "I'm fine, thanks."
 
         # Assert that the response contains the metadata
-        assert "metadata" in response
-        assert isinstance(response["metadata"], list)
-        assert len(response["metadata"]) > 0
+        assert "meta" in response
+        assert isinstance(response["meta"], list)
+        assert len(response["meta"]) > 0
         assert [isinstance(reply, str) for reply in response["replies"]]
 
     def test_generate_text_with_streaming_callback(
@@ -278,7 +278,7 @@ class TestHuggingFaceTGIGenerator:
         assert [isinstance(reply, str) for reply in response["replies"]]
 
         # Assert that the response contains the metadata
-        assert "metadata" in response
-        assert isinstance(response["metadata"], list)
-        assert len(response["metadata"]) > 0
+        assert "meta" in response
+        assert isinstance(response["meta"], list)
+        assert len(response["meta"]) > 0
         assert [isinstance(reply, dict) for reply in response["replies"]]
diff --git a/test/components/generators/test_openai.py b/test/components/generators/test_openai.py
index e81752bde..a5cc2fc38 100644
--- a/test/components/generators/test_openai.py
+++ b/test/components/generators/test_openai.py
@@ -242,7 +242,7 @@ class TestGPTGenerator:
         for i, _ in enumerate(range(4)):
             message = ChatMessage.from_assistant("Hello")
             metadata = {"finish_reason": "content_filter" if i % 2 == 0 else "length", "index": i}
-            message.metadata.update(metadata)
+            message.meta.update(metadata)
             messages.append(message)
 
         for m in messages:
diff --git a/test/components/routers/test_file_router.py b/test/components/routers/test_file_router.py
index c9c868470..9409cc5d5 100644
--- a/test/components/routers/test_file_router.py
+++ b/test/components/routers/test_file_router.py
@@ -46,13 +46,13 @@ class TestFileTypeRouter:
 
         for path, mime_type in zip(file_paths, mime_types):
             stream = ByteStream(path.read_bytes())
-            stream.metadata["content_type"] = mime_type
+            stream.meta["content_type"] = mime_type
             byte_streams.append(stream)
 
         # add unclassified ByteStream
         bs = ByteStream(b"unclassified content")
-        bs.metadata["content_type"] = "unknown_type"
+        bs.meta["content_type"] = "unknown_type"
 
         byte_streams.append(bs)
 
         router = FileTypeRouter(mime_types=["text/plain", "audio/x-wav", "image/jpeg"])
@@ -75,7 +75,7 @@ class TestFileTypeRouter:
         byte_stream_sources = []
         for path, mime_type in zip(file_paths, mime_types):
             stream = ByteStream(path.read_bytes())
-            stream.metadata["content_type"] = mime_type
+            stream.meta["content_type"] = mime_type
             byte_stream_sources.append(stream)
 
         mixed_sources = file_paths[:2] + byte_stream_sources[2:]
diff --git a/test/dataclasses/test_streaming_chunk.py b/test/dataclasses/test_streaming_chunk.py
index cecad8a08..97c1b1987 100644
--- a/test/dataclasses/test_streaming_chunk.py
+++ b/test/dataclasses/test_streaming_chunk.py
@@ -4,25 +4,25 @@ from haystack.dataclasses import StreamingChunk
 
 
 def test_create_chunk_with_content_and_metadata():
-    chunk = StreamingChunk(content="Test content", metadata={"key": "value"})
+    chunk = StreamingChunk(content="Test content", meta={"key": "value"})
 
     assert chunk.content == "Test content"
-    assert chunk.metadata == {"key": "value"}
+    assert chunk.meta == {"key": "value"}
 
 
 def test_create_chunk_with_only_content():
     chunk = StreamingChunk(content="Test content")
 
     assert chunk.content == "Test content"
-    assert chunk.metadata == {}
+    assert chunk.meta == {}
 
 
 def test_access_content():
-    chunk = StreamingChunk(content="Test content", metadata={"key": "value"})
+    chunk = StreamingChunk(content="Test content", meta={"key": "value"})
    assert chunk.content == "Test content"
 
 
 def test_create_chunk_with_empty_content():
     chunk = StreamingChunk(content="")
     assert chunk.content == ""
-    assert chunk.metadata == {}
+    assert chunk.meta == {}
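The rename also changes pipeline wiring: generators now expose a `meta` output socket instead of `metadata`, and `AnswerBuilder` takes a `meta` input. A sketch mirroring the `_RAGPipeline` connections above, assuming the 2.0-beta `Pipeline` API (the component construction here is illustrative, not taken from this diff):

```python
from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.generators.openai import GPTGenerator

pipeline = Pipeline()
pipeline.add_component("llm", GPTGenerator(api_key="YOUR_OPENAI_API_KEY"))
pipeline.add_component("answer_builder", AnswerBuilder())

# Before this change: pipeline.connect("llm.metadata", "answer_builder.metadata")
pipeline.connect("llm.replies", "answer_builder.replies")
pipeline.connect("llm.meta", "answer_builder.meta")
```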