changed metadata to meta (#6605)

sahusiddharth 2023-12-21 17:09:58 +05:30 committed by GitHub
parent fc88ef7076
commit 3d17e6ff76
37 changed files with 138 additions and 140 deletions

View File

@@ -134,16 +134,16 @@ class LocalWhisperTranscriber:
             if not isinstance(source, ByteStream):
                 path = Path(source)
                 source = ByteStream.from_file_path(path)
-                source.metadata["file_path"] = path
+                source.meta["file_path"] = path
             else:
                 # If we received a ByteStream instance that doesn't have the "file_path" metadata set,
                 # we dump the bytes into a temporary file.
-                path = source.metadata.get("file_path")
+                path = source.meta.get("file_path")
                 if path is None:
                     fp = tempfile.NamedTemporaryFile(delete=False)
                     path = Path(fp.name)
                     source.to_file(path)
-                    source.metadata["file_path"] = path
+                    source.meta["file_path"] = path
             transcription = self._model.transcribe(str(path), **kwargs)
             if not return_segments:

View File

@@ -129,13 +129,13 @@ class RemoteWhisperTranscriber:
             if not isinstance(source, ByteStream):
                 path = source
                 source = ByteStream.from_file_path(Path(source))
-                source.metadata["file_path"] = path
+                source.meta["file_path"] = path
             file = io.BytesIO(source.data)
-            file.name = str(source.metadata["file_path"]) if "file_path" in source.metadata else "__fallback__.wav"
+            file.name = str(source.meta["file_path"]) if "file_path" in source.meta else "__fallback__.wav"
             content = openai.Audio.transcribe(file=file, model=self.model_name, **self.whisper_params)
-            doc = Document(content=content["text"], meta=source.metadata)
+            doc = Document(content=content["text"], meta=source.meta)
             documents.append(doc)
         return {"documents": documents}

View File

@@ -42,7 +42,7 @@ class AnswerBuilder:
         self,
         query: str,
         replies: List[str],
-        metadata: Optional[List[Dict[str, Any]]] = None,
+        meta: Optional[List[Dict[str, Any]]] = None,
         documents: Optional[List[Document]] = None,
         pattern: Optional[str] = None,
         reference_pattern: Optional[str] = None,
@@ -52,7 +52,7 @@ class AnswerBuilder:
         :param query: The query used in the prompts for the Generator as a string.
         :param replies: The output of the Generator. A list of strings.
-        :param metadata: The metadata returned by the Generator. An optional list of dictionaries. If not specified,
+        :param meta: The metadata returned by the Generator. An optional list of dictionaries. If not specified,
                          the generated answer will contain no metadata.
         :param documents: The documents used as input to the Generator. A list of `Document` objects. If
                           `documents` are specified, they are added to the `Answer` objects.
@@ -74,10 +74,10 @@
                           If not specified, no parsing is done, and all documents are referenced.
                           Default: `None`.
         """
-        if not metadata:
-            metadata = [{}] * len(replies)
-        elif len(replies) != len(metadata):
-            raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(metadata)}) must match.")
+        if not meta:
+            meta = [{}] * len(replies)
+        elif len(replies) != len(meta):
+            raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(meta)}) must match.")
         if pattern:
             AnswerBuilder._check_num_groups_in_regex(pattern)
@@ -86,7 +86,7 @@
         reference_pattern = reference_pattern or self.reference_pattern
         all_answers = []
-        for reply, meta in zip(replies, metadata):
+        for reply, metadata in zip(replies, meta):
             referenced_docs = []
             if documents:
                 reference_idxs = []
@@ -102,7 +102,7 @@
                 logger.warning("Document index '%s' referenced in Generator output is out of range. ", idx + 1)
             answer_string = AnswerBuilder._extract_answer_string(reply, pattern)
-            answer = GeneratedAnswer(data=answer_string, query=query, documents=referenced_docs, meta=meta)
+            answer = GeneratedAnswer(data=answer_string, query=query, documents=referenced_docs, meta=metadata)
             all_answers.append(answer)
         return {"answers": all_answers}

View File

@@ -53,7 +53,7 @@ class DynamicPromptBuilder:
    >> {'llm': {'replies': [ChatMessage(content="Berlin is the capital city of Germany and one of the most vibrant
    and diverse cities in Europe. Here are some key things to know...Enjoy your time exploring the vibrant and dynamic
-   capital of Germany!", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, metadata={'model': 'gpt-3.5-turbo-0613',
+   capital of Germany!", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-3.5-turbo-0613',
    'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 27, 'completion_tokens': 681, 'total_tokens': 708}})]}}
@@ -65,7 +65,7 @@ class DynamicPromptBuilder:
    print(res)
    >> {'llm': {'replies': [ChatMessage(content="Here is the weather forecast for Berlin in the next 5
    days:\\n\\nDay 1: Mostly cloudy with a high of 22°C (72°F) and...so it's always a good idea to check for updates
-   closer to your visit.", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, metadata={'model': 'gpt-3.5-turbo-0613',
+   closer to your visit.", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-3.5-turbo-0613',
    'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 37, 'completion_tokens': 201, 'total_tokens': 238}})]}}
    ```
@@ -126,7 +126,7 @@ class DynamicPromptBuilder:
    "template_variables":{"query": "who's making a greeting?"}}})
    >> {'llm': {'replies': [ChatMessage(content='Haystack', role=<ChatRole.ASSISTANT: 'assistant'>, name=None,
-   >> metadata={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage':
+   >> meta={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage':
    >> {'prompt_tokens': 51, 'completion_tokens': 2, 'total_tokens': 53}})]}}
    ```
@@ -159,7 +159,7 @@ class DynamicPromptBuilder:
    "template_variables":{"query": "Where does the speaker live?"}}})
    >> {'llm': {'replies': ['The speaker lives in Berlin.'],
-   >> 'metadata': [{'model': 'gpt-3.5-turbo-0613',
+   >> 'meta': [{'model': 'gpt-3.5-turbo-0613',
    >> 'index': 0,
    >> 'finish_reason': 'stop',
    >> 'usage': {'prompt_tokens': 28,

View File

@@ -104,11 +104,11 @@ class AzureOCRDocumentConverter:
             azure_output.append(result.to_dict())
             file_suffix = None
-            if "file_path" in bytestream.metadata:
-                file_suffix = Path(bytestream.metadata["file_path"]).suffix
+            if "file_path" in bytestream.meta:
+                file_suffix = Path(bytestream.meta["file_path"]).suffix
             document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix)
-            merged_metadata = {**bytestream.metadata, **metadata}
+            merged_metadata = {**bytestream.meta, **metadata}
             document.meta = merged_metadata
             documents.append(document)

View File

@@ -83,7 +83,7 @@ class HTMLToDocument:
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
-            merged_metadata = {**bytestream.metadata, **metadata}
+            merged_metadata = {**bytestream.meta, **metadata}
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)

View File

@@ -83,7 +83,7 @@ class MarkdownToDocument:
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
-            merged_metadata = {**bytestream.metadata, **metadata}
+            merged_metadata = {**bytestream.meta, **metadata}
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)

View File

@@ -111,7 +111,7 @@ class PyPDFToDocument:
                 logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
                 continue
-            merged_metadata = {**bytestream.metadata, **metadata}
+            merged_metadata = {**bytestream.meta, **metadata}
             document.meta = merged_metadata
             documents.append(document)

View File

@@ -77,7 +77,7 @@ class TikaDocumentConverter:
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
-            merged_metadata = {**bytestream.metadata, **metadata}
+            merged_metadata = {**bytestream.meta, **metadata}
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)
         return {"documents": documents}

View File

@@ -63,13 +63,13 @@ class TextFileToDocument:
                 logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                 continue
             try:
-                encoding = bytestream.metadata.get("encoding", self.encoding)
+                encoding = bytestream.meta.get("encoding", self.encoding)
                 text = bytestream.data.decode(encoding)
             except Exception as e:
                 logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
                 continue
-            merged_metadata = {**bytestream.metadata, **metadata}
+            merged_metadata = {**bytestream.meta, **metadata}
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)

View File

@@ -15,6 +15,6 @@ def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStream:
         return source
     if isinstance(source, (str, Path)):
         bs = ByteStream.from_file_path(Path(source))
-        bs.metadata["file_path"] = str(source)
+        bs.meta["file_path"] = str(source)
         return bs
     raise ValueError(f"Unsupported source type {type(source)}")

View File

@@ -118,7 +118,7 @@ class LinkContentFetcher:
         # don't use multithreading if there's only one URL
         if len(urls) == 1:
             stream_metadata, stream = self.fetch(urls[0])
-            stream.metadata.update(stream_metadata)
+            stream.meta.update(stream_metadata)
             streams.append(stream)
         else:
             with ThreadPoolExecutor() as executor:
@@ -126,7 +126,7 @@
             for stream_metadata, stream in results:  # type: ignore
                 if stream_metadata is not None and stream is not None:
-                    stream.metadata.update(stream_metadata)
+                    stream.meta.update(stream_metadata)
                     streams.append(stream)
         return {"streams": streams}

View File

@@ -241,7 +241,7 @@ class HuggingFaceTGIChatGenerator:
             self.streaming_callback(stream_chunk)  # type: ignore # streaming_callback is not None (verified in the run method)
         message = ChatMessage.from_assistant(chunk.generated_text)
-        message.metadata.update(
+        message.meta.update(
            {
                "finish_reason": chunk.details.finish_reason.value,
                "index": 0,
@@ -264,7 +264,7 @@
                prepared_prompt, details=True, **generation_kwargs
            )
            message = ChatMessage.from_assistant(tgr.generated_text)
-           message.metadata.update(
+           message.meta.update(
               {
                   "finish_reason": tgr.details.finish_reason.value,
                   "index": _i,

View File

@@ -42,7 +42,7 @@ class GPTChatGenerator:
    >>{'replies': [ChatMessage(content='Natural Language Processing (NLP) is a branch of artificial intelligence
    >>that focuses on enabling computers to understand, interpret, and generate human language in a way that is
    >>meaningful and useful.', role=<ChatRole.ASSISTANT: 'assistant'>, name=None,
-   >>metadata={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop',
+   >>meta={'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop',
    >>'usage': {'prompt_tokens': 15, 'completion_tokens': 36, 'total_tokens': 51}})]}
    ```
@@ -218,7 +218,7 @@ class GPTChatGenerator:
        :param chunks: The list of all chunks returned by the OpenAI API.
        """
        complete_response = ChatMessage.from_assistant("".join([chunk.content for chunk in chunks]))
-       complete_response.metadata.update(
+       complete_response.meta.update(
           {
               "model": chunk.model,
               "index": 0,
@@ -239,7 +239,7 @@ class GPTChatGenerator:
        # message.content is str but message.function_call is OpenAIObject but JSON in fact, convert to str
        content = str(message.function_call) if choice.finish_reason == "function_call" else message.content
        chat_message = ChatMessage.from_assistant(content)
-       chat_message.metadata.update(
+       chat_message.meta.update(
           {
               "model": completion.model,
               "index": choice.index,
@@ -264,9 +264,7 @@ class GPTChatGenerator:
        else:
            content = ""
        chunk_message = StreamingChunk(content)
-       chunk_message.metadata.update(
-           {"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason}
-       )
+       chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason})
        return chunk_message

    def _check_finish_reason(self, message: ChatMessage) -> None:
@@ -275,13 +273,13 @@ class GPTChatGenerator:
        If the `finish_reason` is `length` or `content_filter`, log a warning.
        :param message: The message returned by the LLM.
        """
-       if message.metadata["finish_reason"] == "length":
+       if message.meta["finish_reason"] == "length":
           logger.warning(
               "The completion for index %s has been truncated before reaching a natural stopping point. "
               "Increase the max_tokens parameter to allow for longer completions.",
-              message.metadata["index"],
+              message.meta["index"],
           )
-       if message.metadata["finish_reason"] == "content_filter":
+       if message.meta["finish_reason"] == "content_filter":
           logger.warning(
-              "The completion for index %s has been truncated due to the content filter.", message.metadata["index"]
+              "The completion for index %s has been truncated due to the content filter.", message.meta["index"]
           )

View File

@@ -157,7 +157,7 @@ class HuggingFaceTGIGenerator:
         # Don't send URL as it is sensitive information
         return {"model": self.model}
-    @component.output_types(replies=List[str], metadata=List[Dict[str, Any]])
+    @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
     def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
         """
         Invoke the text generation inference for the given prompt and generation parameters.
@@ -204,15 +204,15 @@ class HuggingFaceTGIGenerator:
             chunks.append(stream_chunk)
             self.streaming_callback(stream_chunk)  # type: ignore # streaming_callback is not None (verified in the run method)
         metadata = {
-            "finish_reason": chunks[-1].metadata.get("finish_reason", None),
+            "finish_reason": chunks[-1].meta.get("finish_reason", None),
             "model": self.client.model,
             "usage": {
-                "completion_tokens": chunks[-1].metadata.get("generated_tokens", 0),
+                "completion_tokens": chunks[-1].meta.get("generated_tokens", 0),
                 "prompt_tokens": prompt_token_count,
-                "total_tokens": prompt_token_count + chunks[-1].metadata.get("generated_tokens", 0),
+                "total_tokens": prompt_token_count + chunks[-1].meta.get("generated_tokens", 0),
             },
         }
-        return {"replies": ["".join([chunk.content for chunk in chunks])], "metadata": [metadata]}
+        return {"replies": ["".join([chunk.content for chunk in chunks])], "meta": [metadata]}
     def _run_non_streaming(
         self, prompt: str, prompt_token_count: int, num_responses: int, generation_kwargs: Dict[str, Any]
@@ -234,4 +234,4 @@ class HuggingFaceTGIGenerator:
             }
         )
         responses.append(tgr.generated_text)
-        return {"replies": responses, "metadata": all_metadata}
+        return {"replies": responses, "meta": all_metadata}

View File

@@ -37,7 +37,7 @@ class GPTGenerator:
    >> {'replies': ['Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on
    >> the interaction between computers and human language. It involves enabling computers to understand, interpret,
-   >> and respond to natural human language in a way that is both meaningful and useful.'], 'metadata': [{'model':
+   >> and respond to natural human language in a way that is both meaningful and useful.'], 'meta': [{'model':
    >> 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 16,
    >> 'completion_tokens': 49, 'total_tokens': 65}}]}
    ```
@@ -146,7 +146,7 @@ class GPTGenerator:
        data["init_parameters"]["streaming_callback"] = deserialize_callback_handler(serialized_callback_handler)
        return default_from_dict(cls, data)
-   @component.output_types(replies=List[str], metadata=List[Dict[str, Any]])
+   @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
    def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.
@@ -200,7 +200,7 @@ class GPTGenerator:
        return {
            "replies": [message.content for message in completions],
-           "metadata": [message.metadata for message in completions],
+           "meta": [message.meta for message in completions],
        }
    def _convert_to_openai_format(self, messages: List[ChatMessage]) -> List[Dict[str, Any]]:
@@ -222,7 +222,7 @@ class GPTGenerator:
        Connects the streaming chunks into a single ChatMessage.
        """
        complete_response = ChatMessage.from_assistant("".join([chunk.content for chunk in chunks]))
-       complete_response.metadata.update(
+       complete_response.meta.update(
           {
               "model": chunk.model,
               "index": 0,
@@ -242,7 +242,7 @@ class GPTGenerator:
        message: OpenAIObject = choice.message
        content = dict(message.function_call) if choice.finish_reason == "function_call" else message.content
        chat_message = ChatMessage.from_assistant(content)
-       chat_message.metadata.update(
+       chat_message.meta.update(
           {
               "model": completion.model,
               "index": choice.index,
@@ -267,9 +267,7 @@ class GPTGenerator:
        else:
            content = ""
        chunk_message = StreamingChunk(content)
-       chunk_message.metadata.update(
-           {"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason}
-       )
+       chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason})
        return chunk_message

    def _check_finish_reason(self, message: ChatMessage) -> None:
@@ -278,13 +276,13 @@ class GPTGenerator:
        If the `finish_reason` is `length`, log a warning to the user.
        :param message: The message returned by the LLM.
        """
-       if message.metadata["finish_reason"] == "length":
+       if message.meta["finish_reason"] == "length":
           logger.warning(
               "The completion for index %s has been truncated before reaching a natural stopping point. "
               "Increase the max_tokens parameter to allow for longer completions.",
-              message.metadata["index"],
+              message.meta["index"],
           )
-       if message.metadata["finish_reason"] == "content_filter":
+       if message.meta["finish_reason"] == "content_filter":
           logger.warning(
-              "The completion for index %s has been truncated due to the content filter.", message.metadata["index"]
+              "The completion for index %s has been truncated due to the content filter.", message.meta["index"]
           )

View File

@@ -58,7 +58,7 @@ class FileTypeRouter:
             if isinstance(source, Path):
                 mime_type = self.get_mime_type(source)
             elif isinstance(source, ByteStream):
-                mime_type = source.metadata.get("content_type")
+                mime_type = source.meta.get("content_type")
             else:
                 raise ValueError(f"Unsupported data source type: {type(source)}")

View File

@@ -10,7 +10,7 @@ class ByteStream:
     """
     data: bytes
-    metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
+    meta: Dict[str, Any] = field(default_factory=dict, hash=False)
     mime_type: Optional[str] = field(default=None)

     def to_file(self, destination_path: Path):
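The `ByteStream` field rename is the root of most changes in this commit. A quick sketch of the new construction follows; the import path is inferred from the test files later in this diff, and the example values are illustrative:

```python
from haystack.dataclasses import ByteStream  # import path inferred from the tests in this commit

# The keyword was `metadata=` before this commit; the attribute was `.metadata`.
stream = ByteStream(data=b"Some text for testing.", meta={"file_path": "doc_1.txt"})
stream.meta["encoding"] = "utf-8"
print(stream.meta)  # {'file_path': 'doc_1.txt', 'encoding': 'utf-8'}
```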

View File

@@ -20,13 +20,13 @@ class ChatMessage:
     :param content: The text content of the message.
     :param role: The role of the entity sending the message.
     :param name: The name of the function being called (only applicable for role FUNCTION).
-    :param metadata: Additional metadata associated with the message.
+    :param meta: Additional metadata associated with the message.
     """
     content: str
     role: ChatRole
     name: Optional[str]
-    metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
+    meta: Dict[str, Any] = field(default_factory=dict, hash=False)

     def is_from(self, role: ChatRole) -> bool:
         """
@@ -38,15 +38,15 @@ class ChatMessage:
         return self.role == role

     @classmethod
-    def from_assistant(cls, content: str, metadata: Optional[Dict[str, Any]] = None) -> "ChatMessage":
+    def from_assistant(cls, content: str, meta: Optional[Dict[str, Any]] = None) -> "ChatMessage":
         """
         Create a message from the assistant.
         :param content: The text content of the message.
-        :param metadata: Additional metadata associated with the message.
+        :param meta: Additional metadata associated with the message.
         :return: A new ChatMessage instance.
         """
-        return cls(content, ChatRole.ASSISTANT, None, metadata or {})
+        return cls(content, ChatRole.ASSISTANT, None, meta or {})

     @classmethod
     def from_user(cls, content: str) -> "ChatMessage":

View File

@@ -10,8 +10,8 @@ class StreamingChunk:
     streamed data in a systematic manner.
     :param content: The content of the message chunk as a string.
-    :param metadata: A dictionary containing metadata related to the message chunk.
+    :param meta: A dictionary containing metadata related to the message chunk.
     """
     content: str
-    metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
+    meta: Dict[str, Any] = field(default_factory=dict, hash=False)

View File

@@ -67,7 +67,7 @@ class _RAGPipeline:
         self.pipeline.connect("retriever", "prompt_builder.documents")
         self.pipeline.connect("prompt_builder.prompt", "llm.prompt")
         self.pipeline.connect("llm.replies", "answer_builder.replies")
-        self.pipeline.connect("llm.metadata", "answer_builder.metadata")
+        self.pipeline.connect("llm.meta", "answer_builder.meta")
         self.pipeline.connect("retriever", "answer_builder.documents")

     def run(self, query: str) -> Answer:

View File

@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Rename all metadata references to meta.
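In practice the rename is mechanical but breaking. A hedged smoke test of the renamed dataclass fields is sketched below; the `StreamingChunk` import path appears in the tests in this commit, and `ChatMessage` is assumed to live alongside it:

```python
from haystack.dataclasses import ChatMessage, StreamingChunk

# Both dataclasses now expose `meta`; the old `metadata` field is gone.
chunk = StreamingChunk(content="Hello", meta={"model": "illustrative-model"})
message = ChatMessage.from_assistant(chunk.content, meta=chunk.meta)

assert message.meta["model"] == "illustrative-model"
assert not hasattr(chunk, "metadata")
```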

View File

@@ -125,7 +125,7 @@ class TestLocalWhisperTranscriber:
         }
         path = SAMPLES_PATH / "audio" / "this is the content of the document.wav"
         bs = ByteStream.from_file_path(path)
-        bs.metadata["file_path"] = path
+        bs.meta["file_path"] = path
         results = comp.transcribe(sources=[bs])
         expected = Document(
             content="test transcription", meta={"audio_file": path, "other_metadata": ["other", "meta", "data"]}

View File

@@ -210,7 +210,7 @@ class TestRemoteWhisperTranscriber:
         transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
         with open(file_path, "rb") as audio_stream:
             byte_stream = audio_stream.read()
-            audio_file = ByteStream(byte_stream, metadata={"file_path": str(file_path.absolute())})
+            audio_file = ByteStream(byte_stream, meta={"file_path": str(file_path.absolute())})
             result = transcriber.run(sources=[audio_file])

View File

@@ -10,7 +10,7 @@ class TestAnswerBuilder:
     def test_run_unmatching_input_len(self):
         component = AnswerBuilder()
         with pytest.raises(ValueError):
-            component.run(query="query", replies=["reply1"], metadata=[{"test": "meta"}, {"test": "meta2"}])
+            component.run(query="query", replies=["reply1"], meta=[{"test": "meta"}, {"test": "meta2"}])

     def test_run_without_meta(self):
         component = AnswerBuilder()
@@ -24,7 +24,7 @@
     def test_run_meta_is_an_empty_list(self):
         component = AnswerBuilder()
-        output = component.run(query="query", replies=["reply1"], metadata=[])
+        output = component.run(query="query", replies=["reply1"], meta=[])
         answers = output["answers"]
         assert answers[0].data == "reply1"
         assert answers[0].meta == {}
@@ -34,7 +34,7 @@
     def test_run_without_pattern(self):
         component = AnswerBuilder()
-        output = component.run(query="test query", replies=["Answer: AnswerString"], metadata=[{}])
+        output = component.run(query="test query", replies=["Answer: AnswerString"], meta=[{}])
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "Answer: AnswerString"
@@ -45,7 +45,7 @@
     def test_run_with_pattern_with_capturing_group(self):
         component = AnswerBuilder(pattern=r"Answer: (.*)")
-        output = component.run(query="test query", replies=["Answer: AnswerString"], metadata=[{}])
+        output = component.run(query="test query", replies=["Answer: AnswerString"], meta=[{}])
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "AnswerString"
@@ -56,7 +56,7 @@
     def test_run_with_pattern_without_capturing_group(self):
         component = AnswerBuilder(pattern=r"'.*'")
-        output = component.run(query="test query", replies=["Answer: 'AnswerString'"], metadata=[{}])
+        output = component.run(query="test query", replies=["Answer: 'AnswerString'"], meta=[{}])
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "'AnswerString'"
@@ -71,9 +71,7 @@
     def test_run_with_pattern_set_at_runtime(self):
         component = AnswerBuilder(pattern="unused pattern")
-        output = component.run(
-            query="test query", replies=["Answer: AnswerString"], metadata=[{}], pattern=r"Answer: (.*)"
-        )
+        output = component.run(query="test query", replies=["Answer: AnswerString"], meta=[{}], pattern=r"Answer: (.*)")
         answers = output["answers"]
         assert len(answers) == 1
         assert answers[0].data == "AnswerString"
@@ -87,7 +85,7 @@
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2")],
         )
         answers = output["answers"]
@@ -104,7 +102,7 @@
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString[2]"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2")],
         )
         answers = output["answers"]
@@ -121,7 +119,7 @@
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString[3]"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2")],
         )
         answers = output["answers"]
@@ -137,7 +135,7 @@
         output = component.run(
             query="test query",
             replies=["Answer: AnswerString[2][3]"],
-            metadata=[{}],
+            meta=[{}],
             documents=[Document(content="test doc 1"), Document(content="test doc 2"), Document(content="test doc 3")],
             reference_pattern="\\[(\\d+)\\]",
         )

View File

@@ -45,7 +45,7 @@ class TestAzureOCRDocumentConverter:
         }

     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
         with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
             component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")

View File

@@ -63,7 +63,7 @@ class TestHTMLToDocument:
         converter = HTMLToDocument()
         with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
             byte_stream = file.read()
-            stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"})
+            stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url"})
         results = converter.run(sources=[stream])
         docs = results["documents"]
@@ -81,7 +81,7 @@
         converter = HTMLToDocument()
         with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
             byte_stream = file.read()
-            stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"})
+            stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url"})
         metadata = [{"file_name": "what_is_haystack.html"}]
         results = converter.run(sources=[stream], meta=metadata)
@@ -103,7 +103,7 @@
         with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
             byte_stream = file.read()
             # ByteStream has "url" present in metadata
-            stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url_correct"})
+            stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url_correct"})
         # "url" supplied by the user overwrites value present in metadata
         metadata = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}]

View File

@@ -32,7 +32,7 @@ class TestMarkdownToDocument:
         assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content

     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
         converter = MarkdownToDocument()

View File

@@ -30,7 +30,7 @@ class TestPyPDFToDocument:
         assert "ReAct" in docs[0].content

     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
         converter = PyPDFToDocument()
         with patch("haystack.components.converters.pypdf.PdfReader"):

View File

@@ -14,8 +14,8 @@ class TestTextfileToDocument:
         Test if the component runs correctly.
         """
         bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt")
-        bytestream.metadata["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
-        bytestream.metadata["key"] = "value"
+        bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt")
+        bytestream.meta["key"] = "value"
         files = [str(test_files_path / "txt" / "doc_1.txt"), test_files_path / "txt" / "doc_2.txt", bytestream]
         converter = TextFileToDocument()
         output = converter.run(sources=files)
@@ -26,7 +26,7 @@
         assert "That's yet another file!" in docs[2].content
         assert docs[0].meta["file_path"] == str(files[0])
         assert docs[1].meta["file_path"] == str(files[1])
-        assert docs[2].meta == bytestream.metadata
+        assert docs[2].meta == bytestream.meta

     def test_run_error_handling(self, test_files_path, caplog):
         """
@@ -47,18 +47,18 @@
         Test if the encoding metadata field is used properly
         """
         bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_1.txt")
-        bytestream.metadata["key"] = "value"
+        bytestream.meta["key"] = "value"
         converter = TextFileToDocument(encoding="utf-16")
         output = converter.run(sources=[bytestream])
         assert "Some text for testing." not in output["documents"][0].content
-        bytestream.metadata["encoding"] = "utf-8"
+        bytestream.meta["encoding"] = "utf-8"
         output = converter.run(sources=[bytestream])
         assert "Some text for testing." in output["documents"][0].content

     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
         converter = TextFileToDocument()

View File

@@ -19,7 +19,7 @@ class TestTikaDocumentConverter:
         assert documents[0].content == "Content of mock source"

     def test_run_with_meta(self):
-        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
         converter = TikaDocumentConverter()
         with patch("haystack.components.converters.tika.tika_parser.from_buffer"):

View File

@@ -67,7 +67,7 @@ class TestLinkContentFetcher:
         streams = fetcher.run(urls=["https://www.example.com"])["streams"]
         first_stream = streams[0]
         assert first_stream.data == correct_response
-        assert first_stream.metadata["content_type"] == "text/plain"
+        assert first_stream.meta["content_type"] == "text/plain"

     def test_run_html(self):
         correct_response = b"<h1>Example test response</h1>"
@@ -79,7 +79,7 @@
         streams = fetcher.run(urls=["https://www.example.com"])["streams"]
         first_stream = streams[0]
         assert first_stream.data == correct_response
-        assert first_stream.metadata["content_type"] == "text/html"
+        assert first_stream.meta["content_type"] == "text/html"

     def test_run_binary(self, test_files_path):
         file_bytes = open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read()
@@ -91,7 +91,7 @@
         streams = fetcher.run(urls=["https://www.example.com"])["streams"]
         first_stream = streams[0]
         assert first_stream.data == file_bytes
-        assert first_stream.metadata["content_type"] == "application/pdf"
+        assert first_stream.meta["content_type"] == "application/pdf"

     def test_run_bad_status_code(self):
         empty_byte_stream = b""
@@ -105,7 +105,7 @@
         assert len(streams) == 1
         first_stream = streams[0]
         assert first_stream.data == empty_byte_stream
-        assert first_stream.metadata["content_type"] == "text/html"
+        assert first_stream.meta["content_type"] == "text/html"

     @pytest.mark.integration
     def test_link_content_fetcher_html(self):
@@ -113,8 +113,8 @@
         streams = fetcher.run([HTML_URL])["streams"]
         first_stream = streams[0]
         assert "Haystack" in first_stream.data.decode("utf-8")
-        assert first_stream.metadata["content_type"] == "text/html"
-        assert "url" in first_stream.metadata and first_stream.metadata["url"] == HTML_URL
+        assert first_stream.meta["content_type"] == "text/html"
+        assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL

     @pytest.mark.integration
     def test_link_content_fetcher_text(self):
@@ -122,8 +122,8 @@
         streams = fetcher.run([TEXT_URL])["streams"]
         first_stream = streams[0]
         assert "Haystack" in first_stream.data.decode("utf-8")
-        assert first_stream.metadata["content_type"] == "text/plain"
-        assert "url" in first_stream.metadata and first_stream.metadata["url"] == TEXT_URL
+        assert first_stream.meta["content_type"] == "text/plain"
+        assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL

     @pytest.mark.integration
     def test_link_content_fetcher_pdf(self):
@@ -131,8 +131,8 @@
         streams = fetcher.run([PDF_URL])["streams"]
         assert len(streams) == 1
         first_stream = streams[0]
-        assert first_stream.metadata["content_type"] in ("application/octet-stream", "application/pdf")
-        assert "url" in first_stream.metadata and first_stream.metadata["url"] == PDF_URL
+        assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf")
+        assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL

     @pytest.mark.integration
     def test_link_content_fetcher_multiple_different_content_types(self):
@@ -143,10 +143,10 @@
         streams = fetcher.run([PDF_URL, HTML_URL])["streams"]
         assert len(streams) == 2
         for stream in streams:
-            assert stream.metadata["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
-            if stream.metadata["content_type"] == "text/html":
+            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
+            if stream.meta["content_type"] == "text/html":
                 assert "Haystack" in stream.data.decode("utf-8")
-            elif stream.metadata["content_type"] == "application/pdf":
+            elif stream.meta["content_type"] == "application/pdf":
                 assert len(stream.data) > 0

     @pytest.mark.integration
@@ -160,10 +160,10 @@
         streams = fetcher.run([PDF_URL, HTML_URL, "https://google.com"])["streams"]
         assert len(streams) == 3
         for stream in streams:
-            assert stream.metadata["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
-            if stream.metadata["content_type"] == "text/html":
+            assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
+            if stream.meta["content_type"] == "text/html":
                 assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
-            elif stream.metadata["content_type"] == "application/pdf":
+            elif stream.meta["content_type"] == "application/pdf":
                 assert len(stream.data) > 0

     @pytest.mark.integration
@@ -177,7 +177,7 @@
         result = fetcher.run(["https://non_existent_website_dot.com/", "https://www.google.com/"])
         assert len(result["streams"]) == 1
         first_stream = result["streams"][0]
-        assert first_stream.metadata["content_type"] == "text/html"
+        assert first_stream.meta["content_type"] == "text/html"

     @pytest.mark.integration
     def test_bad_request_exception_raised(self):

View File

@@ -241,7 +241,7 @@ class TestGPTChatGenerator:
         component = GPTChatGenerator(api_key="test-api-key")
         messages = [
             ChatMessage.from_assistant(
-                "", metadata={"finish_reason": "content_filter" if i % 2 == 0 else "length", "index": i}
+                "", meta={"finish_reason": "content_filter" if i % 2 == 0 else "length", "index": i}
             )
             for i, _ in enumerate(range(4))
         ]

View File

@@ -124,11 +124,11 @@ class TestHuggingFaceTGIGenerator:
         assert isinstance(response, dict)
         assert "replies" in response
-        assert "metadata" in response
+        assert "meta" in response
         assert isinstance(response["replies"], list)
-        assert isinstance(response["metadata"], list)
+        assert isinstance(response["meta"], list)
         assert len(response["replies"]) == 1
-        assert len(response["metadata"]) == 1
+        assert len(response["meta"]) == 1
         assert [isinstance(reply, str) for reply in response["replies"]]

     def test_generate_multiple_text_responses_with_valid_prompt_and_generation_parameters(
@@ -157,14 +157,14 @@
         assert isinstance(response, dict)
         assert "replies" in response
-        assert "metadata" in response
+        assert "meta" in response
         assert isinstance(response["replies"], list)
         assert [isinstance(reply, str) for reply in response["replies"]]
-        assert isinstance(response["metadata"], list)
+        assert isinstance(response["meta"], list)
         assert len(response["replies"]) == 3
-        assert len(response["metadata"]) == 3
-        assert [isinstance(reply, dict) for reply in response["metadata"]]
+        assert len(response["meta"]) == 3
+        assert [isinstance(reply, dict) for reply in response["meta"]]

     def test_initialize_with_invalid_model(self, mock_check_valid_model):
         model = "invalid_model"
@@ -200,9 +200,9 @@
         assert [isinstance(reply, str) for reply in response["replies"]]
         # Assert that the response contains the metadata
-        assert "metadata" in response
-        assert isinstance(response["metadata"], list)
-        assert len(response["metadata"]) > 0
+        assert "meta" in response
+        assert isinstance(response["meta"], list)
+        assert len(response["meta"]) > 0
         assert [isinstance(reply, dict) for reply in response["replies"]]

     def test_generate_text_with_custom_generation_parameters(
@@ -226,9 +226,9 @@
         assert response["replies"][0] == "I'm fine, thanks."
         # Assert that the response contains the metadata
-        assert "metadata" in response
-        assert isinstance(response["metadata"], list)
-        assert len(response["metadata"]) > 0
+        assert "meta" in response
+        assert isinstance(response["meta"], list)
+        assert len(response["meta"]) > 0
         assert [isinstance(reply, str) for reply in response["replies"]]

     def test_generate_text_with_streaming_callback(
@@ -278,7 +278,7 @@
         assert [isinstance(reply, str) for reply in response["replies"]]
         # Assert that the response contains the metadata
-        assert "metadata" in response
-        assert isinstance(response["metadata"], list)
-        assert len(response["metadata"]) > 0
+        assert "meta" in response
+        assert isinstance(response["meta"], list)
+        assert len(response["meta"]) > 0
         assert [isinstance(reply, dict) for reply in response["replies"]]

View File

@@ -242,7 +242,7 @@ class TestGPTGenerator:
         for i, _ in enumerate(range(4)):
             message = ChatMessage.from_assistant("Hello")
             metadata = {"finish_reason": "content_filter" if i % 2 == 0 else "length", "index": i}
-            message.metadata.update(metadata)
+            message.meta.update(metadata)
             messages.append(message)

         for m in messages:

View File

@@ -46,13 +46,13 @@ class TestFileTypeRouter:
         for path, mime_type in zip(file_paths, mime_types):
             stream = ByteStream(path.read_bytes())
-            stream.metadata["content_type"] = mime_type
+            stream.meta["content_type"] = mime_type
             byte_streams.append(stream)

         # add unclassified ByteStream
         bs = ByteStream(b"unclassified content")
-        bs.metadata["content_type"] = "unknown_type"
+        bs.meta["content_type"] = "unknown_type"
         byte_streams.append(bs)

         router = FileTypeRouter(mime_types=["text/plain", "audio/x-wav", "image/jpeg"])
@@ -75,7 +75,7 @@
         byte_stream_sources = []
         for path, mime_type in zip(file_paths, mime_types):
             stream = ByteStream(path.read_bytes())
-            stream.metadata["content_type"] = mime_type
+            stream.meta["content_type"] = mime_type
             byte_stream_sources.append(stream)

         mixed_sources = file_paths[:2] + byte_stream_sources[2:]

View File

@@ -4,25 +4,25 @@ from haystack.dataclasses import StreamingChunk
 def test_create_chunk_with_content_and_metadata():
-    chunk = StreamingChunk(content="Test content", metadata={"key": "value"})
+    chunk = StreamingChunk(content="Test content", meta={"key": "value"})
     assert chunk.content == "Test content"
-    assert chunk.metadata == {"key": "value"}
+    assert chunk.meta == {"key": "value"}

 def test_create_chunk_with_only_content():
     chunk = StreamingChunk(content="Test content")
     assert chunk.content == "Test content"
-    assert chunk.metadata == {}
+    assert chunk.meta == {}

 def test_access_content():
-    chunk = StreamingChunk(content="Test content", metadata={"key": "value"})
+    chunk = StreamingChunk(content="Test content", meta={"key": "value"})
     assert chunk.content == "Test content"

 def test_create_chunk_with_empty_content():
     chunk = StreamingChunk(content="")
     assert chunk.content == ""
-    assert chunk.metadata == {}
+    assert chunk.meta == {}