feat: Add ByteStream to_string method (#7009)

This commit is contained in:
Vladimir Blagojevic 2024-02-17 12:57:42 +01:00 committed by GitHub
parent 3f85a63468
commit 3ce6b9768e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 0 deletions

View File

@ -49,3 +49,13 @@ class ByteStream:
:param meta: Additional metadata to be stored with the ByteStream.
"""
return cls(data=text.encode(encoding), mime_type=mime_type, meta=meta or {})
def to_string(self, encoding: str = "utf-8") -> str:
    """
    Decode the raw bytes held in ``self.data`` into text; metadata is not part of the result.

    :param encoding: Name of the codec used to turn the bytes into a string. Defaults to "utf-8".
    :return: The decoded text.
    :raises UnicodeDecodeError: If the bytes cannot be decoded with the given encoding.
    """
    raw = self.data
    text = raw.decode(encoding)
    return text

View File

@ -1,3 +1,5 @@
import pytest
from haystack.dataclasses import ByteStream
@ -35,6 +37,30 @@ def test_from_string():
assert b.meta == {"foo": "bar"}
def test_to_string():
    """A string wrapped via from_string comes back unchanged from to_string (no explicit encoding)."""
    expected = "Hello, world!"
    stream = ByteStream.from_string(expected)
    decoded = stream.to_string()
    assert decoded == expected
def test_to_from_string_encoding():
    """Exercise from_string/to_string with text that does not fit into ISO-8859-1."""
    text = "Hello Baščaršija!"

    # Encoding has to fail: the text contains characters ISO-8859-1 cannot represent.
    with pytest.raises(UnicodeEncodeError):
        ByteStream.from_string(text, encoding="ISO-8859-1")

    stream = ByteStream.from_string(text)  # default encoding is utf-8

    # Decoding the utf-8 bytes with the wrong codec yields different text ...
    mismatched = stream.to_string(encoding="ISO-8859-1")
    assert mismatched != text

    # ... while decoding with the codec used for encoding restores the input.
    round_tripped = stream.to_string(encoding="utf-8")
    assert round_tripped == text
def test_to_string_encoding_error():
    """to_string raises UnicodeDecodeError when the stored bytes do not fit the requested encoding."""
    # "utf-16" is a valid codec name; the failure comes from the data itself:
    # the 13 single-byte characters form an odd-length byte string, which the
    # utf-16 decoder rejects. UnicodeDecodeError is a subclass of ValueError.
    stream = ByteStream.from_string("Hello, world!")
    with pytest.raises(UnicodeDecodeError):
        stream.to_string("utf-16")
def test_to_file(tmp_path, request):
test_str = "Hello, world!\n"
test_path = tmp_path / request.node.name