mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-02-07 07:22:03 +00:00
* feat(bytestream): add guess_mime_type parameter * refactor(FileTypeRouter): refactor guess mimetype * feat(bytestream): add guess_mime_type to util * style(ruff): add trailing whitespace * fix: fix type annotation * test(file_type_router): add test for additional_mimetypes param * fix(file_type_router): non-existent file behavior * feat(file_type_router): add release notes * fix(file_type_router): remove unused logger * style: fix ruff formatting magic values * test(bytestream): handle windows/unix mimetype differences --------- Co-authored-by: Julian Risch <julian.risch@deepset.ai>
73 lines
2.6 KiB
Python
73 lines
2.6 KiB
Python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import pytest
|
|
|
|
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
from haystack.dataclasses import ByteStream
|
|
|
|
|
|
def test_normalize_metadata_None():
    """Check that ``None`` metadata yields one empty dict per source."""
    for_one_source = normalize_metadata(None, sources_count=1)
    assert for_one_source == [{}]

    for_three_sources = normalize_metadata(None, sources_count=3)
    assert for_three_sources == [{}, {}, {}]
|
|
|
|
|
|
def test_normalize_metadata_single_dict():
    """Check that a single metadata dict is repeated once per source."""
    once = normalize_metadata({"a": 1}, sources_count=1)
    assert once == [{"a": 1}]

    thrice = normalize_metadata({"a": 1}, sources_count=3)
    assert thrice == [{"a": 1}, {"a": 1}, {"a": 1}]
|
|
|
|
|
|
def test_normalize_metadata_list_of_right_size():
    """Check that a metadata list whose length equals ``sources_count`` comes back equal."""
    one_entry = normalize_metadata([{"a": 1}], sources_count=1)
    assert one_entry == [{"a": 1}]

    three_entries = normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=3)
    assert three_entries == [{"a": 1}, {"b": 2}, {"c": 3}]
|
|
|
|
|
|
def test_normalize_metadata_list_of_wrong_size():
    """Check that a metadata list whose length differs from ``sources_count`` is rejected.

    Both directions are exercised: a list shorter than the number of sources
    and a list longer than it. Each must raise ``ValueError`` with the
    length-mismatch message.
    """
    # Fewer metadata entries than sources.
    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        normalize_metadata([{"a": 1}], sources_count=3)

    # More metadata entries than sources. The call is made bare (no ``assert``)
    # so that a missing exception is reported by ``pytest.raises`` itself
    # rather than as an unrelated AssertionError on the return value.
    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=1)
|
|
|
|
|
|
def test_normalize_metadata_other_type():
    """Check that metadata passed as a tuple is rejected with ``ValueError``."""
    tuple_meta = ({"a": 1},)
    expected_message = "meta must be either None, a dictionary or a list of dictionaries."

    with pytest.raises(ValueError, match=expected_message):
        normalize_metadata(tuple_meta, sources_count=1)
|
|
|
|
|
|
def test_get_bytestream_from_path_object(tmp_path):
    """Check that a path object pointing at a text file is turned into a ``ByteStream``.

    The file is written under pytest's ``tmp_path`` and read back with
    ``guess_mime_type=True``; the resulting stream must carry the file's
    bytes, a ``text/plain`` MIME type and the file path in its metadata.
    """
    payload = b"hello world"
    file_path = tmp_path / "test.txt"
    file_path.write_bytes(payload)

    stream = get_bytestream_from_source(file_path, guess_mime_type=True)

    assert isinstance(stream, ByteStream)
    assert stream.data == payload
    assert stream.mime_type == "text/plain"
    # Only the suffix is checked because the temporary directory differs per run.
    recorded_path = stream.meta["file_path"]
    assert recorded_path.endswith("test.txt")
|
|
|
|
|
|
def test_get_bytestream_from_string_path(tmp_path):
    """Check that a file path given as ``str`` is turned into a ``ByteStream``.

    Same expectations as for a path object: the stream must carry the file's
    bytes, a ``text/plain`` MIME type and the file path in its metadata.
    """
    payload = b"hello world"
    file_path = tmp_path / "test.txt"
    file_path.write_bytes(payload)

    # The source is handed over as a plain string rather than a path object.
    path_as_text = str(file_path)
    stream = get_bytestream_from_source(path_as_text, guess_mime_type=True)

    assert isinstance(stream, ByteStream)
    assert stream.data == payload
    assert stream.mime_type == "text/plain"
    recorded_path = stream.meta["file_path"]
    assert recorded_path.endswith("test.txt")
|
|
|
|
|
|
def test_get_bytestream_from_source_invalid_type():
    """Check that an integer source is rejected with ``ValueError``."""
    unsupported_source = 123

    with pytest.raises(ValueError, match="Unsupported source type"):
        get_bytestream_from_source(unsupported_source)
|
|
|
|
|
|
def test_get_bytestream_from_source_bytestream_passthrough():
    """Check that an existing ``ByteStream`` is returned as the very same object."""
    original = ByteStream(data=b"spam", mime_type="text/custom", meta={"spam": "eggs"})

    # Identity, not equality: the function must hand back the input instance.
    assert get_bytestream_from_source(original) is original
|