Kane Norman 9420492798
feat: allow mime_type to be guessed for ByteStream (#9573)
* feat(bytestream): add guess_mime_type parameter

* refactor(FileTypeRouter): refactor guess mimetype

* feat(bytestream): add guess_mime_type to util

* style(ruff): add trailing whitespace

* fix: fix type annotation

* test(file_type_router): add test for additional_mimetypes param

* fix(file_type_router): non-existent file behavior

* feat(file_type_router): add release notes

* fix(file_type_router): remove unused logger

* style: fix ruff formatting magic values

* test(bytestream): handle windows/unix mimetype differences

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
2025-07-22 07:43:22 +00:00

73 lines
2.6 KiB
Python

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
def test_normalize_metadata_None():
    """Passing ``None`` as metadata yields one empty dict per source."""
    one_source = normalize_metadata(None, sources_count=1)
    assert one_source == [{}]

    three_sources = normalize_metadata(None, sources_count=3)
    assert three_sources == [{}, {}, {}]
def test_normalize_metadata_single_dict():
    """A single metadata dict is repeated once for every source."""
    one_source = normalize_metadata({"a": 1}, sources_count=1)
    assert one_source == [{"a": 1}]

    three_sources = normalize_metadata({"a": 1}, sources_count=3)
    assert three_sources == [{"a": 1}, {"a": 1}, {"a": 1}]
def test_normalize_metadata_list_of_right_size():
    """A metadata list whose length equals the source count comes back with equal contents."""
    single = normalize_metadata([{"a": 1}], sources_count=1)
    assert single == [{"a": 1}]

    triple = normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=3)
    assert triple == [{"a": 1}, {"b": 2}, {"c": 3}]
def test_normalize_metadata_list_of_wrong_size():
    """A metadata list whose length differs from the source count raises ``ValueError``.

    Both directions are checked: fewer entries than sources and more entries
    than sources.
    """
    # ``match`` is applied as a regular-expression search against the message.
    expected_error = "The length of the metadata list must match the number of sources."

    # Too few metadata entries for the number of sources.
    with pytest.raises(ValueError, match=expected_error):
        normalize_metadata([{"a": 1}], sources_count=3)

    # Too many metadata entries for the number of sources. The call itself is
    # expected to raise, so no ``assert`` on its return value is needed.
    with pytest.raises(ValueError, match=expected_error):
        normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=1)
def test_normalize_metadata_other_type():
    """Metadata given as a tuple (not None, dict, or list) raises ``ValueError``."""
    tuple_meta = ({"a": 1},)
    expected_error = "meta must be either None, a dictionary or a list of dictionaries."
    with pytest.raises(ValueError, match=expected_error):
        normalize_metadata(tuple_meta, sources_count=1)
def test_get_bytestream_from_path_object(tmp_path):
    """A path object pointing at a ``.txt`` file is loaded into a ByteStream.

    With ``guess_mime_type=True`` the test expects ``text/plain`` and a
    ``file_path`` meta entry ending in the file name.
    """
    payload = b"hello world"
    txt_file = tmp_path / "test.txt"
    txt_file.write_bytes(payload)

    stream = get_bytestream_from_source(txt_file, guess_mime_type=True)

    assert isinstance(stream, ByteStream)
    assert stream.data == payload
    assert stream.mime_type == "text/plain"
    assert stream.meta["file_path"].endswith("test.txt")
def test_get_bytestream_from_string_path(tmp_path):
    """A string path pointing at a ``.txt`` file is loaded into a ByteStream.

    With ``guess_mime_type=True`` the test expects ``text/plain`` and a
    ``file_path`` meta entry ending in the file name.
    """
    payload = b"hello world"
    txt_file = tmp_path / "test.txt"
    txt_file.write_bytes(payload)
    path_as_str = str(txt_file)

    stream = get_bytestream_from_source(path_as_str, guess_mime_type=True)

    assert isinstance(stream, ByteStream)
    assert stream.data == payload
    assert stream.mime_type == "text/plain"
    assert stream.meta["file_path"].endswith("test.txt")
def test_get_bytestream_from_source_invalid_type():
    """An integer source is rejected with a ``ValueError``."""
    unsupported_source = 123
    with pytest.raises(ValueError, match="Unsupported source type"):
        get_bytestream_from_source(unsupported_source)
def test_get_bytestream_from_source_bytestream_passthrough():
    """An existing ByteStream is handed back as the very same object."""
    original = ByteStream(data=b"spam", mime_type="text/custom", meta={"spam": "eggs"})
    returned = get_bytestream_from_source(original)
    # Identity, not equality: the test expects the same object back.
    assert returned is original