From 811b93db918f39aa81b48faac0b1622e0922b81d Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 13 May 2024 19:44:02 +0200 Subject: [PATCH] feat: Set ByteStream's mime_type attribute for web based resources (#7681) --- haystack/components/fetchers/link_content.py | 1 + haystack/components/routers/file_type_router.py | 2 +- ...nhanced-mime-type-handling-182fb64a0f5fb852.yaml | 4 ++++ test/components/routers/test_file_router.py | 13 ++++++++----- 4 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 releasenotes/notes/enhanced-mime-type-handling-182fb64a0f5fb852.yaml diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index 00cbdeb66..0d86ff852 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -151,6 +151,7 @@ class LinkContentFetcher: for stream_metadata, stream in results: # type: ignore if stream_metadata is not None and stream is not None: stream.meta.update(stream_metadata) + stream.mime_type = stream.meta.get("content_type", None) streams.append(stream) return {"streams": streams} diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py index 08c615c7e..fdf8830c9 100644 --- a/haystack/components/routers/file_type_router.py +++ b/haystack/components/routers/file_type_router.py @@ -90,7 +90,7 @@ class FileTypeRouter: if isinstance(source, Path): mime_type = self._get_mime_type(source) elif isinstance(source, ByteStream): - mime_type = source.meta.get("content_type", None) + mime_type = source.mime_type else: raise ValueError(f"Unsupported data source type: {type(source).__name__}") diff --git a/releasenotes/notes/enhanced-mime-type-handling-182fb64a0f5fb852.yaml b/releasenotes/notes/enhanced-mime-type-handling-182fb64a0f5fb852.yaml new file mode 100644 index 000000000..c0e7d0744 --- /dev/null +++ b/releasenotes/notes/enhanced-mime-type-handling-182fb64a0f5fb852.yaml @@ 
-0,0 +1,4 @@ +--- +enhancements: + - | + LinkContentFetcher now sets the mime_type attribute of every ByteStream it returns from the fetched content_type metadata, and FileTypeRouter routes ByteStream sources by their mime_type attribute instead of meta["content_type"]. ByteStreams that only carry meta["content_type"] should also set mime_type so that FileTypeRouter can classify them. diff --git a/test/components/routers/test_file_router.py b/test/components/routers/test_file_router.py index b64be6633..32d1e99dd 100644 --- a/test/components/routers/test_file_router.py +++ b/test/components/routers/test_file_router.py @@ -50,7 +50,7 @@ class TestFileTypeRouter: byte_streams = [] for path, mime_type in zip(file_paths, mime_types): stream = ByteStream(path.read_bytes()) - stream.meta["content_type"] = mime_type + stream.mime_type = mime_type byte_streams.append(stream) # add unclassified ByteStream @@ -81,7 +81,7 @@ class TestFileTypeRouter: byte_stream_sources = [] for path, mime_type in zip(file_paths, mime_types): stream = ByteStream(path.read_bytes()) - stream.meta["content_type"] = mime_type + stream.mime_type = mime_type byte_stream_sources.append(stream) mixed_sources = file_paths[:2] + byte_stream_sources[2:] @@ -165,9 +165,12 @@ class TestFileTypeRouter: """ Test if the component correctly matches mime types exactly, without regex patterns. """ - txt_stream = ByteStream(io.BytesIO(b"Text file content"), meta={"content_type": "text/plain"}) - jpg_stream = ByteStream(io.BytesIO(b"JPEG file content"), meta={"content_type": "image/jpeg"}) - mp3_stream = ByteStream(io.BytesIO(b"MP3 file content"), meta={"content_type": "audio/mpeg"}) + txt_stream = ByteStream(io.BytesIO(b"Text file content").read()) + txt_stream.mime_type = "text/plain" + jpg_stream = ByteStream(io.BytesIO(b"JPEG file content").read()) + jpg_stream.mime_type = "image/jpeg" + mp3_stream = ByteStream(io.BytesIO(b"MP3 file content").read()) + mp3_stream.mime_type = "audio/mpeg" byte_streams = [txt_stream, jpg_stream, mp3_stream]