feat: Set ByteStream's mime_type attribute for web-based resources (#7681)

This commit is contained in:
Vladimir Blagojevic 2024-05-13 19:44:02 +02:00 committed by GitHub
parent 1d20ac3c5e
commit 811b93db91
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 14 additions and 6 deletions

View File

@ -151,6 +151,7 @@ class LinkContentFetcher:
for stream_metadata, stream in results: # type: ignore
if stream_metadata is not None and stream is not None:
stream.meta.update(stream_metadata)
stream.mime_type = stream.meta.get("content_type", None)
streams.append(stream)
return {"streams": streams}

View File

@ -90,7 +90,7 @@ class FileTypeRouter:
if isinstance(source, Path):
mime_type = self._get_mime_type(source)
elif isinstance(source, ByteStream):
mime_type = source.meta.get("content_type", None)
mime_type = source.mime_type
else:
raise ValueError(f"Unsupported data source type: {type(source).__name__}")

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
`LinkContentFetcher` now sets the `mime_type` attribute of each returned `ByteStream` from the fetched `content_type` metadata, and `FileTypeRouter` now routes `ByteStream` sources by their `mime_type` attribute instead of `meta["content_type"]`. This makes MIME type data consistently accessible on the `ByteStream` itself and simplifies handling and routing of different file types and document formats.

View File

@ -50,7 +50,7 @@ class TestFileTypeRouter:
byte_streams = []
for path, mime_type in zip(file_paths, mime_types):
stream = ByteStream(path.read_bytes())
stream.meta["content_type"] = mime_type
stream.mime_type = mime_type
byte_streams.append(stream)
# add unclassified ByteStream
@ -81,7 +81,7 @@ class TestFileTypeRouter:
byte_stream_sources = []
for path, mime_type in zip(file_paths, mime_types):
stream = ByteStream(path.read_bytes())
stream.meta["content_type"] = mime_type
stream.mime_type = mime_type
byte_stream_sources.append(stream)
mixed_sources = file_paths[:2] + byte_stream_sources[2:]
@ -165,9 +165,12 @@ class TestFileTypeRouter:
"""
Test if the component correctly matches mime types exactly, without regex patterns.
"""
txt_stream = ByteStream(io.BytesIO(b"Text file content"), meta={"content_type": "text/plain"})
jpg_stream = ByteStream(io.BytesIO(b"JPEG file content"), meta={"content_type": "image/jpeg"})
mp3_stream = ByteStream(io.BytesIO(b"MP3 file content"), meta={"content_type": "audio/mpeg"})
txt_stream = ByteStream(io.BytesIO(b"Text file content").read())
txt_stream.mime_type = "text/plain"
jpg_stream = ByteStream(io.BytesIO(b"JPEG file content").read())
jpg_stream.mime_type = "image/jpeg"
mp3_stream = ByteStream(io.BytesIO(b"MP3 file content").read())
mp3_stream.mime_type = "audio/mpeg"
byte_streams = [txt_stream, jpg_stream, mp3_stream]