From c642695ec04c2f192b1f4fc0fd5ff03e74a0e7c8 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 14 Dec 2023 16:30:57 +0100 Subject: [PATCH] feat: Add FileTypeRouter markdown support (#6551) * Add FileTypeRouter markdown support * Add release note --- haystack/components/routers/file_type_router.py | 16 ++++++++++++++-- ...ile-type-router-support-39a607faa5c1436f.yaml | 4 ++++ test/components/routers/test_file_router.py | 6 ++++-- 3 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/add-markdown-file-type-router-support-39a607faa5c1436f.yaml diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py index c9a12d272..932472bf1 100644 --- a/haystack/components/routers/file_type_router.py +++ b/haystack/components/routers/file_type_router.py @@ -76,7 +76,10 @@ class FileTypeRouter: :param path: The file path to get the MIME type for. :return: The MIME type of the provided file path, or None if the MIME type cannot be determined. """ - return mimetypes.guess_type(path.as_posix())[0] + extension = path.suffix.lower() + mime_type = mimetypes.guess_type(path.as_posix())[0] + # lookup custom mappings if the mime type is not found + return self.get_custom_mime_mappings().get(extension, mime_type) def is_valid_mime_type_format(self, mime_type: str) -> bool: """ @@ -84,4 +87,13 @@ class FileTypeRouter: :param mime_type: The MIME type to check. :return: True if the provided MIME type is a valid MIME type format, False otherwise. """ - return mime_type in mimetypes.types_map.values() + return mime_type in mimetypes.types_map.values() or mime_type in self.get_custom_mime_mappings().values() + + @staticmethod + def get_custom_mime_mappings() -> Dict[str, str]: + """ + Returns a dictionary of custom file extension to MIME type mappings. 
+ """ + # we add markdown because it is not added by the mimetypes module + # see https://github.com/python/cpython/pull/17995 + return {".md": "text/markdown", ".markdown": "text/markdown"} diff --git a/releasenotes/notes/add-markdown-file-type-router-support-39a607faa5c1436f.yaml b/releasenotes/notes/add-markdown-file-type-router-support-39a607faa5c1436f.yaml new file mode 100644 index 000000000..e9cf3e7dd --- /dev/null +++ b/releasenotes/notes/add-markdown-file-type-router-support-39a607faa5c1436f.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Adds markdown mimetype support to the file type router i.e. `FileTypeRouter` class. diff --git a/test/components/routers/test_file_router.py b/test/components/routers/test_file_router.py index b2d630cd1..c9c868470 100644 --- a/test/components/routers/test_file_router.py +++ b/test/components/routers/test_file_router.py @@ -69,8 +69,9 @@ class TestFileTypeRouter: test_files_path / "audio" / "the context for this answer is here.wav", test_files_path / "txt" / "doc_2.txt", test_files_path / "images" / "apple.jpg", + test_files_path / "markdown" / "sample.md", ] - mime_types = ["text/plain", "audio/x-wav", "text/plain", "image/jpeg"] + mime_types = ["text/plain", "audio/x-wav", "text/plain", "image/jpeg", "text/markdown"] byte_stream_sources = [] for path, mime_type in zip(file_paths, mime_types): stream = ByteStream(path.read_bytes()) @@ -79,11 +80,12 @@ class TestFileTypeRouter: mixed_sources = file_paths[:2] + byte_stream_sources[2:] - router = FileTypeRouter(mime_types=["text/plain", "audio/x-wav", "image/jpeg"]) + router = FileTypeRouter(mime_types=["text/plain", "audio/x-wav", "image/jpeg", "text/markdown"]) output = router.run(sources=mixed_sources) assert len(output["text/plain"]) == 2 assert len(output["audio/x-wav"]) == 1 assert len(output["image/jpeg"]) == 1 + assert len(output["text/markdown"]) == 1 def test_no_files(self): """