mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-01 17:37:17 +00:00
implement additional mime types (#8446)
* implement additional mime types * correct typo * reduce complexity * add optional * add missing release note * yamllint * yamllint * Update file-router-additional-mime-types-47fe57e6816b83da.yaml minor reno change for consistency --------- Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
This commit is contained in:
parent
8613bb7653
commit
78f378b34d
@ -54,16 +54,24 @@ class FileTypeRouter:
|
|||||||
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
|
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, mime_types: List[str]):
|
def __init__(self, mime_types: List[str], additional_mimetypes: Optional[Dict[str, str]] = None):
|
||||||
"""
|
"""
|
||||||
Initialize the FileTypeRouter component.
|
Initialize the FileTypeRouter component.
|
||||||
|
|
||||||
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
|
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
|
||||||
(for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).
|
(for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).
|
||||||
|
|
||||||
|
:param additional_mimetypes: A mapping of MIME types to file extensions, registered with mimetypes to prevent
|
||||||
|
unsupported or non-native file types from being left unclassified.
|
||||||
|
(for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`).
|
||||||
"""
|
"""
|
||||||
if not mime_types:
|
if not mime_types:
|
||||||
raise ValueError("The list of mime types cannot be empty.")
|
raise ValueError("The list of mime types cannot be empty.")
|
||||||
|
|
||||||
|
if additional_mimetypes:
|
||||||
|
for mime, ext in additional_mimetypes.items():
|
||||||
|
mimetypes.add_type(mime, ext)
|
||||||
|
|
||||||
self.mime_type_patterns = []
|
self.mime_type_patterns = []
|
||||||
for mime_type in mime_types:
|
for mime_type in mime_types:
|
||||||
if not self._is_valid_mime_type_format(mime_type):
|
if not self._is_valid_mime_type_format(mime_type):
|
||||||
|
|||||||
@ -0,0 +1,22 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
Added a new parameter `additional_mimetypes` to the FileTypeRouter
|
||||||
|
component.
|
||||||
|
|
||||||
|
This allows users to specify additional MIME type mappings, ensuring
|
||||||
|
correct file classification across different runtime environments and
|
||||||
|
Python versions.
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
Improved file type detection in FileTypeRouter, particularly for Microsoft
|
||||||
|
Office file formats like .docx and .pptx. This enhancement ensures more
|
||||||
|
consistent behavior across different environments, including AWS Lambda
|
||||||
|
functions and systems without pre-installed office suites.
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Addressed an issue where certain file types (e.g., .docx, .pptx) were
|
||||||
|
incorrectly classified as 'unclassified' in environments with limited
|
||||||
|
MIME type definitions, such as AWS Lambda functions.
|
||||||
Loading…
x
Reference in New Issue
Block a user