mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-25 14:36:05 +00:00
implement additional mime types (#8446)
* implement additional mime types * correct typo * reduce complexity * add optional * add missing release note * yamllint * yamllint * Update file-router-additional-mime-types-47fe57e6816b83da.yaml minor reno change for consistency --------- Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
This commit is contained in:
parent
8613bb7653
commit
78f378b34d
@ -54,16 +54,24 @@ class FileTypeRouter:
|
||||
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
|
||||
"""
|
||||
|
||||
def __init__(self, mime_types: List[str]):
|
||||
def __init__(self, mime_types: List[str], additional_mimetypes: Optional[Dict[str, str]] = None):
|
||||
"""
|
||||
Initialize the FileTypeRouter component.
|
||||
|
||||
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
|
||||
(for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).
|
||||
|
||||
:param additional_mimetypes: A dictionary mapping MIME types to file extensions. Each pair is registered with
|
||||
the `mimetypes` module so that file types it does not recognize natively are not left unclassified.
|
||||
(for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`).
|
||||
"""
|
||||
if not mime_types:
|
||||
raise ValueError("The list of mime types cannot be empty.")
|
||||
|
||||
if additional_mimetypes:
|
||||
for mime, ext in additional_mimetypes.items():
|
||||
mimetypes.add_type(mime, ext)
|
||||
|
||||
self.mime_type_patterns = []
|
||||
for mime_type in mime_types:
|
||||
if not self._is_valid_mime_type_format(mime_type):
|
||||
|
||||
@ -0,0 +1,22 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Added a new parameter `additional_mimetypes` to the FileTypeRouter
|
||||
component.
|
||||
|
||||
This allows users to specify additional MIME type mappings, ensuring
|
||||
correct
|
||||
|
||||
file classification across different runtime environments and Python
|
||||
versions.
|
||||
enhancements:
|
||||
- |
|
||||
Improved file type detection in FileTypeRouter, particularly for Microsoft
|
||||
Office file formats like .docx and .pptx. This enhancement ensures more
|
||||
consistent behavior across different environments, including AWS Lambda
|
||||
functions and systems without pre-installed office suites.
|
||||
fixes:
|
||||
- |
|
||||
Addressed an issue where certain file types (e.g., .docx, .pptx) were
|
||||
incorrectly classified as 'unclassified' in environments with limited
|
||||
MIME type definitions, such as AWS Lambda functions.
|
||||
Loading…
x
Reference in New Issue
Block a user