diff --git a/CHANGELOG.md b/CHANGELOG.md index acb90d48e..0630abda7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.0-dev14 +## 0.15.0-dev15 ### Enhancements diff --git a/test_unstructured/file_utils/test_exploration.py b/test_unstructured/file_utils/test_exploration.py index 6456e6e16..7e38fbfd8 100644 --- a/test_unstructured/file_utils/test_exploration.py +++ b/test_unstructured/file_utils/test_exploration.py @@ -5,7 +5,7 @@ import pandas as pd import pytest from unstructured.file_utils import exploration -from unstructured.file_utils.filetype import FileType +from unstructured.file_utils.model import FileType DIRECTORY = pathlib.Path(__file__).parent.resolve() diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 8055fb2e7..af1857722 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -22,13 +22,13 @@ from test_unstructured.unit_utils import ( ) from unstructured.file_utils import filetype from unstructured.file_utils.filetype import ( - FileType, _detect_filetype_from_octet_stream, _is_code_mime_type, _is_text_file_a_csv, _is_text_file_a_json, detect_filetype, ) +from unstructured.file_utils.model import FileType is_in_docker = os.path.exists("/.dockerenv") diff --git a/test_unstructured/file_utils/test_model.py b/test_unstructured/file_utils/test_model.py new file mode 100644 index 000000000..b3a67f152 --- /dev/null +++ b/test_unstructured/file_utils/test_model.py @@ -0,0 +1,70 @@ +"""Test suite for `unstructured.file_utils.model`.""" + +from __future__ import annotations + +import pytest + +from unstructured.file_utils.model import FileType + + +class DescribeFileType: + """Unit-test suite for `unstructured.file_utils.model.FileType`.""" + + @pytest.mark.parametrize( + ("ext", "file_type"), + [ + (".bmp", FileType.BMP), + (".html", FileType.HTML), + (".eml", FileType.EML), + (".p7s", FileType.EML), + (".java", 
FileType.TXT), + ], + ) + def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType | None): + assert FileType.from_extension(ext) is file_type + + @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."]) + def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str): + assert FileType.from_extension(ext) is None + + @pytest.mark.parametrize( + ("mime_type", "file_type"), + [ + ("image/bmp", FileType.BMP), + ("text/x-csv", FileType.CSV), + ("application/msword", FileType.DOC), + ("message/rfc822", FileType.EML), + ("text/plain", FileType.TXT), + ("text/yaml", FileType.TXT), + ("application/xml", FileType.XML), + ("text/xml", FileType.XML), + ("inode/x-empty", FileType.EMPTY), + ], + ) + def it_can_recognize_a_file_type_from_a_mime_type( + self, mime_type: str, file_type: FileType | None + ): + assert FileType.from_mime_type(mime_type) is file_type + + @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"]) + def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str): + assert FileType.from_mime_type(mime_type) is None + + @pytest.mark.parametrize( + ("file_type", "mime_type"), + [ + (FileType.BMP, "image/bmp"), + (FileType.CSV, "text/csv"), + (FileType.DOC, "application/msword"), + (FileType.EML, "message/rfc822"), + (FileType.HTML, "text/html"), + (FileType.JPG, "image/jpeg"), + (FileType.PDF, "application/pdf"), + (FileType.TXT, "text/plain"), + (FileType.XML, "application/xml"), + (FileType.EMPTY, "inode/x-empty"), + (FileType.UNK, "application/octet-stream"), + ], + ) + def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str): + assert file_type.mime_type == mime_type diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 93caf3c68..7663d84ab 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -45,7 +45,7 @@ from 
unstructured.documents.elements import ( Text, Title, ) -from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType +from unstructured.file_utils.model import FileType from unstructured.partition import auto from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition from unstructured.partition.utils.constants import PartitionStrategy @@ -1245,7 +1245,7 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: assert elements assert all( - e.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] + e.metadata.filetype == filetype.mime_type for e in elements if e.metadata.filetype is not None ) diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index d5659bcb1..5b08a23e8 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -10,7 +10,8 @@ import pytest from pytest_mock import MockFixture from unstructured.documents.elements import CompositeElement -from unstructured.file_utils.filetype import FileType, detect_filetype +from unstructured.file_utils.filetype import detect_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.email import partition_email from unstructured.partition.html import partition_html from unstructured.partition.json import partition_json diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1097befad..a9a2902f8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.0-dev14" # pragma: no cover +__version__ = "0.15.0-dev15" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index ca7758d1e..9472a10b1 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -12,13 +12,7 @@ from typing_extensions import ParamSpec from unstructured.documents.elements import Element from 
unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str -from unstructured.file_utils.model import ( - EXT_TO_FILETYPE, - FILETYPE_TO_MIMETYPE, - PLAIN_TEXT_EXTENSIONS, - STR_TO_FILETYPE, - FileType, -) +from unstructured.file_utils.model import PLAIN_TEXT_EXTENSIONS, FileType from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( @@ -49,9 +43,9 @@ def detect_filetype( # first check (content_type) if content_type: - filetype = STR_TO_FILETYPE.get(content_type) - if filetype: - return filetype + file_type = FileType.from_mime_type(content_type) + if file_type: + return file_type # second check (filename/file_name/file) # continue if successfully define mime_type @@ -68,7 +62,7 @@ def detect_filetype( mime_type = ft.guess_mime(_filename) if mime_type is None: - return EXT_TO_FILETYPE.get(extension, FileType.UNK) + return FileType.from_extension(extension) or FileType.UNK elif file is not None: if hasattr(file, "name"): @@ -92,7 +86,7 @@ def detect_filetype( "libmagic is unavailable but assists in filetype detection on file-like objects. 
" "Please consider installing libmagic for better results.", ) - return EXT_TO_FILETYPE.get(extension, FileType.UNK) + return FileType.from_extension(extension) or FileType.UNK else: raise ValueError("No filename, file, nor file_filename were specified.") @@ -128,7 +122,7 @@ def detect_filetype( ".tsv", ".json", ]: - return EXT_TO_FILETYPE.get(extension) + return FileType.from_extension(extension) # NOTE(crag): for older versions of the OS libmagic package, such as is currently # installed on the Unstructured docker image, .json files resolve to "text/plain" @@ -151,11 +145,11 @@ def detect_filetype( return FileType.EML if extension in PLAIN_TEXT_EXTENSIONS: - return EXT_TO_FILETYPE.get(extension, FileType.UNK) + return FileType.from_extension(extension) or FileType.UNK # Safety catch - if mime_type in STR_TO_FILETYPE: - return STR_TO_FILETYPE[mime_type] + if file_type := FileType.from_mime_type(mime_type): + return file_type return FileType.TXT @@ -165,21 +159,22 @@ def detect_filetype( elif file: return _detect_filetype_from_octet_stream(file=file) else: - return EXT_TO_FILETYPE.get(extension, FileType.UNK) + return FileType.from_extension(extension) or FileType.UNK elif mime_type == "application/zip": - filetype = FileType.UNK + file_type = FileType.UNK if file: - filetype = _detect_filetype_from_octet_stream(file=file) + file_type = _detect_filetype_from_octet_stream(file=file) elif filename is not None: with open(filename, "rb") as f: - filetype = _detect_filetype_from_octet_stream(file=f) + file_type = _detect_filetype_from_octet_stream(file=f) extension = extension if extension else "" - if filetype == FileType.UNK: - return FileType.ZIP - else: - return EXT_TO_FILETYPE.get(extension, filetype) + return ( + FileType.ZIP + if file_type in (FileType.UNK, FileType.ZIP) + else FileType.from_extension(extension) or file_type + ) elif _is_code_mime_type(mime_type): # NOTE(robinson) - we'll treat all code files as plain text for now. 
@@ -191,14 +186,14 @@ def detect_filetype( return FileType.EMPTY # For everything else - elif mime_type in STR_TO_FILETYPE: - return STR_TO_FILETYPE[mime_type] + elif file_type := FileType.from_mime_type(mime_type): + return file_type logger.warning( f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. " "This file type is not currently supported in unstructured.", ) - return EXT_TO_FILETYPE.get(extension, FileType.UNK) + return FileType.from_extension(extension) or FileType.UNK def is_json_processable( @@ -260,7 +255,7 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType: # Infer mime type using magic if octet-stream is not zip file mime_type = magic.from_buffer(file.read(4096), mime=True) - return STR_TO_FILETYPE.get(mime_type, FileType.UNK) + return FileType.from_mime_type(mime_type) or FileType.UNK logger.warning( "Could not detect the filetype from application/octet-stream MIME type.", ) @@ -439,7 +434,7 @@ def add_filetype( # NOTE(robinson) - Attached files have already run through this logic # in their own partitioning function if element.metadata.attached_to_filename is None: - add_element_metadata(element, filetype=FILETYPE_TO_MIMETYPE[filetype]) + add_element_metadata(element, filetype=filetype.mime_type) return elements else: diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py index 8f7d50735..0ec769b7c 100644 --- a/unstructured/file_utils/model.py +++ b/unstructured/file_utils/model.py @@ -3,55 +3,36 @@ from __future__ import annotations import enum +from typing import Iterable, cast class FileType(enum.Enum): - UNK = 0 - EMPTY = 1 + """The collection of file-types recognized by `unstructured`. - # MS Office Types - DOC = 10 - DOCX = 11 - XLS = 12 - XLSX = 13 - PPT = 14 - PPTX = 15 - MSG = 16 + Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner. + """ - # Adobe Types - PDF = 20 + _extensions: tuple[str, ...] 
- # Image Types - JPG = 30 - PNG = 31 - TIFF = 32 - BMP = 33 - HEIC = 34 + _canonical_mime_type: str + """The MIME-type used as `.metadata.filetype` for this file-type.""" - # Plain Text Types - EML = 40 - RTF = 41 - TXT = 42 - JSON = 43 - CSV = 44 - TSV = 45 + _alias_mime_types: tuple[str, ...] + """MIME-types accepted as identifying this file-type.""" - # Markup Types - HTML = 50 - XML = 51 - MD = 52 - EPUB = 53 - RST = 54 - ORG = 55 - - # Compressed Types - ZIP = 60 - - # Open Office Types - ODT = 70 - - # Audio Files - WAV = 80 + def __new__( + cls, + value: str, + extensions: Iterable[str], + canonical_mime_type: str, + alias_mime_types: Iterable[str], + ): + self = object.__new__(cls) + self._value_ = value + self._extensions = tuple(extensions) + self._canonical_mime_type = canonical_mime_type + self._alias_mime_types = tuple(alias_mime_types) + return self def __lt__(self, other: FileType) -> bool: """Makes `FileType` members comparable with relational operators, at least with `<`. 
@@ -60,193 +41,148 @@ class FileType(enum.Enum): """ return self.name < other.name + BMP = ("bmp", [".bmp"], "image/bmp", cast(list[str], [])) + CSV = ( + "csv", + [".csv"], + "text/csv", + [ + "application/csv", + "application/x-csv", + "text/comma-separated-values", + "text/x-comma-separated-values", + "text/x-csv", + ], + ) + DOC = ("doc", [".doc"], "application/msword", cast(list[str], [])) + DOCX = ( + "docx", + [".docx"], + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + cast(list[str], []), + ) + EML = ("eml", [".eml", ".p7s"], "message/rfc822", cast(list[str], [])) + EPUB = ("epub", [".epub"], "application/epub", ["application/epub+zip"]) + HEIC = ("heic", [".heic"], "image/heic", cast(list[str], [])) + HTML = ("html", [".html", ".htm"], "text/html", cast(list[str], [])) + JPG = ("jpg", [".jpeg", ".jpg"], "image/jpeg", cast(list[str], [])) + JSON = ("json", [".json"], "application/json", cast(list[str], [])) + MD = ("md", [".md"], "text/markdown", ["text/x-markdown"]) + MSG = ("msg", [".msg"], "application/vnd.ms-outlook", ["application/x-ole-storage"]) + ODT = ("odt", [".odt"], "application/vnd.oasis.opendocument.text", cast(list[str], [])) + ORG = ("org", [".org"], "text/org", cast(list[str], [])) + PDF = ("pdf", [".pdf"], "application/pdf", cast(list[str], [])) + PNG = ("png", [".png"], "image/png", cast(list[str], [])) + PPT = ("ppt", [".ppt"], "application/vnd.ms-powerpoint", cast(list[str], [])) + PPTX = ( + "pptx", + [".pptx"], + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + cast(list[str], []), + ) + RST = ("rst", [".rst"], "text/x-rst", cast(list[str], [])) + RTF = ("rtf", [".rtf"], "text/rtf", ["application/rtf"]) + TIFF = ("tiff", [".tiff"], "image/tiff", cast(list[str], [])) + TSV = ("tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], [])) + TXT = ( + "txt", + [ + ".txt", + ".text", + # NOTE(robinson) - for now we are treating code files as plain text + ".c", + ".cc", + ".cpp", + 
".cs", + ".cxx", + ".go", + ".java", + ".js", + ".log", + ".php", + ".py", + ".rb", + ".swift", + ".ts", + ".yaml", + ".yml", + ], + "text/plain", + [ + # NOTE(robinson) - In the future, we may have special processing for YAML files + # instead of treating them as plaintext. + "text/yaml", + "application/x-yaml", + "application/yaml", + "text/x-yaml", + ], + ) + WAV = ( + "wav", + [".wav"], + "audio/wav", + [ + "audio/vnd.wav", + "audio/vnd.wave", + "audio/wave", + "audio/x-pn-wav", + "audio/x-wav", + ], + ) + XLS = ("xls", [".xls"], "application/vnd.ms-excel", cast(list[str], [])) + XLSX = ( + "xlsx", + [".xlsx"], + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + cast(list[str], []), + ) + XML = ("xml", [".xml"], "application/xml", ["text/xml"]) + ZIP = ("zip", [".zip"], "application/zip", cast(list[str], [])) -STR_TO_FILETYPE = { - # -- BMP -- - "image/bmp": FileType.BMP, - # -- CSV -- - "text/csv": FileType.CSV, - "application/csv": FileType.CSV, - "application/x-csv": FileType.CSV, - "text/comma-separated-values": FileType.CSV, - "text/x-comma-separated-values": FileType.CSV, - "text/x-csv": FileType.CSV, - # -- DOC -- - "application/msword": FileType.DOC, - # -- DOCX -- - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX, - # -- EML -- - "message/rfc822": FileType.EML, - # -- EPUB -- - "application/epub": FileType.EPUB, - "application/epub+zip": FileType.EPUB, - # -- HEIF -- - "image/heic": FileType.HEIC, - # -- HTML -- - "text/html": FileType.HTML, - # -- JPG -- - "image/jpeg": FileType.JPG, - # -- JSON -- - "application/json": FileType.JSON, - # -- MD -- - "text/markdown": FileType.MD, - "text/x-markdown": FileType.MD, - # -- MSG -- - "application/vnd.ms-outlook": FileType.MSG, - "application/x-ole-storage": FileType.MSG, - # -- ODT -- - "application/vnd.oasis.opendocument.text": FileType.ODT, - # -- ORG -- - "text/org": FileType.ORG, - # -- PDF -- - "application/pdf": FileType.PDF, - # -- PNG -- 
- "image/png": FileType.PNG, - # -- PPT -- - "application/vnd.ms-powerpoint": FileType.PPT, - # -- PPTX -- - "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX, - # -- RST -- - "text/x-rst": FileType.RST, - # -- RTF -- - "text/rtf": FileType.RTF, - "application/rtf": FileType.RTF, - # -- TIFF -- - "image/tiff": FileType.TIFF, - # -- TSV -- - "text/tsv": FileType.TSV, - # -- TXT -- - "text/plain": FileType.TXT, - # NOTE(robinson) - https://mimetype.io/application/yaml - # In the future, we may have special processing for YAML - # files instead of treating them as plaintext - "text/yaml": FileType.TXT, - "application/x-yaml": FileType.TXT, - "application/yaml": FileType.TXT, - "text/x-yaml": FileType.TXT, - # -- WAV -- - # NOTE(robinson) - https://mimetype.io/audio/wav - "audio/wav": FileType.WAV, - "audio/vnd.wav": FileType.WAV, - "audio/vnd.wave": FileType.WAV, - "audio/wave": FileType.WAV, - "audio/x-pn-wav": FileType.WAV, - "audio/x-wav": FileType.WAV, - # -- XLS -- - "application/vnd.ms-excel": FileType.XLS, - # -- XLSX -- - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX, - # -- XML -- - "application/xml": FileType.XML, - # -- EMPTY -- - "inode/x-empty": FileType.EMPTY, -} + UNK = ("unk", cast(list[str], []), "application/octet-stream", cast(list[str], [])) + EMPTY = ("empty", cast(list[str], []), "inode/x-empty", cast(list[str], [])) -# -- MIME-types in STR_TO_FILETYPE that are not the canonical MIME-type for that file-type -- -MIMETYPE_ALIASES = ( - "application/csv", - "application/epub+zip", - "application/rtf", - "application/x-csv", - "application/x-ole-storage", - "application/x-yaml", - "application/yaml", - "audio/vnd.wav", - "audio/vnd.wave", - "audio/wave", - "audio/x-pn-wav", - "audio/x-wav", - "text/comma-separated-values", - "text/x-comma-separated-values", - "text/x-csv", - "text/x-markdown", - "text/x-yaml", - "text/yaml", -) + @classmethod + def from_extension(cls, 
extension: str | None) -> FileType | None: + """Select a FileType member based on an extension. + + `extension` must include the leading period, like `".pdf"`. Extension is suitable as a + secondary file-type identification method but is unreliable for primary identification. + + Returns `None` when `extension` is not registered for any supported file-type. + """ + if extension in (None, "", "."): + return None + # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids + # -- limitations on defining a class variable on an Enum. + for m in cls.__members__.values(): + if extension in m._extensions: + return m + return None + + @classmethod + def from_mime_type(cls, mime_type: str) -> FileType | None: + """Select a FileType member based on a MIME-type. + + `mime_type` may be either the canonical MIME-type of a file-type or one of its registered + alias MIME-types. Returns `None` when `mime_type` is not registered for any file-type. + """ + # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids + # -- limitations on defining a class variable on an Enum. + for m in cls.__members__.values(): + if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types: + return m + return None + + @property + def mime_type(self) -> str: + """The canonical MIME-type for this file-type, suitable for use in metadata. + + This value is used in `.metadata.filetype` for elements partitioned from files of this + type. In general it is the "official", "recommended", or "defacto-standard" MIME-type for + files of this type, in that order, as available. 
+ """ + return self._canonical_mime_type -EXT_TO_FILETYPE = { - # -- BMP -- - ".bmp": FileType.BMP, - # -- CSV -- - ".csv": FileType.CSV, - # -- DOC -- - ".doc": FileType.DOC, - # -- DOCX -- - ".docx": FileType.DOCX, - # -- EML -- - ".eml": FileType.EML, - ".p7s": FileType.EML, - # -- EPUB -- - ".epub": FileType.EPUB, - # -- HEIC -- - ".heic": FileType.HEIC, - # -- HTML -- - ".htm": FileType.HTML, - ".html": FileType.HTML, - # -- JPG -- - ".jpeg": FileType.JPG, - ".jpg": FileType.JPG, - # -- JSON -- - ".json": FileType.JSON, - # -- MD -- - ".md": FileType.MD, - # -- MSG -- - ".msg": FileType.MSG, - # -- ODT -- - ".odt": FileType.ODT, - # -- ORG -- - ".org": FileType.ORG, - # -- PDF -- - ".pdf": FileType.PDF, - # -- PNG -- - ".png": FileType.PNG, - # -- PPT -- - ".ppt": FileType.PPT, - # -- PPTX -- - ".pptx": FileType.PPTX, - # -- RST -- - ".rst": FileType.RST, - # -- RTF -- - ".rtf": FileType.RTF, - # -- TIFF -- - ".tiff": FileType.TIFF, - # -- TSV -- - ".tab": FileType.TSV, - ".tsv": FileType.TSV, - # -- TXT -- - ".text": FileType.TXT, - ".txt": FileType.TXT, - # NOTE(robinson) - for now we are treating code files as plain text - ".c": FileType.TXT, - ".cc": FileType.TXT, - ".cpp": FileType.TXT, - ".cs": FileType.TXT, - ".cxx": FileType.TXT, - ".go": FileType.TXT, - ".java": FileType.TXT, - ".js": FileType.TXT, - ".log": FileType.TXT, - ".php": FileType.TXT, - ".py": FileType.TXT, - ".rb": FileType.TXT, - ".swift": FileType.TXT, - ".ts": FileType.TXT, - ".yaml": FileType.TXT, - ".yml": FileType.TXT, - # -- WAV -- - ".wav": FileType.WAV, - # -- XLS -- - ".xls": FileType.XLS, - # -- XLSX -- - ".xlsx": FileType.XLSX, - # -- XML -- - ".xml": FileType.XML, - # -- ZIP -- - ".zip": FileType.ZIP, - # -- UNK -- - None: FileType.UNK, -} PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split() diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index cd6187344..e3b0f931c 100644 
--- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -7,7 +7,6 @@ from datetime import datetime from mimetypes import guess_extension from pathlib import Path -from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError @@ -110,13 +109,6 @@ class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig): extension: t.Optional[str] = None recursive: bool = False - def __post_init__(self): - if self.extension and self.extension not in EXT_TO_FILETYPE: - raise ValueError( - f"Extension not supported. " - f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", - ) - def create_session_handle( self, ) -> GoogleDriveSessionHandle: diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py index 51107781c..303e7f8fc 100644 --- a/unstructured/ingest/connector/onedrive.py +++ b/unstructured/ingest/connector/onedrive.py @@ -2,7 +2,6 @@ import typing as t from dataclasses import dataclass, field from pathlib import Path -from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError from unstructured.ingest.interfaces import ( @@ -77,12 +76,6 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): if not self.ext: raise ValueError("Unsupported file without extension.") - if self.ext not in EXT_TO_FILETYPE: - raise ValueError( - f"Extension not supported. 
" - f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", - ) - self.server_relative_path = self.file_path + "/" + self.file_name self._set_download_paths() diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py index 055b4f9cf..c65722404 100644 --- a/unstructured/ingest/connector/sharepoint.py +++ b/unstructured/ingest/connector/sharepoint.py @@ -6,7 +6,6 @@ from html import unescape from pathlib import Path from urllib.parse import urlparse -from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError from unstructured.ingest.interfaces import ( @@ -94,11 +93,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): if not self.extension: raise ValueError("Unsupported file without extension.") - if self.extension not in EXT_TO_FILETYPE: - raise ValueError( - f"Extension {self.extension} not supported. 
" - f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", - ) self._set_download_paths() def _set_download_paths(self) -> None: diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 04a24faff..fe13d26fe 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -9,7 +9,7 @@ import requests from unstructured.documents.elements import DataSourceMetadata, Element from unstructured.file_utils.filetype import detect_filetype, is_json_processable -from unstructured.file_utils.model import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType +from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.partition.common import exactly_one from unstructured.partition.email import partition_email @@ -548,12 +548,10 @@ def partition( element.metadata.url = url element.metadata.data_source = data_source_metadata if content_type is not None: - out_filetype = STR_TO_FILETYPE.get(content_type) - element.metadata.filetype = ( - FILETYPE_TO_MIMETYPE[out_filetype] if out_filetype is not None else None - ) + out_filetype = FileType.from_mime_type(content_type) + element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None else: - element.metadata.filetype = FILETYPE_TO_MIMETYPE[filetype] + element.metadata.filetype = filetype.mime_type return elements diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 78ed29ea5..5a6171a47 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -13,7 +13,8 @@ from unstructured.documents.elements import ( Table, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( exactly_one, get_last_modified_date, diff --git 
a/unstructured/partition/doc.py b/unstructured/partition/doc.py index 5b01abe4b..23f5afb48 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -6,7 +6,8 @@ from typing import IO, Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( convert_office_doc, exactly_one, diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 8092ac321..485dec124 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -45,7 +45,8 @@ from unstructured.documents.elements import ( Title, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( get_last_modified_date, get_last_modified_date_from_file, diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 1ab3c2ce3..e47bab242 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -5,31 +5,10 @@ import datetime import email import os import re -import sys from email.message import Message from functools import partial from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import IO, Any, Callable, Optional - -from unstructured.file_utils.encoding import ( - COMMON_ENCODINGS, - format_encoding_str, - read_txt_file, - validate_encoding, -) -from unstructured.logger import logger -from unstructured.partition.common import ( - convert_to_bytes, - exactly_one, - get_last_modified_date, - get_last_modified_date_from_file, -) -from 
unstructured.partition.lang import apply_lang_metadata - -if sys.version_info < (3, 8): - from typing_extensions import Final -else: - from typing import Final +from typing import IO, Any, Callable, Final, Optional from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings @@ -56,9 +35,24 @@ from unstructured.documents.email_elements import ( Sender, Subject, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.encoding import ( + COMMON_ENCODINGS, + format_encoding_str, + read_txt_file, + validate_encoding, +) +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType +from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE +from unstructured.partition.common import ( + convert_to_bytes, + exactly_one, + get_last_modified_date, + get_last_modified_date_from_file, +) from unstructured.partition.html import partition_html +from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text import partition_text VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"] diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index 7b5e93940..f8ecfb2c1 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -5,7 +5,8 @@ from typing import IO, Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import exactly_one, 
get_last_modified from unstructured.partition.html import partition_html diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index bb97b70eb..92da768c9 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -12,7 +12,8 @@ from lxml import etree from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file from unstructured.partition.html.parser import Flow, html_parser from unstructured.partition.lang import apply_lang_metadata diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py index 14c875b72..92fe362f6 100644 --- a/unstructured/partition/md.py +++ b/unstructured/partition/md.py @@ -7,7 +7,8 @@ import requests from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( exactly_one, get_last_modified_date, diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index f793a9ef0..99e6ec9ec 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -6,7 +6,8 @@ from typing import IO, Any, Optional, cast from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import 
FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.docx import partition_docx from unstructured.utils import requires_dependencies diff --git a/unstructured/partition/org.py b/unstructured/partition/org.py index b677d3f31..797c73d24 100644 --- a/unstructured/partition/org.py +++ b/unstructured/partition/org.py @@ -5,7 +5,8 @@ from typing import IO, Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.html import partition_html diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index e52fe0f8c..dcb06635a 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -36,7 +36,8 @@ from unstructured.documents.elements import ( Text, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common import ( diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index bb73587c1..f8030cd82 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -6,7 +6,8 @@ from typing import IO, Any, Optional from 
unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( convert_office_doc, exactly_one, diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 1a78e4f4a..06e617a13 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -34,7 +34,8 @@ from unstructured.documents.elements import ( Title, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( convert_ms_office_table_to_text, get_last_modified_date, diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py index dfb738804..30105ef4a 100644 --- a/unstructured/partition/rst.py +++ b/unstructured/partition/rst.py @@ -5,7 +5,8 @@ from typing import IO, Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.html import partition_html diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py index 43cd5210a..ac5cf00e4 100644 --- a/unstructured/partition/rtf.py +++ b/unstructured/partition/rtf.py @@ -5,7 +5,8 @@ from typing import IO, 
Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.html import partition_html diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index 96cd10525..cc648384a 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -25,7 +25,8 @@ from unstructured.documents.elements import ( process_metadata, ) from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE from unstructured.nlp.tokenize import sent_tokenize from unstructured.partition.common import ( diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 62add9cec..04368b96a 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -12,7 +12,8 @@ from unstructured.documents.elements import ( Table, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( exactly_one, get_last_modified_date, diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index f219ab974..af52d12df 100644 --- a/unstructured/partition/xlsx.py +++ 
b/unstructured/partition/xlsx.py @@ -24,7 +24,8 @@ from unstructured.documents.elements import ( Title, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index b6cc2accb..91e79c575 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -14,7 +14,8 @@ from unstructured.documents.elements import ( process_metadata, ) from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.file_utils.model import FileType from unstructured.partition.common import ( exactly_one, get_last_modified_date,