rfctr(file): make FileType enum a file-type descriptor (#3411)

**Summary**
Elaborate the `FileType` enum to be a complete descriptor of file-types.
Add methods to allow `STR_TO_FILETYPE`, `EXT_TO_FILETYPE` and
`FILETYPE_TO_MIMETYPE` mappings to be replaced, removing those redundant
and noisy declarations.

In the process, fix some lingering file-type identification and
`.metadata.filetype` errors that had been skipped in the tests.

**Additional Context**
Gathering the various attributes of a file-type into the `FileType` enum
eliminates the duplication inherent in the separate `STR_TO_FILETYPE`
etc. mappings and makes access to those values convenient for callers.
These attributes include what MIME-type a file-type should record in
metadata and what MIME-types and extensions map to that file-type. These
values and others are made available as methods and properties directly
on the `FileType` class and members. Because all attributes are defined
in the `FileType` enum there is no risk of inconsistency across multiple
locations and any changes happen in one and only one place. Further
attributes and methods will be added in later commits to support other
file-type related operations like mapping to a partitioner and verifying
its dependencies are installed.
This commit is contained in:
Steve Canny 2024-07-17 19:05:33 -07:00 committed by GitHub
parent 35ee6bf8e4
commit e99e5a8abd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 318 additions and 328 deletions

View File

@ -1,4 +1,4 @@
## 0.15.0-dev14 ## 0.15.0-dev15
### Enhancements ### Enhancements

View File

@ -5,7 +5,7 @@ import pandas as pd
import pytest import pytest
from unstructured.file_utils import exploration from unstructured.file_utils import exploration
from unstructured.file_utils.filetype import FileType from unstructured.file_utils.model import FileType
DIRECTORY = pathlib.Path(__file__).parent.resolve() DIRECTORY = pathlib.Path(__file__).parent.resolve()

View File

@ -22,13 +22,13 @@ from test_unstructured.unit_utils import (
) )
from unstructured.file_utils import filetype from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import ( from unstructured.file_utils.filetype import (
FileType,
_detect_filetype_from_octet_stream, _detect_filetype_from_octet_stream,
_is_code_mime_type, _is_code_mime_type,
_is_text_file_a_csv, _is_text_file_a_csv,
_is_text_file_a_json, _is_text_file_a_json,
detect_filetype, detect_filetype,
) )
from unstructured.file_utils.model import FileType
is_in_docker = os.path.exists("/.dockerenv") is_in_docker = os.path.exists("/.dockerenv")

View File

@ -0,0 +1,70 @@
"""Test suite for `unstructured.file_utils.model`."""
from __future__ import annotations
import pytest
from unstructured.file_utils.model import FileType
class DescribeFileType:
    """Unit-test suite for `unstructured.file_utils.model.FileType`."""

    # -- .from_extension() ---------------------------------------------------------------------

    @pytest.mark.parametrize(
        ("ext", "file_type"),
        [
            (".bmp", FileType.BMP),
            (".html", FileType.HTML),
            (".eml", FileType.EML),
            # -- a file-type can register more than one extension --
            (".p7s", FileType.EML),
            # -- source-code extensions are registered on the plain-text file-type --
            (".java", FileType.TXT),
        ],
    )
    def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType):
        assert FileType.from_extension(ext) is file_type

    @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."])
    def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str):
        assert FileType.from_extension(ext) is None

    # -- .from_mime_type() ---------------------------------------------------------------------

    @pytest.mark.parametrize(
        ("mime_type", "file_type"),
        [
            (FileType.BMP.mime_type, FileType.BMP),
            # -- alias MIME-types resolve to the same member as the canonical one --
            ("text/x-csv", FileType.CSV),
            ("application/msword", FileType.DOC),
            ("message/rfc822", FileType.EML),
            ("text/plain", FileType.TXT),
            ("text/yaml", FileType.TXT),
            ("application/xml", FileType.XML),
            ("text/xml", FileType.XML),
            ("inode/x-empty", FileType.EMPTY),
        ],
    )
    def it_can_recognize_a_file_type_from_a_mime_type(self, mime_type: str, file_type: FileType):
        assert FileType.from_mime_type(mime_type) is file_type

    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"])
    def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
        assert FileType.from_mime_type(mime_type) is None

    # -- .mime_type ----------------------------------------------------------------------------

    @pytest.mark.parametrize(
        ("file_type", "mime_type"),
        [
            (FileType.BMP, "image/bmp"),
            (FileType.CSV, "text/csv"),
            (FileType.DOC, "application/msword"),
            (FileType.EML, "message/rfc822"),
            (FileType.HTML, "text/html"),
            (FileType.JPG, "image/jpeg"),
            (FileType.PDF, "application/pdf"),
            (FileType.TXT, "text/plain"),
            (FileType.XML, "application/xml"),
            (FileType.EMPTY, "inode/x-empty"),
            (FileType.UNK, "application/octet-stream"),
        ],
    )
    def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
        assert file_type.mime_type == mime_type

View File

@ -45,7 +45,7 @@ from unstructured.documents.elements import (
Text, Text,
Title, Title,
) )
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType from unstructured.file_utils.model import FileType
from unstructured.partition import auto from unstructured.partition import auto
from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition
from unstructured.partition.utils.constants import PartitionStrategy from unstructured.partition.utils.constants import PartitionStrategy
@ -1245,7 +1245,7 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype:
assert elements assert elements
assert all( assert all(
e.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] e.metadata.filetype == filetype.mime_type
for e in elements for e in elements
if e.metadata.filetype is not None if e.metadata.filetype is not None
) )

View File

@ -10,7 +10,8 @@ import pytest
from pytest_mock import MockFixture from pytest_mock import MockFixture
from unstructured.documents.elements import CompositeElement from unstructured.documents.elements import CompositeElement
from unstructured.file_utils.filetype import FileType, detect_filetype from unstructured.file_utils.filetype import detect_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
from unstructured.partition.json import partition_json from unstructured.partition.json import partition_json

View File

@ -1 +1 @@
__version__ = "0.15.0-dev14" # pragma: no cover __version__ = "0.15.0-dev15" # pragma: no cover

View File

@ -12,13 +12,7 @@ from typing_extensions import ParamSpec
from unstructured.documents.elements import Element from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.file_utils.model import ( from unstructured.file_utils.model import PLAIN_TEXT_EXTENSIONS, FileType
EXT_TO_FILETYPE,
FILETYPE_TO_MIMETYPE,
PLAIN_TEXT_EXTENSIONS,
STR_TO_FILETYPE,
FileType,
)
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
from unstructured.partition.common import ( from unstructured.partition.common import (
@ -49,9 +43,9 @@ def detect_filetype(
# first check (content_type) # first check (content_type)
if content_type: if content_type:
filetype = STR_TO_FILETYPE.get(content_type) file_type = FileType.from_mime_type(content_type)
if filetype: if file_type:
return filetype return file_type
# second check (filename/file_name/file) # second check (filename/file_name/file)
# continue if successfully define mime_type # continue if successfully define mime_type
@ -68,7 +62,7 @@ def detect_filetype(
mime_type = ft.guess_mime(_filename) mime_type = ft.guess_mime(_filename)
if mime_type is None: if mime_type is None:
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return FileType.from_extension(extension) or FileType.UNK
elif file is not None: elif file is not None:
if hasattr(file, "name"): if hasattr(file, "name"):
@ -92,7 +86,7 @@ def detect_filetype(
"libmagic is unavailable but assists in filetype detection on file-like objects. " "libmagic is unavailable but assists in filetype detection on file-like objects. "
"Please consider installing libmagic for better results.", "Please consider installing libmagic for better results.",
) )
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return FileType.from_extension(extension) or FileType.UNK
else: else:
raise ValueError("No filename, file, nor file_filename were specified.") raise ValueError("No filename, file, nor file_filename were specified.")
@ -128,7 +122,7 @@ def detect_filetype(
".tsv", ".tsv",
".json", ".json",
]: ]:
return EXT_TO_FILETYPE.get(extension) return FileType.from_extension(extension)
# NOTE(crag): for older versions of the OS libmagic package, such as is currently # NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain" # installed on the Unstructured docker image, .json files resolve to "text/plain"
@ -151,11 +145,11 @@ def detect_filetype(
return FileType.EML return FileType.EML
if extension in PLAIN_TEXT_EXTENSIONS: if extension in PLAIN_TEXT_EXTENSIONS:
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return FileType.from_extension(extension) or FileType.UNK
# Safety catch # Safety catch
if mime_type in STR_TO_FILETYPE: if file_type := FileType.from_mime_type(mime_type):
return STR_TO_FILETYPE[mime_type] return file_type
return FileType.TXT return FileType.TXT
@ -165,21 +159,22 @@ def detect_filetype(
elif file: elif file:
return _detect_filetype_from_octet_stream(file=file) return _detect_filetype_from_octet_stream(file=file)
else: else:
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return FileType.from_extension(extension) or FileType.UNK
elif mime_type == "application/zip": elif mime_type == "application/zip":
filetype = FileType.UNK file_type = FileType.UNK
if file: if file:
filetype = _detect_filetype_from_octet_stream(file=file) file_type = _detect_filetype_from_octet_stream(file=file)
elif filename is not None: elif filename is not None:
with open(filename, "rb") as f: with open(filename, "rb") as f:
filetype = _detect_filetype_from_octet_stream(file=f) file_type = _detect_filetype_from_octet_stream(file=f)
extension = extension if extension else "" extension = extension if extension else ""
if filetype == FileType.UNK: return (
return FileType.ZIP FileType.ZIP
else: if file_type in (FileType.UNK, FileType.ZIP)
return EXT_TO_FILETYPE.get(extension, filetype) else FileType.from_extension(extension) or file_type
)
elif _is_code_mime_type(mime_type): elif _is_code_mime_type(mime_type):
# NOTE(robinson) - we'll treat all code files as plain text for now. # NOTE(robinson) - we'll treat all code files as plain text for now.
@ -191,14 +186,14 @@ def detect_filetype(
return FileType.EMPTY return FileType.EMPTY
# For everything else # For everything else
elif mime_type in STR_TO_FILETYPE: elif file_type := FileType.from_mime_type(mime_type):
return STR_TO_FILETYPE[mime_type] return file_type
logger.warning( logger.warning(
f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. " f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
"This file type is not currently supported in unstructured.", "This file type is not currently supported in unstructured.",
) )
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return FileType.from_extension(extension) or FileType.UNK
def is_json_processable( def is_json_processable(
@ -260,7 +255,7 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
# Infer mime type using magic if octet-stream is not zip file # Infer mime type using magic if octet-stream is not zip file
mime_type = magic.from_buffer(file.read(4096), mime=True) mime_type = magic.from_buffer(file.read(4096), mime=True)
return STR_TO_FILETYPE.get(mime_type, FileType.UNK) return FileType.from_mime_type(mime_type) or FileType.UNK
logger.warning( logger.warning(
"Could not detect the filetype from application/octet-stream MIME type.", "Could not detect the filetype from application/octet-stream MIME type.",
) )
@ -439,7 +434,7 @@ def add_filetype(
# NOTE(robinson) - Attached files have already run through this logic # NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function # in their own partitioning function
if element.metadata.attached_to_filename is None: if element.metadata.attached_to_filename is None:
add_element_metadata(element, filetype=FILETYPE_TO_MIMETYPE[filetype]) add_element_metadata(element, filetype=filetype.mime_type)
return elements return elements
else: else:

View File

@ -3,55 +3,36 @@
from __future__ import annotations from __future__ import annotations
import enum import enum
from typing import Iterable, cast
class FileType(enum.Enum): class FileType(enum.Enum):
UNK = 0 """The collection of file-types recognized by `unstructured`.
EMPTY = 1
# MS Office Types Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner.
DOC = 10 """
DOCX = 11
XLS = 12
XLSX = 13
PPT = 14
PPTX = 15
MSG = 16
# Adobe Types _extensions: tuple[str, ...]
PDF = 20
# Image Types _canonical_mime_type: str
JPG = 30 """The MIME-type used as `.metadata.filetype` for this file-type."""
PNG = 31
TIFF = 32
BMP = 33
HEIC = 34
# Plain Text Types _alias_mime_types: tuple[str, ...]
EML = 40 """MIME-types accepted as identifying this file-type."""
RTF = 41
TXT = 42
JSON = 43
CSV = 44
TSV = 45
# Markup Types def __new__(
HTML = 50 cls,
XML = 51 value: str,
MD = 52 extensions: Iterable[str],
EPUB = 53 canonical_mime_type: str,
RST = 54 alias_mime_types: Iterable[str],
ORG = 55 ):
self = object.__new__(cls)
# Compressed Types self._value_ = value
ZIP = 60 self._extensions = tuple(extensions)
self._canonical_mime_type = canonical_mime_type
# Open Office Types self._alias_mime_types = tuple(alias_mime_types)
ODT = 70 return self
# Audio Files
WAV = 80
def __lt__(self, other: FileType) -> bool: def __lt__(self, other: FileType) -> bool:
"""Makes `FileType` members comparable with relational operators, at least with `<`. """Makes `FileType` members comparable with relational operators, at least with `<`.
@ -60,193 +41,148 @@ class FileType(enum.Enum):
""" """
return self.name < other.name return self.name < other.name
BMP = ("bmp", [".bmp"], "image/bmp", cast(list[str], []))
STR_TO_FILETYPE = { CSV = (
# -- BMP -- "csv",
"image/bmp": FileType.BMP, [".csv"],
# -- CSV -- "text/csv",
"text/csv": FileType.CSV, [
"application/csv": FileType.CSV,
"application/x-csv": FileType.CSV,
"text/comma-separated-values": FileType.CSV,
"text/x-comma-separated-values": FileType.CSV,
"text/x-csv": FileType.CSV,
# -- DOC --
"application/msword": FileType.DOC,
# -- DOCX --
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
# -- EML --
"message/rfc822": FileType.EML,
# -- EPUB --
"application/epub": FileType.EPUB,
"application/epub+zip": FileType.EPUB,
# -- HEIF --
"image/heic": FileType.HEIC,
# -- HTML --
"text/html": FileType.HTML,
# -- JPG --
"image/jpeg": FileType.JPG,
# -- JSON --
"application/json": FileType.JSON,
# -- MD --
"text/markdown": FileType.MD,
"text/x-markdown": FileType.MD,
# -- MSG --
"application/vnd.ms-outlook": FileType.MSG,
"application/x-ole-storage": FileType.MSG,
# -- ODT --
"application/vnd.oasis.opendocument.text": FileType.ODT,
# -- ORG --
"text/org": FileType.ORG,
# -- PDF --
"application/pdf": FileType.PDF,
# -- PNG --
"image/png": FileType.PNG,
# -- PPT --
"application/vnd.ms-powerpoint": FileType.PPT,
# -- PPTX --
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
# -- RST --
"text/x-rst": FileType.RST,
# -- RTF --
"text/rtf": FileType.RTF,
"application/rtf": FileType.RTF,
# -- TIFF --
"image/tiff": FileType.TIFF,
# -- TSV --
"text/tsv": FileType.TSV,
# -- TXT --
"text/plain": FileType.TXT,
# NOTE(robinson) - https://mimetype.io/application/yaml
# In the future, we may have special processing for YAML
# files instead of treating them as plaintext
"text/yaml": FileType.TXT,
"application/x-yaml": FileType.TXT,
"application/yaml": FileType.TXT,
"text/x-yaml": FileType.TXT,
# -- WAV --
# NOTE(robinson) - https://mimetype.io/audio/wav
"audio/wav": FileType.WAV,
"audio/vnd.wav": FileType.WAV,
"audio/vnd.wave": FileType.WAV,
"audio/wave": FileType.WAV,
"audio/x-pn-wav": FileType.WAV,
"audio/x-wav": FileType.WAV,
# -- XLS --
"application/vnd.ms-excel": FileType.XLS,
# -- XLSX --
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
# -- XML --
"application/xml": FileType.XML,
# -- EMPTY --
"inode/x-empty": FileType.EMPTY,
}
# -- MIME-types in STR_TO_FILETYPE that are not the canonical MIME-type for that file-type --
MIMETYPE_ALIASES = (
"application/csv", "application/csv",
"application/epub+zip",
"application/rtf",
"application/x-csv", "application/x-csv",
"application/x-ole-storage", "text/comma-separated-values",
"text/x-comma-separated-values",
"text/x-csv",
],
)
DOC = ("doc", [".doc"], "application/msword", cast(list[str], []))
DOCX = (
"docx",
[".docx"],
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
cast(list[str], []),
)
EML = ("eml", [".eml", ".p7s"], "message/rfc822", cast(list[str], []))
EPUB = ("epub", [".epub"], "application/epub", ["application/epub+zip"])
HEIC = ("heic", [".heic"], "image/heic", cast(list[str], []))
HTML = ("html", [".html", ".htm"], "text/html", cast(list[str], []))
JPG = ("jpg", [".jpeg", ".jpg"], "image/jpeg", cast(list[str], []))
JSON = ("json", [".json"], "application/json", cast(list[str], []))
MD = ("md", [".md"], "text/markdown", ["text/x-markdown"])
MSG = ("msg", [".msg"], "application/vnd.ms-outlook", ["application/x-ole-storage"])
ODT = ("odt", [".odt"], "application/vnd.oasis.opendocument.text", cast(list[str], []))
ORG = ("org", [".org"], "text/org", cast(list[str], []))
PDF = ("pdf", [".pdf"], "application/pdf", cast(list[str], []))
PNG = ("png", [".png"], "image/png", cast(list[str], []))
PPT = ("ppt", [".ppt"], "application/vnd.ms-powerpoint", cast(list[str], []))
PPTX = (
"pptx",
[".pptx"],
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
cast(list[str], []),
)
RST = ("rst", [".rst"], "text/x-rst", cast(list[str], []))
RTF = ("rtf", [".rtf"], "text/rtf", ["application/rtf"])
TIFF = ("tiff", [".tiff"], "image/tiff", cast(list[str], []))
TSV = ("tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
TXT = (
"txt",
[
".txt",
".text",
# NOTE(robinson) - for now we are treating code files as plain text
".c",
".cc",
".cpp",
".cs",
".cxx",
".go",
".java",
".js",
".log",
".php",
".py",
".rb",
".swift",
".ts",
".yaml",
".yml",
],
"text/plain",
[
# NOTE(robinson) - In the future, we may have special processing for YAML files
# instead of treating them as plaintext.
"text/yaml",
"application/x-yaml", "application/x-yaml",
"application/yaml", "application/yaml",
"text/x-yaml",
],
)
WAV = (
"wav",
[".wav"],
"audio/wav",
[
"audio/vnd.wav", "audio/vnd.wav",
"audio/vnd.wave", "audio/vnd.wave",
"audio/wave", "audio/wave",
"audio/x-pn-wav", "audio/x-pn-wav",
"audio/x-wav", "audio/x-wav",
"text/comma-separated-values", ],
"text/x-comma-separated-values", )
"text/x-csv", XLS = ("xls", [".xls"], "application/vnd.ms-excel", cast(list[str], []))
"text/x-markdown", XLSX = (
"text/x-yaml", "xlsx",
"text/yaml", [".xlsx"],
) "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
cast(list[str], []),
)
XML = ("xml", [".xml"], "application/xml", ["text/xml"])
ZIP = ("zip", [".zip"], "application/zip", cast(list[str], []))
FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPE_ALIASES} UNK = ("unk", cast(list[str], []), "application/octet-stream", cast(list[str], []))
EMPTY = ("empty", cast(list[str], []), "inode/x-empty", cast(list[str], []))
@classmethod
def from_extension(cls, extension: str | None) -> FileType | None:
    """Look up the `FileType` member that registers `extension`.

    `extension` is given with its leading period, like `".pdf"`. An extension is a reasonable
    secondary signal for identifying a file-type but is not reliable as the primary one.

    Returns `None` when `extension` is `None`, empty, a bare `"."`, or is not registered by any
    file-type.
    """
    # -- nothing usable to look up --
    if not extension or extension == ".":
        return None

    # -- A linear scan over the members is plenty fast for once-or-twice-per-file use and
    # -- side-steps the limitations on defining a class variable (e.g. a lookup dict) on an
    # -- Enum. The first member (in definition order) registering the extension wins.
    return next(
        (member for member in cls.__members__.values() if extension in member._extensions),
        None,
    )
@classmethod
def from_mime_type(cls, mime_type: str) -> FileType | None:
    """Select a FileType member based on a MIME-type.

    `mime_type` is compared by exact string equality against each member's canonical
    MIME-type and its alias MIME-types, so both `"text/csv"`-style canonical values and
    registered aliases select the same member.

    Returns `None` when `mime_type` is not registered for any file-type.
    """
    # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
    # -- limitations on defining a class variable on an Enum.
    for m in cls.__members__.values():
        if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types:
            return m
    return None
@property
def mime_type(self) -> str:
    """The canonical MIME-type for this file-type, suitable for use in metadata.

    This value is used in `.metadata.filetype` for elements partitioned from files of this
    type. In general it is the "official", "recommended", or "de facto standard" MIME-type for
    files of this type, in that order, as available.
    """
    return self._canonical_mime_type
EXT_TO_FILETYPE = {
# -- BMP --
".bmp": FileType.BMP,
# -- CSV --
".csv": FileType.CSV,
# -- DOC --
".doc": FileType.DOC,
# -- DOCX --
".docx": FileType.DOCX,
# -- EML --
".eml": FileType.EML,
".p7s": FileType.EML,
# -- EPUB --
".epub": FileType.EPUB,
# -- HEIC --
".heic": FileType.HEIC,
# -- HTML --
".htm": FileType.HTML,
".html": FileType.HTML,
# -- JPG --
".jpeg": FileType.JPG,
".jpg": FileType.JPG,
# -- JSON --
".json": FileType.JSON,
# -- MD --
".md": FileType.MD,
# -- MSG --
".msg": FileType.MSG,
# -- ODT --
".odt": FileType.ODT,
# -- ORG --
".org": FileType.ORG,
# -- PDF --
".pdf": FileType.PDF,
# -- PNG --
".png": FileType.PNG,
# -- PPT --
".ppt": FileType.PPT,
# -- PPTX --
".pptx": FileType.PPTX,
# -- RST --
".rst": FileType.RST,
# -- RTF --
".rtf": FileType.RTF,
# -- TIFF --
".tiff": FileType.TIFF,
# -- TSV --
".tab": FileType.TSV,
".tsv": FileType.TSV,
# -- TXT --
".text": FileType.TXT,
".txt": FileType.TXT,
# NOTE(robinson) - for now we are treating code files as plain text
".c": FileType.TXT,
".cc": FileType.TXT,
".cpp": FileType.TXT,
".cs": FileType.TXT,
".cxx": FileType.TXT,
".go": FileType.TXT,
".java": FileType.TXT,
".js": FileType.TXT,
".log": FileType.TXT,
".php": FileType.TXT,
".py": FileType.TXT,
".rb": FileType.TXT,
".swift": FileType.TXT,
".ts": FileType.TXT,
".yaml": FileType.TXT,
".yml": FileType.TXT,
# -- WAV --
".wav": FileType.WAV,
# -- XLS --
".xls": FileType.XLS,
# -- XLSX --
".xlsx": FileType.XLSX,
# -- XML --
".xml": FileType.XML,
# -- ZIP --
".zip": FileType.ZIP,
# -- UNK --
None: FileType.UNK,
}
PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split() PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()

View File

@ -7,7 +7,6 @@ from datetime import datetime
from mimetypes import guess_extension from mimetypes import guess_extension
from pathlib import Path from pathlib import Path
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError
@ -110,13 +109,6 @@ class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
extension: t.Optional[str] = None extension: t.Optional[str] = None
recursive: bool = False recursive: bool = False
def __post_init__(self):
if self.extension and self.extension not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
)
def create_session_handle( def create_session_handle(
self, self,
) -> GoogleDriveSessionHandle: ) -> GoogleDriveSessionHandle:

View File

@ -2,7 +2,6 @@ import typing as t
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError
from unstructured.ingest.interfaces import ( from unstructured.ingest.interfaces import (
@ -77,12 +76,6 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
if not self.ext: if not self.ext:
raise ValueError("Unsupported file without extension.") raise ValueError("Unsupported file without extension.")
if self.ext not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
)
self.server_relative_path = self.file_path + "/" + self.file_name self.server_relative_path = self.file_path + "/" + self.file_name
self._set_download_paths() self._set_download_paths()

View File

@ -6,7 +6,6 @@ from html import unescape
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError
from unstructured.ingest.interfaces import ( from unstructured.ingest.interfaces import (
@ -94,11 +93,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
if not self.extension: if not self.extension:
raise ValueError("Unsupported file without extension.") raise ValueError("Unsupported file without extension.")
if self.extension not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension {self.extension} not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
)
self._set_download_paths() self._set_download_paths()
def _set_download_paths(self) -> None: def _set_download_paths(self) -> None:

View File

@ -9,7 +9,7 @@ import requests
from unstructured.documents.elements import DataSourceMetadata, Element from unstructured.documents.elements import DataSourceMetadata, Element
from unstructured.file_utils.filetype import detect_filetype, is_json_processable from unstructured.file_utils.filetype import detect_filetype, is_json_processable
from unstructured.file_utils.model import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType from unstructured.file_utils.model import FileType
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.partition.common import exactly_one from unstructured.partition.common import exactly_one
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
@ -548,12 +548,10 @@ def partition(
element.metadata.url = url element.metadata.url = url
element.metadata.data_source = data_source_metadata element.metadata.data_source = data_source_metadata
if content_type is not None: if content_type is not None:
out_filetype = STR_TO_FILETYPE.get(content_type) out_filetype = FileType.from_mime_type(content_type)
element.metadata.filetype = ( element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
FILETYPE_TO_MIMETYPE[out_filetype] if out_filetype is not None else None
)
else: else:
element.metadata.filetype = FILETYPE_TO_MIMETYPE[filetype] element.metadata.filetype = filetype.mime_type
return elements return elements

View File

@ -13,7 +13,8 @@ from unstructured.documents.elements import (
Table, Table,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
exactly_one, exactly_one,
get_last_modified_date, get_last_modified_date,

View File

@ -6,7 +6,8 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
convert_office_doc, convert_office_doc,
exactly_one, exactly_one,

View File

@ -45,7 +45,8 @@ from unstructured.documents.elements import (
Title, Title,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
get_last_modified_date, get_last_modified_date,
get_last_modified_date_from_file, get_last_modified_date_from_file,

View File

@ -5,31 +5,10 @@ import datetime
import email import email
import os import os
import re import re
import sys
from email.message import Message from email.message import Message
from functools import partial from functools import partial
from tempfile import NamedTemporaryFile, TemporaryDirectory from tempfile import NamedTemporaryFile, TemporaryDirectory
from typing import IO, Any, Callable, Optional from typing import IO, Any, Callable, Final, Optional
from unstructured.file_utils.encoding import (
COMMON_ENCODINGS,
format_encoding_str,
read_txt_file,
validate_encoding,
)
from unstructured.logger import logger
from unstructured.partition.common import (
convert_to_bytes,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
if sys.version_info < (3, 8):
from typing_extensions import Final
else:
from typing import Final
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings
@ -56,9 +35,24 @@ from unstructured.documents.email_elements import (
Sender, Sender,
Subject, Subject,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.encoding import (
COMMON_ENCODINGS,
format_encoding_str,
read_txt_file,
validate_encoding,
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.common import (
convert_to_bytes,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import partition_text from unstructured.partition.text import partition_text
VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"] VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"]

View File

@ -5,7 +5,8 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html

View File

@ -12,7 +12,8 @@ from lxml import etree
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.html.parser import Flow, html_parser from unstructured.partition.html.parser import Flow, html_parser
from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.lang import apply_lang_metadata

View File

@ -7,7 +7,8 @@ import requests
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
exactly_one, exactly_one,
get_last_modified_date, get_last_modified_date,

View File

@ -6,7 +6,8 @@ from typing import IO, Any, Optional, cast
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.docx import partition_docx from unstructured.partition.docx import partition_docx
from unstructured.utils import requires_dependencies from unstructured.utils import requires_dependencies

View File

@ -5,7 +5,8 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html

View File

@ -36,7 +36,8 @@ from unstructured.documents.elements import (
Text, Text,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger, trace_logger from unstructured.logger import logger, trace_logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.common import ( from unstructured.partition.common import (

View File

@ -6,7 +6,8 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
convert_office_doc, convert_office_doc,
exactly_one, exactly_one,

View File

@ -34,7 +34,8 @@ from unstructured.documents.elements import (
Title, Title,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
convert_ms_office_table_to_text, convert_ms_office_table_to_text,
get_last_modified_date, get_last_modified_date,

View File

@ -5,7 +5,8 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html

View File

@ -5,7 +5,8 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html

View File

@ -25,7 +25,8 @@ from unstructured.documents.elements import (
process_metadata, process_metadata,
) )
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import sent_tokenize from unstructured.nlp.tokenize import sent_tokenize
from unstructured.partition.common import ( from unstructured.partition.common import (

View File

@ -12,7 +12,8 @@ from unstructured.documents.elements import (
Table, Table,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
exactly_one, exactly_one,
get_last_modified_date, get_last_modified_date,

View File

@ -24,7 +24,8 @@ from unstructured.documents.elements import (
Title, Title,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import ( from unstructured.partition.text_type import (

View File

@ -14,7 +14,8 @@ from unstructured.documents.elements import (
process_metadata, process_metadata,
) )
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import ( from unstructured.partition.common import (
exactly_one, exactly_one,
get_last_modified_date, get_last_modified_date,