rfctr(file): improve file-type auto-detect (#3409)

**Summary**
In preparation for further work on auto file-type detection, improve
`filetype.py` and related modules:
- improve docstrings
- improve type annotations
- extract domain model to `.model` module
This commit is contained in:
Steve Canny 2024-07-16 22:27:31 -07:00 committed by GitHub
parent 48bdf94656
commit a5c9a3695c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 431 additions and 366 deletions

View File

@ -1,4 +1,4 @@
## 0.15.0-dev13
## 0.15.0-dev14
### Enhancements

View File

@ -8,6 +8,7 @@ import io
import os
import pathlib
import magic
import pytest
from test_unstructured.unit_utils import (
@ -17,7 +18,7 @@ from test_unstructured.unit_utils import (
MonkeyPatch,
call,
example_doc_path,
function_mock,
method_mock,
)
from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import (
@ -44,9 +45,7 @@ is_in_docker = os.path.exists("/.dockerenv")
("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML),
("stanley-cups.xlsx", FileType.XLSX),
# NOTE(robinson) - currently failing in the docker tests because the detected
# MIME type is text/csv
# ("stanley-cups.csv", FileType.CSV),
("stanley-cups.csv", FileType.CSV),
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
@ -111,9 +110,7 @@ def test_detect_filetype_from_filename_with_extension(
("example-10k.html", [FileType.HTML, FileType.XML]),
("fake-html.html", [FileType.HTML]),
("stanley-cups.xlsx", [FileType.XLSX]),
# NOTE(robinson]) - currently failing in the docker tests because the detected
# MIME type is text/csv
# ("stanley-cups.csv", [FileType.CSV]),
("stanley-cups.csv", [FileType.CSV]),
("stanley-cups.tsv", [FileType.TSV]),
("fake-power-point.pptx", [FileType.PPTX]),
("winter-sports.epub", [FileType.EPUB]),
@ -546,11 +543,13 @@ def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock):
# ================================================================================================
# -- `from_buffer()` and `from_file()` are not "methods" on `magic` per-se (`magic` is a module)
# -- but they behave like methods for mocking purposes.
@pytest.fixture()
def magic_from_buffer_(request: FixtureRequest):
return function_mock(request, "unstructured.file_utils.filetype.magic.from_buffer")
return method_mock(request, magic, "from_buffer")
@pytest.fixture()
def magic_from_file_(request: FixtureRequest):
return function_mock(request, "unstructured.file_utils.filetype.magic.from_file")
return method_mock(request, magic, "from_file")

View File

@ -0,0 +1,6 @@
from __future__ import annotations
import pathlib
from typing import IO
def guess_mime(obj: bytearray | str | bytes | pathlib.PurePath | IO[bytes]) -> str | None: ...

View File

@ -0,0 +1,6 @@
from __future__ import annotations
from os import PathLike
def from_buffer(buffer: bytes | str, mime: bool = ...) -> str: ...
def from_file(filename: bytes | str | PathLike[str], mime: bool = ...) -> str: ...

View File

@ -1 +1 @@
__version__ = "0.15.0-dev13" # pragma: no cover
__version__ = "0.15.0-dev14" # pragma: no cover

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import enum
import functools
import importlib.util
import json
import os
import re
@ -12,7 +12,15 @@ from typing_extensions import ParamSpec
from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.file_utils.model import (
EXT_TO_FILETYPE,
FILETYPE_TO_MIMETYPE,
PLAIN_TEXT_EXTENSIONS,
STR_TO_FILETYPE,
FileType,
)
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
from unstructured.partition.common import (
add_element_metadata,
exactly_one,
@ -21,232 +29,7 @@ from unstructured.partition.common import (
)
from unstructured.utils import get_call_args_applying_defaults
try:
import magic
LIBMAGIC_AVAILABLE = True
except ImportError: # pragma: nocover
LIBMAGIC_AVAILABLE = False # pragma: nocover
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE
TXT_MIME_TYPES = [
"text/plain",
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
]
# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
# looking for expected filenames within the zip file.
EXPECTED_DOCX_FILES = [
"docProps/core.xml",
"word/document.xml",
]
EXPECTED_XLSX_FILES = [
"xl/workbook.xml",
]
EXPECTED_PPTX_FILES = [
"docProps/core.xml",
"ppt/presentation.xml",
]
class FileType(enum.Enum):
UNK = 0
EMPTY = 1
# MS Office Types
DOC = 10
DOCX = 11
XLS = 12
XLSX = 13
PPT = 14
PPTX = 15
MSG = 16
# Adobe Types
PDF = 20
# Image Types
JPG = 30
PNG = 31
TIFF = 32
BMP = 33
HEIC = 34
# Plain Text Types
EML = 40
RTF = 41
TXT = 42
JSON = 43
CSV = 44
TSV = 45
# Markup Types
HTML = 50
XML = 51
MD = 52
EPUB = 53
RST = 54
ORG = 55
# Compressed Types
ZIP = 60
# Open Office Types
ODT = 70
# Audio Files
WAV = 80
# NOTE(robinson) - This is to support sorting for pandas groupby functions
def __lt__(self, other):
return self.name < other.name
STR_TO_FILETYPE = {
"application/pdf": FileType.PDF,
"application/msword": FileType.DOC,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
"image/jpeg": FileType.JPG,
"image/png": FileType.PNG,
"image/heic": FileType.HEIC,
"image/tiff": FileType.TIFF,
"image/bmp": FileType.BMP,
# NOTE(robinson) - https://mimetype.io/application/yaml
# In the future, we may have special processing for YAML
# files instead of treating them as plaintext
"application/yaml": FileType.TXT,
"application/x-yaml": FileType.TXT,
"text/x-yaml": FileType.TXT,
"text/yaml": FileType.TXT,
"text/plain": FileType.TXT,
"text/x-csv": FileType.CSV,
"application/csv": FileType.CSV,
"application/x-csv": FileType.CSV,
"text/comma-separated-values": FileType.CSV,
"text/x-comma-separated-values": FileType.CSV,
"text/csv": FileType.CSV,
"text/tsv": FileType.TSV,
"text/markdown": FileType.MD,
"text/x-markdown": FileType.MD,
"text/org": FileType.ORG,
"text/x-rst": FileType.RST,
"application/epub": FileType.EPUB,
"application/epub+zip": FileType.EPUB,
"application/json": FileType.JSON,
"application/rtf": FileType.RTF,
"text/rtf": FileType.RTF,
"text/html": FileType.HTML,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
"application/vnd.ms-excel": FileType.XLS,
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
"application/vnd.ms-powerpoint": FileType.PPT,
"application/xml": FileType.XML,
"application/vnd.oasis.opendocument.text": FileType.ODT,
"message/rfc822": FileType.EML,
"application/x-ole-storage": FileType.MSG,
"application/vnd.ms-outlook": FileType.MSG,
# NOTE(robinson) - https://mimetype.io/audio/wav
"audio/vnd.wav": FileType.WAV,
"audio/vnd.wave": FileType.WAV,
"audio/wave": FileType.WAV,
"audio/x-pn-wav": FileType.WAV,
"audio/x-wav": FileType.WAV,
"inode/x-empty": FileType.EMPTY,
}
MIMETYPES_TO_EXCLUDE = [
"text/x-markdown",
"application/epub+zip",
"text/x-csv",
"application/csv",
"application/x-csv",
"text/comma-separated-values",
"text/x-comma-separated-values",
]
FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPES_TO_EXCLUDE}
EXT_TO_FILETYPE = {
".pdf": FileType.PDF,
".docx": FileType.DOCX,
".jpg": FileType.JPG,
".jpeg": FileType.JPG,
".txt": FileType.TXT,
".text": FileType.TXT,
".log": FileType.TXT,
".eml": FileType.EML,
".xml": FileType.XML,
".heic": FileType.HEIC,
".htm": FileType.HTML,
".html": FileType.HTML,
".md": FileType.MD,
".org": FileType.ORG,
".rst": FileType.RST,
".xlsx": FileType.XLSX,
".pptx": FileType.PPTX,
".p7s": FileType.EML,
".png": FileType.PNG,
".doc": FileType.DOC,
".zip": FileType.ZIP,
".xls": FileType.XLS,
".ppt": FileType.PPT,
".rtf": FileType.RTF,
".json": FileType.JSON,
".epub": FileType.EPUB,
".msg": FileType.MSG,
".odt": FileType.ODT,
".csv": FileType.CSV,
".tsv": FileType.TSV,
".tab": FileType.TSV,
".tiff": FileType.TIFF,
".bmp": FileType.BMP,
".wav": FileType.WAV,
# NOTE(robinson) - for now we are treating code files as plain text
".js": FileType.TXT,
".py": FileType.TXT,
".java": FileType.TXT,
".cpp": FileType.TXT,
".cc": FileType.TXT,
".cxx": FileType.TXT,
".c": FileType.TXT,
".cs": FileType.TXT,
".php": FileType.TXT,
".rb": FileType.TXT,
".swift": FileType.TXT,
".ts": FileType.TXT,
".go": FileType.TXT,
".yaml": FileType.TXT,
".yml": FileType.TXT,
None: FileType.UNK,
}
PLAIN_TEXT_EXTENSIONS = [
".txt",
".text",
".eml",
".p7s",
".md",
".rtf",
".html",
".rst",
".org",
".csv",
".tsv",
".tab",
".json",
]
def _resolve_symlink(file_path):
# Resolve the symlink to get the actual file path
if os.path.islink(file_path):
file_path = os.path.realpath(file_path)
return file_path
LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
def detect_filetype(
@ -256,8 +39,10 @@ def detect_filetype(
file_filename: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> Optional[FileType]:
"""Use libmagic to determine a file's type. Helps determine which partition brick
to use for a given file. A return value of None indicates a non-supported file type.
"""Use libmagic to determine a file's type.
Helps determine which partition brick to use for a given file. A return value of None indicates
a non-supported file type.
"""
mime_type = None
exactly_one(filename=filename, file=file)
@ -275,14 +60,13 @@ def detect_filetype(
_, extension = os.path.splitext(_filename)
extension = extension.lower()
if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
mime_type = magic.from_file(
_resolve_symlink(filename or file_filename),
mime=True,
) # type: ignore
import magic
mime_type = magic.from_file(_resolve_symlink(_filename), mime=True)
elif os.path.isfile(_filename):
import filetype as ft
mime_type = ft.guess_mime(filename)
mime_type = ft.guess_mime(_filename)
if mime_type is None:
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
@ -296,6 +80,8 @@ def detect_filetype(
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
# ref: https://github.com/ahupp/python-magic#usage
if LIBMAGIC_AVAILABLE:
import magic
mime_type = magic.from_buffer(file.read(4096), mime=True)
else:
import filetype as ft
@ -324,7 +110,8 @@ def detect_filetype(
else:
return FileType.XML
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
# -- ref: https://www.rfc-editor.org/rfc/rfc822 --
elif mime_type == "message/rfc822" or mime_type.startswith("text"):
if not encoding:
encoding = "utf-8"
formatted_encoding = format_encoding_str(encoding)
@ -414,6 +201,42 @@ def detect_filetype(
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
def is_json_processable(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file_text: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> bool:
"""True when file looks like a JSON array of objects.
Uses regex on a file prefix, so not entirely reliable but good enough if you already know the
file is JSON.
"""
exactly_one(filename=filename, file=file, file_text=file_text)
if file_text is None:
file_text = _read_file_start_for_type_check(
file=file,
filename=filename,
encoding=encoding,
)
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
def _check_eml_from_buffer(file: IO[bytes] | IO[str]) -> bool:
"""Checks if a text/plain file is actually a .eml file.
Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml
file.
"""
file.seek(0)
file_content = file.read(4096)
if isinstance(file_content, bytes):
file_head = file_content.decode("utf-8", errors="ignore")
else:
file_head = file_content
return EMAIL_HEAD_RE.match(file_head) is not None
def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
"""Detects the filetype, given a file with an application/octet-stream MIME type."""
file.seek(0)
@ -421,15 +244,20 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
file.seek(0)
archive = zipfile.ZipFile(file)
# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
# looking for expected filenames within the zip file.
archive_filenames = [f.filename for f in archive.filelist]
if all(f in archive_filenames for f in EXPECTED_DOCX_FILES):
if all(f in archive_filenames for f in ("docProps/core.xml", "word/document.xml")):
return FileType.DOCX
elif all(f in archive_filenames for f in EXPECTED_XLSX_FILES):
elif all(f in archive_filenames for f in ("xl/workbook.xml",)):
return FileType.XLSX
elif all(f in archive_filenames for f in EXPECTED_PPTX_FILES):
elif all(f in archive_filenames for f in ("docProps/core.xml", "ppt/presentation.xml")):
return FileType.PPTX
if LIBMAGIC_AVAILABLE:
import magic
# Infer mime type using magic if octet-stream is not zip file
mime_type = magic.from_buffer(file.read(4096), mime=True)
return STR_TO_FILETYPE.get(mime_type, FileType.UNK)
@ -439,30 +267,55 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
return FileType.UNK
def _read_file_start_for_type_check(
def _is_code_mime_type(mime_type: str) -> bool:
"""True when `mime_type` plausibly indicates a programming language source-code file."""
PROGRAMMING_LANGUAGES = [
"javascript",
"python",
"java",
"c++",
"cpp",
"csharp",
"c#",
"php",
"ruby",
"swift",
"typescript",
]
mime_type = mime_type.lower()
# NOTE(robinson) - check this one explicitly to avoid conflicts with other
# MIME types that contain "go"
if mime_type == "text/x-go":
return True
return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
def _is_text_file_a_csv(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
encoding: Optional[str] = "utf-8",
) -> str:
"""Reads the start of the file and returns the text content."""
exactly_one(filename=filename, file=file)
if file is not None:
file.seek(0)
file_content = file.read(4096)
if isinstance(file_content, str):
file_text = file_content
else:
file_text = file_content.decode(errors="ignore")
file.seek(0)
if filename is not None:
try:
with open(filename, encoding=encoding) as f:
file_text = f.read(4096)
except UnicodeDecodeError:
formatted_encoding, _ = detect_file_encoding(filename=filename)
with open(filename, encoding=formatted_encoding) as f:
file_text = f.read(4096)
return file_text
):
"""Detects if a file that has a text/plain MIME type is a CSV file."""
def count_commas(text: str):
"""Counts the number of commas in a line, excluding commas in quotes."""
pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
matches = re.findall(pattern, text)
return len(matches)
file_text = _read_file_start_for_type_check(
file=file,
filename=filename,
encoding=encoding,
)
lines = file_text.strip().splitlines()
if len(lines) < 2:
return False
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
header_count = count_commas(lines[0])
if any("," not in line for line in lines):
return False
return all(count_commas(line) == header_count for line in lines[1:])
def _is_text_file_a_json(
@ -484,93 +337,48 @@ def _is_text_file_a_json(
# References:
# https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json
# https://www.ietf.org/rfc/rfc4627.txt
if isinstance(output, str):
return False
return True
return not isinstance(output, str)
except json.JSONDecodeError:
return False
def is_json_processable(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file_text: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> bool:
exactly_one(filename=filename, file=file, file_text=file_text)
if file_text is None:
file_text = _read_file_start_for_type_check(
file=file,
filename=filename,
encoding=encoding,
)
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
def _count_commas(text: str):
"""Counts the number of commas in a line, excluding commas in quotes."""
pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
matches = re.findall(pattern, text)
return len(matches)
def _is_text_file_a_csv(
def _read_file_start_for_type_check(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
encoding: Optional[str] = "utf-8",
):
"""Detects if a file that has a text/plain MIME type is a CSV file."""
file_text = _read_file_start_for_type_check(
file=file,
filename=filename,
encoding=encoding,
)
lines = file_text.strip().splitlines()
if len(lines) < 2:
return False
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
header_count = _count_commas(lines[0])
if any("," not in line for line in lines):
return False
return all(_count_commas(line) == header_count for line in lines[1:])
) -> str:
"""Reads the start of the file and returns the text content."""
exactly_one(filename=filename, file=file)
if file is not None:
file.seek(0)
file_content = file.read(4096)
if isinstance(file_content, str):
file_text = file_content
else:
file_text = file_content.decode(errors="ignore")
file.seek(0)
return file_text
# -- guaranteed by `exactly_one()` call --
assert filename is not None
try:
with open(filename, encoding=encoding) as f:
file_text = f.read(4096)
except UnicodeDecodeError:
formatted_encoding, _ = detect_file_encoding(filename=filename)
with open(filename, encoding=formatted_encoding) as f:
file_text = f.read(4096)
return file_text
def _check_eml_from_buffer(file: IO[bytes]) -> bool:
"""Checks if a text/plain file is actually a .eml file. Uses a regex pattern to see if the
start of the file matches the typical pattern for a .eml file."""
file.seek(0)
file_content = file.read(4096)
if isinstance(file_content, bytes):
file_head = file_content.decode("utf-8", errors="ignore")
else:
file_head = file_content
return EMAIL_HEAD_RE.match(file_head) is not None
PROGRAMMING_LANGUAGES = [
"javascript",
"python",
"java",
"c++",
"cpp",
"csharp",
"c#",
"php",
"ruby",
"swift",
"typescript",
]
def _is_code_mime_type(mime_type: str) -> bool:
"""Checks to see if the MIME type is a MIME type that would be used for a code
file."""
mime_type = mime_type.lower()
# NOTE(robinson) - check this one explicitly to avoid conflicts with other
# MIME types that contain "go"
if mime_type == "text/x-go":
return True
return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
def _resolve_symlink(file_path: str) -> str:
"""Resolve `file_path` containing symlink to the actual file path."""
if os.path.islink(file_path):
file_path = os.path.realpath(file_path)
return file_path
_P = ParamSpec("_P")

View File

@ -0,0 +1,252 @@
"""Domain-model for file-types."""
from __future__ import annotations
import enum
class FileType(enum.Enum):
UNK = 0
EMPTY = 1
# MS Office Types
DOC = 10
DOCX = 11
XLS = 12
XLSX = 13
PPT = 14
PPTX = 15
MSG = 16
# Adobe Types
PDF = 20
# Image Types
JPG = 30
PNG = 31
TIFF = 32
BMP = 33
HEIC = 34
# Plain Text Types
EML = 40
RTF = 41
TXT = 42
JSON = 43
CSV = 44
TSV = 45
# Markup Types
HTML = 50
XML = 51
MD = 52
EPUB = 53
RST = 54
ORG = 55
# Compressed Types
ZIP = 60
# Open Office Types
ODT = 70
# Audio Files
WAV = 80
def __lt__(self, other: FileType) -> bool:
"""Makes `FileType` members comparable with relational operators, at least with `<`.
This makes them sortable, in particular it supports sorting for pandas groupby functions.
"""
return self.name < other.name
STR_TO_FILETYPE = {
# -- BMP --
"image/bmp": FileType.BMP,
# -- CSV --
"text/csv": FileType.CSV,
"application/csv": FileType.CSV,
"application/x-csv": FileType.CSV,
"text/comma-separated-values": FileType.CSV,
"text/x-comma-separated-values": FileType.CSV,
"text/x-csv": FileType.CSV,
# -- DOC --
"application/msword": FileType.DOC,
# -- DOCX --
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
# -- EML --
"message/rfc822": FileType.EML,
# -- EPUB --
"application/epub": FileType.EPUB,
"application/epub+zip": FileType.EPUB,
# -- HEIF --
"image/heic": FileType.HEIC,
# -- HTML --
"text/html": FileType.HTML,
# -- JPG --
"image/jpeg": FileType.JPG,
# -- JSON --
"application/json": FileType.JSON,
# -- MD --
"text/markdown": FileType.MD,
"text/x-markdown": FileType.MD,
# -- MSG --
"application/vnd.ms-outlook": FileType.MSG,
"application/x-ole-storage": FileType.MSG,
# -- ODT --
"application/vnd.oasis.opendocument.text": FileType.ODT,
# -- ORG --
"text/org": FileType.ORG,
# -- PDF --
"application/pdf": FileType.PDF,
# -- PNG --
"image/png": FileType.PNG,
# -- PPT --
"application/vnd.ms-powerpoint": FileType.PPT,
# -- PPTX --
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
# -- RST --
"text/x-rst": FileType.RST,
# -- RTF --
"text/rtf": FileType.RTF,
"application/rtf": FileType.RTF,
# -- TIFF --
"image/tiff": FileType.TIFF,
# -- TSV --
"text/tsv": FileType.TSV,
# -- TXT --
"text/plain": FileType.TXT,
# NOTE(robinson) - https://mimetype.io/application/yaml
# In the future, we may have special processing for YAML
# files instead of treating them as plaintext
"text/yaml": FileType.TXT,
"application/x-yaml": FileType.TXT,
"application/yaml": FileType.TXT,
"text/x-yaml": FileType.TXT,
# -- WAV --
# NOTE(robinson) - https://mimetype.io/audio/wav
"audio/wav": FileType.WAV,
"audio/vnd.wav": FileType.WAV,
"audio/vnd.wave": FileType.WAV,
"audio/wave": FileType.WAV,
"audio/x-pn-wav": FileType.WAV,
"audio/x-wav": FileType.WAV,
# -- XLS --
"application/vnd.ms-excel": FileType.XLS,
# -- XLSX --
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
# -- XML --
"application/xml": FileType.XML,
# -- EMPTY --
"inode/x-empty": FileType.EMPTY,
}
# -- MIME-types in STR_TO_FILETYPE that are not the canonical MIME-type for that file-type --
MIMETYPE_ALIASES = (
"application/csv",
"application/epub+zip",
"application/rtf",
"application/x-csv",
"application/x-ole-storage",
"application/x-yaml",
"application/yaml",
"audio/vnd.wav",
"audio/vnd.wave",
"audio/wave",
"audio/x-pn-wav",
"audio/x-wav",
"text/comma-separated-values",
"text/x-comma-separated-values",
"text/x-csv",
"text/x-markdown",
"text/x-yaml",
"text/yaml",
)
FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPE_ALIASES}
EXT_TO_FILETYPE = {
# -- BMP --
".bmp": FileType.BMP,
# -- CSV --
".csv": FileType.CSV,
# -- DOC --
".doc": FileType.DOC,
# -- DOCX --
".docx": FileType.DOCX,
# -- EML --
".eml": FileType.EML,
".p7s": FileType.EML,
# -- EPUB --
".epub": FileType.EPUB,
# -- HEIC --
".heic": FileType.HEIC,
# -- HTML --
".htm": FileType.HTML,
".html": FileType.HTML,
# -- JPG --
".jpeg": FileType.JPG,
".jpg": FileType.JPG,
# -- JSON --
".json": FileType.JSON,
# -- MD --
".md": FileType.MD,
# -- MSG --
".msg": FileType.MSG,
# -- ODT --
".odt": FileType.ODT,
# -- ORG --
".org": FileType.ORG,
# -- PDF --
".pdf": FileType.PDF,
# -- PNG --
".png": FileType.PNG,
# -- PPT --
".ppt": FileType.PPT,
# -- PPTX --
".pptx": FileType.PPTX,
# -- RST --
".rst": FileType.RST,
# -- RTF --
".rtf": FileType.RTF,
# -- TIFF --
".tiff": FileType.TIFF,
# -- TSV --
".tab": FileType.TSV,
".tsv": FileType.TSV,
# -- TXT --
".text": FileType.TXT,
".txt": FileType.TXT,
# NOTE(robinson) - for now we are treating code files as plain text
".c": FileType.TXT,
".cc": FileType.TXT,
".cpp": FileType.TXT,
".cs": FileType.TXT,
".cxx": FileType.TXT,
".go": FileType.TXT,
".java": FileType.TXT,
".js": FileType.TXT,
".log": FileType.TXT,
".php": FileType.TXT,
".py": FileType.TXT,
".rb": FileType.TXT,
".swift": FileType.TXT,
".ts": FileType.TXT,
".yaml": FileType.TXT,
".yml": FileType.TXT,
# -- WAV --
".wav": FileType.WAV,
# -- XLS --
".xls": FileType.XLS,
# -- XLSX --
".xlsx": FileType.XLSX,
# -- XML --
".xml": FileType.XML,
# -- ZIP --
".zip": FileType.ZIP,
# -- UNK --
None: FileType.UNK,
}
PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()

View File

@ -8,13 +8,8 @@ from typing import IO, Any, Callable, Literal, Optional
import requests
from unstructured.documents.elements import DataSourceMetadata, Element
from unstructured.file_utils.filetype import (
FILETYPE_TO_MIMETYPE,
STR_TO_FILETYPE,
FileType,
detect_filetype,
is_json_processable,
)
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
from unstructured.file_utils.model import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.email import partition_email

View File

@ -348,7 +348,7 @@ def add_element_metadata(
return element
def remove_element_metadata(layout_elements) -> list[Element]:
def remove_element_metadata(layout_elements: list[Element]) -> list[Element]:
"""Removes document metadata from the document element.
Document metadata includes information like the filename, source url, and page number.

View File

@ -10,7 +10,8 @@ from oxmsg.attachment import Attachment
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common import (
get_last_modified_date,
@ -98,14 +99,12 @@ class MsgPartitionerOptions:
"""True when message is encrypted."""
# NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content
# is multipart/encrypted (ref: https://www.ietf.org/rfc/rfc2015.txt)
if "encrypted" in self.msg.message_headers.get("Content-Type", ""):
return True
# -- pretty sure we're going to want to dig deeper to discover messages that are encrypted
# -- with something other than PGP.
# - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
# - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
# encryption.
return False
# NOTE(scanny) - pretty sure we're going to want to dig deeper to discover messages that are
# encrypted with something other than PGP.
# - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
# - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
# encryption.
return "encrypted" in self.msg.message_headers.get("Content-Type", "")
@lazyproperty
def metadata_file_path(self) -> str | None: