mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 04:11:08 +00:00
rfctr(file): improve file-type auto-detect (#3409)
**Summary** In preparation for further work on auto file-type detection, improve `filetype.py` and related modules: improve docstrings, improve type annotations, and extract the domain model to the `.model` module.
This commit is contained in:
parent
48bdf94656
commit
a5c9a3695c
@ -1,4 +1,4 @@
|
||||
## 0.15.0-dev13
|
||||
## 0.15.0-dev14
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import io
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import magic
|
||||
import pytest
|
||||
|
||||
from test_unstructured.unit_utils import (
|
||||
@ -17,7 +18,7 @@ from test_unstructured.unit_utils import (
|
||||
MonkeyPatch,
|
||||
call,
|
||||
example_doc_path,
|
||||
function_mock,
|
||||
method_mock,
|
||||
)
|
||||
from unstructured.file_utils import filetype
|
||||
from unstructured.file_utils.filetype import (
|
||||
@ -44,9 +45,7 @@ is_in_docker = os.path.exists("/.dockerenv")
|
||||
("example-10k.html", FileType.HTML),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
# NOTE(robinson) - currently failing in the docker tests because the detected
|
||||
# MIME type is text/csv
|
||||
# ("stanley-cups.csv", FileType.CSV),
|
||||
("stanley-cups.csv", FileType.CSV),
|
||||
("stanley-cups.tsv", FileType.TSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
@ -111,9 +110,7 @@ def test_detect_filetype_from_filename_with_extension(
|
||||
("example-10k.html", [FileType.HTML, FileType.XML]),
|
||||
("fake-html.html", [FileType.HTML]),
|
||||
("stanley-cups.xlsx", [FileType.XLSX]),
|
||||
# NOTE(robinson) - currently failing in the docker tests because the detected
|
||||
# MIME type is text/csv
|
||||
# ("stanley-cups.csv", [FileType.CSV]),
|
||||
("stanley-cups.csv", [FileType.CSV]),
|
||||
("stanley-cups.tsv", [FileType.TSV]),
|
||||
("fake-power-point.pptx", [FileType.PPTX]),
|
||||
("winter-sports.epub", [FileType.EPUB]),
|
||||
@ -546,11 +543,13 @@ def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock):
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
# -- `from_buffer()` and `from_file()` are not "methods" on `magic` per-se (`magic` is a module)
|
||||
# -- but they behave like methods for mocking purposes.
|
||||
@pytest.fixture()
|
||||
def magic_from_buffer_(request: FixtureRequest):
|
||||
return function_mock(request, "unstructured.file_utils.filetype.magic.from_buffer")
|
||||
return method_mock(request, magic, "from_buffer")
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def magic_from_file_(request: FixtureRequest):
|
||||
return function_mock(request, "unstructured.file_utils.filetype.magic.from_file")
|
||||
return method_mock(request, magic, "from_file")
|
||||
|
||||
6
typings/filetype/__init__.pyi
Normal file
6
typings/filetype/__init__.pyi
Normal file
@ -0,0 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
from typing import IO
|
||||
|
||||
# -- Type stub for `filetype.guess_mime()`. Per this signature the result is a `str` or `None`;
# -- `detect_filetype()` falls back to the file extension when it gets `None`.
def guess_mime(obj: bytearray | str | bytes | pathlib.PurePath | IO[bytes]) -> str | None: ...
|
||||
6
typings/magic/__init__.pyi
Normal file
6
typings/magic/__init__.pyi
Normal file
@ -0,0 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from os import PathLike
|
||||
|
||||
# -- Type stub for `magic.from_buffer()`. `detect_filetype()` passes the first 4096 bytes of the
# -- file with `mime=True` and uses the returned `str` as the MIME-type.
def from_buffer(buffer: bytes | str, mime: bool = ...) -> str: ...
|
||||
# -- Type stub for `magic.from_file()`. `detect_filetype()` passes a file path with `mime=True`
# -- and uses the returned `str` as the MIME-type.
def from_file(filename: bytes | str | PathLike[str], mime: bool = ...) -> str: ...
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.0-dev13" # pragma: no cover
|
||||
__version__ = "0.15.0-dev14" # pragma: no cover
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
import functools
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@ -12,7 +12,15 @@ from typing_extensions import ParamSpec
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||
from unstructured.file_utils.model import (
|
||||
EXT_TO_FILETYPE,
|
||||
FILETYPE_TO_MIMETYPE,
|
||||
PLAIN_TEXT_EXTENSIONS,
|
||||
STR_TO_FILETYPE,
|
||||
FileType,
|
||||
)
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
|
||||
from unstructured.partition.common import (
|
||||
add_element_metadata,
|
||||
exactly_one,
|
||||
@ -21,232 +29,7 @@ from unstructured.partition.common import (
|
||||
)
|
||||
from unstructured.utils import get_call_args_applying_defaults
|
||||
|
||||
try:
|
||||
import magic
|
||||
|
||||
LIBMAGIC_AVAILABLE = True
|
||||
except ImportError: # pragma: nocover
|
||||
LIBMAGIC_AVAILABLE = False # pragma: nocover
|
||||
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
||||
|
||||
TXT_MIME_TYPES = [
|
||||
"text/plain",
|
||||
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
|
||||
]
|
||||
|
||||
# NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
|
||||
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
|
||||
# looking for expected filenames within the zip file.
|
||||
EXPECTED_DOCX_FILES = [
|
||||
"docProps/core.xml",
|
||||
"word/document.xml",
|
||||
]
|
||||
|
||||
EXPECTED_XLSX_FILES = [
|
||||
"xl/workbook.xml",
|
||||
]
|
||||
|
||||
EXPECTED_PPTX_FILES = [
|
||||
"docProps/core.xml",
|
||||
"ppt/presentation.xml",
|
||||
]
|
||||
|
||||
|
||||
class FileType(enum.Enum):
|
||||
UNK = 0
|
||||
EMPTY = 1
|
||||
|
||||
# MS Office Types
|
||||
DOC = 10
|
||||
DOCX = 11
|
||||
XLS = 12
|
||||
XLSX = 13
|
||||
PPT = 14
|
||||
PPTX = 15
|
||||
MSG = 16
|
||||
|
||||
# Adobe Types
|
||||
PDF = 20
|
||||
|
||||
# Image Types
|
||||
JPG = 30
|
||||
PNG = 31
|
||||
TIFF = 32
|
||||
BMP = 33
|
||||
HEIC = 34
|
||||
|
||||
# Plain Text Types
|
||||
EML = 40
|
||||
RTF = 41
|
||||
TXT = 42
|
||||
JSON = 43
|
||||
CSV = 44
|
||||
TSV = 45
|
||||
|
||||
# Markup Types
|
||||
HTML = 50
|
||||
XML = 51
|
||||
MD = 52
|
||||
EPUB = 53
|
||||
RST = 54
|
||||
ORG = 55
|
||||
|
||||
# Compressed Types
|
||||
ZIP = 60
|
||||
|
||||
# Open Office Types
|
||||
ODT = 70
|
||||
|
||||
# Audio Files
|
||||
WAV = 80
|
||||
|
||||
# NOTE(robinson) - This is to support sorting for pandas groupby functions
|
||||
def __lt__(self, other):
|
||||
return self.name < other.name
|
||||
|
||||
|
||||
STR_TO_FILETYPE = {
|
||||
"application/pdf": FileType.PDF,
|
||||
"application/msword": FileType.DOC,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
|
||||
"image/jpeg": FileType.JPG,
|
||||
"image/png": FileType.PNG,
|
||||
"image/heic": FileType.HEIC,
|
||||
"image/tiff": FileType.TIFF,
|
||||
"image/bmp": FileType.BMP,
|
||||
# NOTE(robinson) - https://mimetype.io/application/yaml
|
||||
# In the future, we may have special processing for YAML
|
||||
# files instead of treating them as plaintext
|
||||
"application/yaml": FileType.TXT,
|
||||
"application/x-yaml": FileType.TXT,
|
||||
"text/x-yaml": FileType.TXT,
|
||||
"text/yaml": FileType.TXT,
|
||||
"text/plain": FileType.TXT,
|
||||
"text/x-csv": FileType.CSV,
|
||||
"application/csv": FileType.CSV,
|
||||
"application/x-csv": FileType.CSV,
|
||||
"text/comma-separated-values": FileType.CSV,
|
||||
"text/x-comma-separated-values": FileType.CSV,
|
||||
"text/csv": FileType.CSV,
|
||||
"text/tsv": FileType.TSV,
|
||||
"text/markdown": FileType.MD,
|
||||
"text/x-markdown": FileType.MD,
|
||||
"text/org": FileType.ORG,
|
||||
"text/x-rst": FileType.RST,
|
||||
"application/epub": FileType.EPUB,
|
||||
"application/epub+zip": FileType.EPUB,
|
||||
"application/json": FileType.JSON,
|
||||
"application/rtf": FileType.RTF,
|
||||
"text/rtf": FileType.RTF,
|
||||
"text/html": FileType.HTML,
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
|
||||
"application/vnd.ms-excel": FileType.XLS,
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
|
||||
"application/vnd.ms-powerpoint": FileType.PPT,
|
||||
"application/xml": FileType.XML,
|
||||
"application/vnd.oasis.opendocument.text": FileType.ODT,
|
||||
"message/rfc822": FileType.EML,
|
||||
"application/x-ole-storage": FileType.MSG,
|
||||
"application/vnd.ms-outlook": FileType.MSG,
|
||||
# NOTE(robinson) - https://mimetype.io/audio/wav
|
||||
"audio/vnd.wav": FileType.WAV,
|
||||
"audio/vnd.wave": FileType.WAV,
|
||||
"audio/wave": FileType.WAV,
|
||||
"audio/x-pn-wav": FileType.WAV,
|
||||
"audio/x-wav": FileType.WAV,
|
||||
"inode/x-empty": FileType.EMPTY,
|
||||
}
|
||||
|
||||
MIMETYPES_TO_EXCLUDE = [
|
||||
"text/x-markdown",
|
||||
"application/epub+zip",
|
||||
"text/x-csv",
|
||||
"application/csv",
|
||||
"application/x-csv",
|
||||
"text/comma-separated-values",
|
||||
"text/x-comma-separated-values",
|
||||
]
|
||||
|
||||
FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPES_TO_EXCLUDE}
|
||||
|
||||
EXT_TO_FILETYPE = {
|
||||
".pdf": FileType.PDF,
|
||||
".docx": FileType.DOCX,
|
||||
".jpg": FileType.JPG,
|
||||
".jpeg": FileType.JPG,
|
||||
".txt": FileType.TXT,
|
||||
".text": FileType.TXT,
|
||||
".log": FileType.TXT,
|
||||
".eml": FileType.EML,
|
||||
".xml": FileType.XML,
|
||||
".heic": FileType.HEIC,
|
||||
".htm": FileType.HTML,
|
||||
".html": FileType.HTML,
|
||||
".md": FileType.MD,
|
||||
".org": FileType.ORG,
|
||||
".rst": FileType.RST,
|
||||
".xlsx": FileType.XLSX,
|
||||
".pptx": FileType.PPTX,
|
||||
".p7s": FileType.EML,
|
||||
".png": FileType.PNG,
|
||||
".doc": FileType.DOC,
|
||||
".zip": FileType.ZIP,
|
||||
".xls": FileType.XLS,
|
||||
".ppt": FileType.PPT,
|
||||
".rtf": FileType.RTF,
|
||||
".json": FileType.JSON,
|
||||
".epub": FileType.EPUB,
|
||||
".msg": FileType.MSG,
|
||||
".odt": FileType.ODT,
|
||||
".csv": FileType.CSV,
|
||||
".tsv": FileType.TSV,
|
||||
".tab": FileType.TSV,
|
||||
".tiff": FileType.TIFF,
|
||||
".bmp": FileType.BMP,
|
||||
".wav": FileType.WAV,
|
||||
# NOTE(robinson) - for now we are treating code files as plain text
|
||||
".js": FileType.TXT,
|
||||
".py": FileType.TXT,
|
||||
".java": FileType.TXT,
|
||||
".cpp": FileType.TXT,
|
||||
".cc": FileType.TXT,
|
||||
".cxx": FileType.TXT,
|
||||
".c": FileType.TXT,
|
||||
".cs": FileType.TXT,
|
||||
".php": FileType.TXT,
|
||||
".rb": FileType.TXT,
|
||||
".swift": FileType.TXT,
|
||||
".ts": FileType.TXT,
|
||||
".go": FileType.TXT,
|
||||
".yaml": FileType.TXT,
|
||||
".yml": FileType.TXT,
|
||||
None: FileType.UNK,
|
||||
}
|
||||
|
||||
PLAIN_TEXT_EXTENSIONS = [
|
||||
".txt",
|
||||
".text",
|
||||
".eml",
|
||||
".p7s",
|
||||
".md",
|
||||
".rtf",
|
||||
".html",
|
||||
".rst",
|
||||
".org",
|
||||
".csv",
|
||||
".tsv",
|
||||
".tab",
|
||||
".json",
|
||||
]
|
||||
|
||||
|
||||
def _resolve_symlink(file_path):
|
||||
# Resolve the symlink to get the actual file path
|
||||
if os.path.islink(file_path):
|
||||
file_path = os.path.realpath(file_path)
|
||||
return file_path
|
||||
LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
|
||||
|
||||
|
||||
def detect_filetype(
|
||||
@ -256,8 +39,10 @@ def detect_filetype(
|
||||
file_filename: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> Optional[FileType]:
|
||||
"""Use libmagic to determine a file's type. Helps determine which partition brick
|
||||
to use for a given file. A return value of None indicates a non-supported file type.
|
||||
"""Use libmagic to determine a file's type.
|
||||
|
||||
Helps determine which partition brick to use for a given file. A return value of None indicates
|
||||
a non-supported file type.
|
||||
"""
|
||||
mime_type = None
|
||||
exactly_one(filename=filename, file=file)
|
||||
@ -275,14 +60,13 @@ def detect_filetype(
|
||||
_, extension = os.path.splitext(_filename)
|
||||
extension = extension.lower()
|
||||
if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
|
||||
mime_type = magic.from_file(
|
||||
_resolve_symlink(filename or file_filename),
|
||||
mime=True,
|
||||
) # type: ignore
|
||||
import magic
|
||||
|
||||
mime_type = magic.from_file(_resolve_symlink(_filename), mime=True)
|
||||
elif os.path.isfile(_filename):
|
||||
import filetype as ft
|
||||
|
||||
mime_type = ft.guess_mime(filename)
|
||||
mime_type = ft.guess_mime(_filename)
|
||||
if mime_type is None:
|
||||
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
|
||||
|
||||
@ -296,6 +80,8 @@ def detect_filetype(
|
||||
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
|
||||
# ref: https://github.com/ahupp/python-magic#usage
|
||||
if LIBMAGIC_AVAILABLE:
|
||||
import magic
|
||||
|
||||
mime_type = magic.from_buffer(file.read(4096), mime=True)
|
||||
else:
|
||||
import filetype as ft
|
||||
@ -324,7 +110,8 @@ def detect_filetype(
|
||||
else:
|
||||
return FileType.XML
|
||||
|
||||
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
|
||||
# -- ref: https://www.rfc-editor.org/rfc/rfc822 --
|
||||
elif mime_type == "message/rfc822" or mime_type.startswith("text"):
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
@ -414,6 +201,42 @@ def detect_filetype(
|
||||
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
|
||||
|
||||
|
||||
def is_json_processable(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_text: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
) -> bool:
    """True when the content appears to be a JSON array of objects.

    Exactly one of `filename`, `file`, or `file_text` must be provided. When `file_text` is not
    given, the start of the file is read and used instead. The check is a regex match against the
    beginning of the text, so it is a heuristic rather than a parse; it is adequate when the
    caller already knows the file is JSON.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)

    text = (
        file_text
        if file_text is not None
        else _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
    )
    # -- a match object is always truthy, so `bool()` is equivalent to `is not None` here --
    return bool(re.match(LIST_OF_DICTS_PATTERN, text))
|
||||
|
||||
|
||||
def _check_eml_from_buffer(file: IO[bytes] | IO[str]) -> bool:
    """True when the start of `file` matches the email-header pattern.

    Used to decide whether a file is actually a .eml file. Rewinds `file`, reads up to 4096
    bytes/characters and matches `EMAIL_HEAD_RE` at the start of that text. Bytes are decoded as
    UTF-8 with undecodable bytes dropped. The file position is left wherever the read ends.
    """
    file.seek(0)
    head = file.read(4096)
    if isinstance(head, bytes):
        head = head.decode("utf-8", errors="ignore")
    return EMAIL_HEAD_RE.match(head) is not None
|
||||
|
||||
|
||||
def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
|
||||
"""Detects the filetype, given a file with an application/octet-stream MIME type."""
|
||||
file.seek(0)
|
||||
@ -421,15 +244,20 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
|
||||
file.seek(0)
|
||||
archive = zipfile.ZipFile(file)
|
||||
|
||||
# NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
|
||||
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
|
||||
# looking for expected filenames within the zip file.
|
||||
archive_filenames = [f.filename for f in archive.filelist]
|
||||
if all(f in archive_filenames for f in EXPECTED_DOCX_FILES):
|
||||
if all(f in archive_filenames for f in ("docProps/core.xml", "word/document.xml")):
|
||||
return FileType.DOCX
|
||||
elif all(f in archive_filenames for f in EXPECTED_XLSX_FILES):
|
||||
elif all(f in archive_filenames for f in ("xl/workbook.xml",)):
|
||||
return FileType.XLSX
|
||||
elif all(f in archive_filenames for f in EXPECTED_PPTX_FILES):
|
||||
elif all(f in archive_filenames for f in ("docProps/core.xml", "ppt/presentation.xml")):
|
||||
return FileType.PPTX
|
||||
|
||||
if LIBMAGIC_AVAILABLE:
|
||||
import magic
|
||||
|
||||
# Infer mime type using magic if octet-stream is not zip file
|
||||
mime_type = magic.from_buffer(file.read(4096), mime=True)
|
||||
return STR_TO_FILETYPE.get(mime_type, FileType.UNK)
|
||||
@ -439,30 +267,55 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
|
||||
return FileType.UNK
|
||||
|
||||
|
||||
def _read_file_start_for_type_check(
|
||||
def _is_code_mime_type(mime_type: str) -> bool:
|
||||
"""True when `mime_type` plausibly indicates a programming language source-code file."""
|
||||
PROGRAMMING_LANGUAGES = [
|
||||
"javascript",
|
||||
"python",
|
||||
"java",
|
||||
"c++",
|
||||
"cpp",
|
||||
"csharp",
|
||||
"c#",
|
||||
"php",
|
||||
"ruby",
|
||||
"swift",
|
||||
"typescript",
|
||||
]
|
||||
mime_type = mime_type.lower()
|
||||
# NOTE(robinson) - check this one explicitly to avoid conflicts with other
|
||||
# MIME types that contain "go"
|
||||
if mime_type == "text/x-go":
|
||||
return True
|
||||
return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
|
||||
|
||||
|
||||
def _is_text_file_a_csv(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> str:
|
||||
"""Reads the start of the file and returns the text content."""
|
||||
exactly_one(filename=filename, file=file)
|
||||
if file is not None:
|
||||
file.seek(0)
|
||||
file_content = file.read(4096)
|
||||
if isinstance(file_content, str):
|
||||
file_text = file_content
|
||||
else:
|
||||
file_text = file_content.decode(errors="ignore")
|
||||
file.seek(0)
|
||||
if filename is not None:
|
||||
try:
|
||||
with open(filename, encoding=encoding) as f:
|
||||
file_text = f.read(4096)
|
||||
except UnicodeDecodeError:
|
||||
formatted_encoding, _ = detect_file_encoding(filename=filename)
|
||||
with open(filename, encoding=formatted_encoding) as f:
|
||||
file_text = f.read(4096)
|
||||
return file_text
|
||||
):
|
||||
"""Detects if a file that has a text/plain MIME type is a CSV file."""
|
||||
|
||||
def count_commas(text: str):
|
||||
"""Counts the number of commas in a line, excluding commas in quotes."""
|
||||
pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
|
||||
matches = re.findall(pattern, text)
|
||||
return len(matches)
|
||||
|
||||
file_text = _read_file_start_for_type_check(
|
||||
file=file,
|
||||
filename=filename,
|
||||
encoding=encoding,
|
||||
)
|
||||
lines = file_text.strip().splitlines()
|
||||
if len(lines) < 2:
|
||||
return False
|
||||
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
|
||||
header_count = count_commas(lines[0])
|
||||
if any("," not in line for line in lines):
|
||||
return False
|
||||
return all(count_commas(line) == header_count for line in lines[1:])
|
||||
|
||||
|
||||
def _is_text_file_a_json(
|
||||
@ -484,93 +337,48 @@ def _is_text_file_a_json(
|
||||
# References:
|
||||
# https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json
|
||||
# https://www.ietf.org/rfc/rfc4627.txt
|
||||
if isinstance(output, str):
|
||||
return False
|
||||
return True
|
||||
return not isinstance(output, str)
|
||||
except json.JSONDecodeError:
|
||||
return False
|
||||
|
||||
|
||||
def is_json_processable(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
file_text: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> bool:
|
||||
exactly_one(filename=filename, file=file, file_text=file_text)
|
||||
if file_text is None:
|
||||
file_text = _read_file_start_for_type_check(
|
||||
file=file,
|
||||
filename=filename,
|
||||
encoding=encoding,
|
||||
)
|
||||
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
||||
|
||||
|
||||
def _count_commas(text: str):
|
||||
"""Counts the number of commas in a line, excluding commas in quotes."""
|
||||
pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
|
||||
matches = re.findall(pattern, text)
|
||||
return len(matches)
|
||||
|
||||
|
||||
def _is_text_file_a_csv(
|
||||
def _read_file_start_for_type_check(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
):
|
||||
"""Detects if a file that has a text/plain MIME type is a CSV file."""
|
||||
file_text = _read_file_start_for_type_check(
|
||||
file=file,
|
||||
filename=filename,
|
||||
encoding=encoding,
|
||||
)
|
||||
lines = file_text.strip().splitlines()
|
||||
if len(lines) < 2:
|
||||
return False
|
||||
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
|
||||
header_count = _count_commas(lines[0])
|
||||
if any("," not in line for line in lines):
|
||||
return False
|
||||
return all(_count_commas(line) == header_count for line in lines[1:])
|
||||
) -> str:
|
||||
"""Reads the start of the file and returns the text content."""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
if file is not None:
|
||||
file.seek(0)
|
||||
file_content = file.read(4096)
|
||||
if isinstance(file_content, str):
|
||||
file_text = file_content
|
||||
else:
|
||||
file_text = file_content.decode(errors="ignore")
|
||||
file.seek(0)
|
||||
return file_text
|
||||
|
||||
# -- guaranteed by `exactly_one()` call --
|
||||
assert filename is not None
|
||||
|
||||
try:
|
||||
with open(filename, encoding=encoding) as f:
|
||||
file_text = f.read(4096)
|
||||
except UnicodeDecodeError:
|
||||
formatted_encoding, _ = detect_file_encoding(filename=filename)
|
||||
with open(filename, encoding=formatted_encoding) as f:
|
||||
file_text = f.read(4096)
|
||||
|
||||
return file_text
|
||||
|
||||
|
||||
def _check_eml_from_buffer(file: IO[bytes]) -> bool:
|
||||
"""Checks if a text/plain file is actually a .eml file. Uses a regex pattern to see if the
|
||||
start of the file matches the typical pattern for a .eml file."""
|
||||
file.seek(0)
|
||||
file_content = file.read(4096)
|
||||
if isinstance(file_content, bytes):
|
||||
file_head = file_content.decode("utf-8", errors="ignore")
|
||||
else:
|
||||
file_head = file_content
|
||||
return EMAIL_HEAD_RE.match(file_head) is not None
|
||||
|
||||
|
||||
PROGRAMMING_LANGUAGES = [
|
||||
"javascript",
|
||||
"python",
|
||||
"java",
|
||||
"c++",
|
||||
"cpp",
|
||||
"csharp",
|
||||
"c#",
|
||||
"php",
|
||||
"ruby",
|
||||
"swift",
|
||||
"typescript",
|
||||
]
|
||||
|
||||
|
||||
def _is_code_mime_type(mime_type: str) -> bool:
|
||||
"""Checks to see if the MIME type is a MIME type that would be used for a code
|
||||
file."""
|
||||
mime_type = mime_type.lower()
|
||||
# NOTE(robinson) - check this one explicitly to avoid conflicts with other
|
||||
# MIME types that contain "go"
|
||||
if mime_type == "text/x-go":
|
||||
return True
|
||||
return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
|
||||
def _resolve_symlink(file_path: str) -> str:
|
||||
"""Resolve `file_path` containing symlink to the actual file path."""
|
||||
if os.path.islink(file_path):
|
||||
file_path = os.path.realpath(file_path)
|
||||
return file_path
|
||||
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
252
unstructured/file_utils/model.py
Normal file
252
unstructured/file_utils/model.py
Normal file
@ -0,0 +1,252 @@
|
||||
"""Domain-model for file-types."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
|
||||
|
||||
class FileType(enum.Enum):
    """The file-types distinguished by file-type auto-detection.

    Member values are integers grouped by family: MS Office types are 1x, Adobe 2x, images 3x,
    plain-text 4x, markup 5x, compressed 6x, Open Office 7x and audio 8x. `UNK` and `EMPTY` are
    special members with values 0 and 1.
    """

    UNK = 0
    EMPTY = 1

    # MS Office Types
    DOC = 10
    DOCX = 11
    XLS = 12
    XLSX = 13
    PPT = 14
    PPTX = 15
    MSG = 16

    # Adobe Types
    PDF = 20

    # Image Types
    JPG = 30
    PNG = 31
    TIFF = 32
    BMP = 33
    HEIC = 34

    # Plain Text Types
    EML = 40
    RTF = 41
    TXT = 42
    JSON = 43
    CSV = 44
    TSV = 45

    # Markup Types
    HTML = 50
    XML = 51
    MD = 52
    EPUB = 53
    RST = 54
    ORG = 55

    # Compressed Types
    ZIP = 60

    # Open Office Types
    ODT = 70

    # Audio Files
    WAV = 80

    def __lt__(self, other: object) -> bool:
        """Order `FileType` members alphabetically by member name.

        Defining `<` makes members sortable; in particular it supports sorting for pandas groupby
        functions.

        Returns `NotImplemented` when `other` is not a `FileType`, so comparing against an
        unrelated object raises the conventional `TypeError` rather than an `AttributeError` from
        reading `other.name`.
        """
        if not isinstance(other, FileType):
            return NotImplemented
        return self.name < other.name
|
||||
|
||||
|
||||
STR_TO_FILETYPE = {
|
||||
# -- BMP --
|
||||
"image/bmp": FileType.BMP,
|
||||
# -- CSV --
|
||||
"text/csv": FileType.CSV,
|
||||
"application/csv": FileType.CSV,
|
||||
"application/x-csv": FileType.CSV,
|
||||
"text/comma-separated-values": FileType.CSV,
|
||||
"text/x-comma-separated-values": FileType.CSV,
|
||||
"text/x-csv": FileType.CSV,
|
||||
# -- DOC --
|
||||
"application/msword": FileType.DOC,
|
||||
# -- DOCX --
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
|
||||
# -- EML --
|
||||
"message/rfc822": FileType.EML,
|
||||
# -- EPUB --
|
||||
"application/epub": FileType.EPUB,
|
||||
"application/epub+zip": FileType.EPUB,
|
||||
# -- HEIC --
|
||||
"image/heic": FileType.HEIC,
|
||||
# -- HTML --
|
||||
"text/html": FileType.HTML,
|
||||
# -- JPG --
|
||||
"image/jpeg": FileType.JPG,
|
||||
# -- JSON --
|
||||
"application/json": FileType.JSON,
|
||||
# -- MD --
|
||||
"text/markdown": FileType.MD,
|
||||
"text/x-markdown": FileType.MD,
|
||||
# -- MSG --
|
||||
"application/vnd.ms-outlook": FileType.MSG,
|
||||
"application/x-ole-storage": FileType.MSG,
|
||||
# -- ODT --
|
||||
"application/vnd.oasis.opendocument.text": FileType.ODT,
|
||||
# -- ORG --
|
||||
"text/org": FileType.ORG,
|
||||
# -- PDF --
|
||||
"application/pdf": FileType.PDF,
|
||||
# -- PNG --
|
||||
"image/png": FileType.PNG,
|
||||
# -- PPT --
|
||||
"application/vnd.ms-powerpoint": FileType.PPT,
|
||||
# -- PPTX --
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
|
||||
# -- RST --
|
||||
"text/x-rst": FileType.RST,
|
||||
# -- RTF --
|
||||
"text/rtf": FileType.RTF,
|
||||
"application/rtf": FileType.RTF,
|
||||
# -- TIFF --
|
||||
"image/tiff": FileType.TIFF,
|
||||
# -- TSV --
|
||||
"text/tsv": FileType.TSV,
|
||||
# -- TXT --
|
||||
"text/plain": FileType.TXT,
|
||||
# NOTE(robinson) - https://mimetype.io/application/yaml
|
||||
# In the future, we may have special processing for YAML
|
||||
# files instead of treating them as plaintext
|
||||
"text/yaml": FileType.TXT,
|
||||
"application/x-yaml": FileType.TXT,
|
||||
"application/yaml": FileType.TXT,
|
||||
"text/x-yaml": FileType.TXT,
|
||||
# -- WAV --
|
||||
# NOTE(robinson) - https://mimetype.io/audio/wav
|
||||
"audio/wav": FileType.WAV,
|
||||
"audio/vnd.wav": FileType.WAV,
|
||||
"audio/vnd.wave": FileType.WAV,
|
||||
"audio/wave": FileType.WAV,
|
||||
"audio/x-pn-wav": FileType.WAV,
|
||||
"audio/x-wav": FileType.WAV,
|
||||
# -- XLS --
|
||||
"application/vnd.ms-excel": FileType.XLS,
|
||||
# -- XLSX --
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
|
||||
# -- XML --
|
||||
"application/xml": FileType.XML,
|
||||
# -- EMPTY --
|
||||
"inode/x-empty": FileType.EMPTY,
|
||||
}
|
||||
|
||||
# -- MIME-types in STR_TO_FILETYPE that are not the canonical MIME-type for that file-type --
|
||||
MIMETYPE_ALIASES = (
|
||||
"application/csv",
|
||||
"application/epub+zip",
|
||||
"application/rtf",
|
||||
"application/x-csv",
|
||||
"application/x-ole-storage",
|
||||
"application/x-yaml",
|
||||
"application/yaml",
|
||||
"audio/vnd.wav",
|
||||
"audio/vnd.wave",
|
||||
"audio/wave",
|
||||
"audio/x-pn-wav",
|
||||
"audio/x-wav",
|
||||
"text/comma-separated-values",
|
||||
"text/x-comma-separated-values",
|
||||
"text/x-csv",
|
||||
"text/x-markdown",
|
||||
"text/x-yaml",
|
||||
"text/yaml",
|
||||
)
|
||||
|
||||
FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPE_ALIASES}
|
||||
|
||||
EXT_TO_FILETYPE = {
|
||||
# -- BMP --
|
||||
".bmp": FileType.BMP,
|
||||
# -- CSV --
|
||||
".csv": FileType.CSV,
|
||||
# -- DOC --
|
||||
".doc": FileType.DOC,
|
||||
# -- DOCX --
|
||||
".docx": FileType.DOCX,
|
||||
# -- EML --
|
||||
".eml": FileType.EML,
|
||||
".p7s": FileType.EML,
|
||||
# -- EPUB --
|
||||
".epub": FileType.EPUB,
|
||||
# -- HEIC --
|
||||
".heic": FileType.HEIC,
|
||||
# -- HTML --
|
||||
".htm": FileType.HTML,
|
||||
".html": FileType.HTML,
|
||||
# -- JPG --
|
||||
".jpeg": FileType.JPG,
|
||||
".jpg": FileType.JPG,
|
||||
# -- JSON --
|
||||
".json": FileType.JSON,
|
||||
# -- MD --
|
||||
".md": FileType.MD,
|
||||
# -- MSG --
|
||||
".msg": FileType.MSG,
|
||||
# -- ODT --
|
||||
".odt": FileType.ODT,
|
||||
# -- ORG --
|
||||
".org": FileType.ORG,
|
||||
# -- PDF --
|
||||
".pdf": FileType.PDF,
|
||||
# -- PNG --
|
||||
".png": FileType.PNG,
|
||||
# -- PPT --
|
||||
".ppt": FileType.PPT,
|
||||
# -- PPTX --
|
||||
".pptx": FileType.PPTX,
|
||||
# -- RST --
|
||||
".rst": FileType.RST,
|
||||
# -- RTF --
|
||||
".rtf": FileType.RTF,
|
||||
# -- TIFF --
|
||||
".tiff": FileType.TIFF,
|
||||
# -- TSV --
|
||||
".tab": FileType.TSV,
|
||||
".tsv": FileType.TSV,
|
||||
# -- TXT --
|
||||
".text": FileType.TXT,
|
||||
".txt": FileType.TXT,
|
||||
# NOTE(robinson) - for now we are treating code files as plain text
|
||||
".c": FileType.TXT,
|
||||
".cc": FileType.TXT,
|
||||
".cpp": FileType.TXT,
|
||||
".cs": FileType.TXT,
|
||||
".cxx": FileType.TXT,
|
||||
".go": FileType.TXT,
|
||||
".java": FileType.TXT,
|
||||
".js": FileType.TXT,
|
||||
".log": FileType.TXT,
|
||||
".php": FileType.TXT,
|
||||
".py": FileType.TXT,
|
||||
".rb": FileType.TXT,
|
||||
".swift": FileType.TXT,
|
||||
".ts": FileType.TXT,
|
||||
".yaml": FileType.TXT,
|
||||
".yml": FileType.TXT,
|
||||
# -- WAV --
|
||||
".wav": FileType.WAV,
|
||||
# -- XLS --
|
||||
".xls": FileType.XLS,
|
||||
# -- XLSX --
|
||||
".xlsx": FileType.XLSX,
|
||||
# -- XML --
|
||||
".xml": FileType.XML,
|
||||
# -- ZIP --
|
||||
".zip": FileType.ZIP,
|
||||
# -- UNK --
|
||||
None: FileType.UNK,
|
||||
}
|
||||
|
||||
# -- file extensions in the plain-text group, in alphabetical order --
PLAIN_TEXT_EXTENSIONS = [
    ".csv",
    ".eml",
    ".html",
    ".json",
    ".md",
    ".org",
    ".p7s",
    ".rst",
    ".rtf",
    ".tab",
    ".text",
    ".tsv",
    ".txt",
]
|
||||
@ -8,13 +8,8 @@ from typing import IO, Any, Callable, Literal, Optional
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import DataSourceMetadata, Element
|
||||
from unstructured.file_utils.filetype import (
|
||||
FILETYPE_TO_MIMETYPE,
|
||||
STR_TO_FILETYPE,
|
||||
FileType,
|
||||
detect_filetype,
|
||||
is_json_processable,
|
||||
)
|
||||
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
||||
from unstructured.file_utils.model import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.email import partition_email
|
||||
|
||||
@ -348,7 +348,7 @@ def add_element_metadata(
|
||||
return element
|
||||
|
||||
|
||||
def remove_element_metadata(layout_elements) -> list[Element]:
|
||||
def remove_element_metadata(layout_elements: list[Element]) -> list[Element]:
|
||||
"""Removes document metadata from the document element.
|
||||
|
||||
Document metadata includes information like the filename, source url, and page number.
|
||||
|
||||
@ -10,7 +10,8 @@ from oxmsg.attachment import Attachment
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import (
|
||||
get_last_modified_date,
|
||||
@ -98,14 +99,12 @@ class MsgPartitionerOptions:
|
||||
"""True when message is encrypted."""
|
||||
# NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content
|
||||
# is multipart/encrypted (ref: https://www.ietf.org/rfc/rfc2015.txt)
|
||||
if "encrypted" in self.msg.message_headers.get("Content-Type", ""):
|
||||
return True
|
||||
# -- pretty sure we're going to want to dig deeper to discover messages that are encrypted
|
||||
# -- with something other than PGP.
|
||||
# - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
|
||||
# - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
|
||||
# encryption.
|
||||
return False
|
||||
# NOTE(scanny) - pretty sure we're going to want to dig deeper to discover messages that are
|
||||
# encrypted with something other than PGP.
|
||||
# - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
|
||||
# - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
|
||||
# encryption.
|
||||
return "encrypted" in self.msg.message_headers.get("Content-Type", "")
|
||||
|
||||
@lazyproperty
|
||||
def metadata_file_path(self) -> str | None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user