refactor: update detect_filetype() to use hashmap for mime type return (#591)

* Update detect_filetype() to use hashmap for mime type return

* fix: text mime type and linting

* fix: declare docx and xlsx mime types locally and also fix linting

* Update CHANGELOG.md

* tweaks for failing tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
Eu Jin Marcus Yatim 2023-05-17 21:48:52 +08:00 committed by GitHub
parent f4f40f58e3
commit 7eac1f8ca7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 79 additions and 108 deletions

View File

@ -7,6 +7,8 @@
partition strategy in CLI. For example, `--partition-strategy fast`.
* Added metadata for filetype.
* Add Discord connector to pull messages from a list of channels
* Refactor `unstructured/file_utils/filetype.py` to better utilise a hashmap to map MIME types to file types.
* Add local declaration of DOCX_MIME_TYPES and XLSX_MIME_TYPES for `test_filetype.py`.
### Features

View File

@ -7,8 +7,6 @@ import pytest
from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import (
DOCX_MIME_TYPES,
XLSX_MIME_TYPES,
FileType,
_is_text_file_a_json,
detect_filetype,
@ -17,6 +15,14 @@ from unstructured.file_utils.filetype import (
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")
DOCX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
XLSX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
@pytest.mark.parametrize(
("file", "expected"),
@ -142,7 +148,11 @@ def test_detect_html_text_xml(monkeypatch):
def test_detect_docx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_buffer",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
@ -150,7 +160,11 @@ def test_detect_docx_filetype_application_octet_stream(monkeypatch):
def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_file",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
filetype = detect_filetype(filename=filename)
assert filetype == FileType.DOCX
@ -173,25 +187,41 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
def test_detect_doc_file_from_mime_type(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
monkeypatch.setattr(
magic,
"from_file",
lambda *args, **kwargs: "application/msword",
)
filetype = detect_filetype(filename="fake.doc")
assert filetype == FileType.DOC
def test_detect_ppt_file_from_mime_type(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
monkeypatch.setattr(
magic,
"from_file",
lambda *args, **kwargs: "application/vnd.ms-powerpoint",
)
filetype = detect_filetype(filename="fake.ppt")
assert filetype == FileType.PPT
def test_detect_xls_file_from_mime_type(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
monkeypatch.setattr(
magic,
"from_file",
lambda *args, **kwargs: "application/vnd.ms-excel",
)
filetype = detect_filetype(filename="fake.xls")
assert filetype == FileType.XLS
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_buffer",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
@ -199,14 +229,22 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_file",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
filetype = detect_filetype(filename=filename)
assert filetype == FileType.XLSX
def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_buffer",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
@ -214,14 +252,22 @@ def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_file",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
filetype = detect_filetype(filename=filename)
assert filetype == FileType.PPTX
def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
monkeypatch.setattr(
magic,
"from_buffer",
lambda *args, **kwargs: "application/octet-stream",
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)

View File

@ -20,54 +20,11 @@ except ImportError: # pragma: nocover
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE
DOCX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
DOC_MIME_TYPES = [
"application/msword",
]
ODT_MIME_TYPES = [
"application/vnd.oasis.opendocument.text",
]
XLSX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
XLS_MIME_TYPES = [
"application/vnd.ms-excel",
]
PPTX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
]
PPT_MIME_TYPES = [
"application/vnd.ms-powerpoint",
]
MSG_MIME_TYPES = [
"application/vnd.ms-outlook",
"application/x-ole-storage",
]
TXT_MIME_TYPES = [
"text/plain",
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
]
MD_MIME_TYPES = [
"text/markdown",
"text/x-markdown",
]
EPUB_MIME_TYPES = [
"application/epub",
"application/epub+zip",
]
# NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
# looking for expected filenames within the zip file.
@ -141,6 +98,7 @@ STR_TO_FILETYPE = {
"application/epub+zip": FileType.EPUB,
"application/json": FileType.JSON,
"application/rtf": FileType.RTF,
"text/rtf": FileType.RTF,
"text/html": FileType.HTML,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
"application/vnd.ms-excel": FileType.XLS,
@ -149,6 +107,7 @@ STR_TO_FILETYPE = {
"application/xml": FileType.XML,
"application/vnd.oasis.opendocument.text": FileType.ODT,
"message/rfc822": FileType.EML,
"application/x-ole-storage": FileType.MSG,
"application/vnd.ms-outlook": FileType.MSG,
}
@ -206,13 +165,9 @@ def detect_filetype(
extension = extension.lower()
if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
mime_type = magic.from_file(filename or file_filename, mime=True) # type: ignore
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"
# rather than "application/json". This corrects for that case.
if mime_type == "text/plain" and extension == ".json":
return FileType.JSON
else:
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
elif file is not None:
extension = None
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
@ -229,51 +184,20 @@ def detect_filetype(
else:
raise ValueError("No filename, file, nor file_filename were specified.")
if mime_type == "application/pdf":
return FileType.PDF
"""Mime type special cases."""
elif mime_type == "application/json":
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"
# rather than "application/json". This corrects for that case.
if mime_type == "text/plain" and extension == ".json":
return FileType.JSON
elif mime_type in DOCX_MIME_TYPES:
return FileType.DOCX
elif mime_type in DOC_MIME_TYPES:
return FileType.DOC
elif mime_type in ODT_MIME_TYPES:
return FileType.ODT
elif mime_type in MSG_MIME_TYPES:
return FileType.MSG
elif mime_type == "image/jpeg":
return FileType.JPG
elif mime_type == "image/png":
return FileType.PNG
elif mime_type in MD_MIME_TYPES:
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
return FileType.MD
elif mime_type in EPUB_MIME_TYPES:
return FileType.EPUB
# NOTE(robinson) - examples are application/rtf or text/rtf.
# magic often returns text/plain for RTF files
elif mime_type.endswith("rtf"):
return FileType.RTF
elif mime_type.endswith("xml"):
if extension and (extension == ".html" or extension == ".htm"):
return FileType.HTML
else:
return FileType.XML
elif mime_type == "text/html":
return FileType.HTML
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
if extension and extension == ".eml":
return FileType.EML
@ -281,26 +205,21 @@ def detect_filetype(
return FileType.MD
elif extension and extension == ".rtf":
return FileType.RTF
elif extension and extension == ".html":
return FileType.HTML
if _is_text_file_a_json(file=file, filename=filename):
return FileType.JSON
if file and not extension and _check_eml_from_buffer(file=file) is True:
return FileType.EML
# Safety catch
if mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]
return FileType.TXT
elif mime_type in XLSX_MIME_TYPES:
return FileType.XLSX
elif mime_type in XLS_MIME_TYPES:
return FileType.XLS
elif mime_type in PPTX_MIME_TYPES:
return FileType.PPTX
elif mime_type in PPT_MIME_TYPES:
return FileType.PPT
elif mime_type == "application/octet-stream":
if file and not extension:
return _detect_filetype_from_octet_stream(file=file)
@ -321,6 +240,10 @@ def detect_filetype(
else:
return EXT_TO_FILETYPE.get(extension.lower(), filetype)
# For everything else
elif mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]
logger.warning(
f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
"This file type is not currently supported in unstructured.",