feat: detect filetype with extension if libmagic is unavailable (#268)

* included the previous PR changes and verified black

* resolved the issues mentioned

* make tidy and add tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
grungyfeline998 2023-02-24 20:53:29 +05:30 committed by GitHub
parent e419ba1d33
commit 956f04d770
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 60 additions and 4 deletions

View File

@ -1,3 +1,9 @@
## 0.4.16-dev0
### Enhancements
* Fall back to using file extensions for filetype detection if `libmagic` is not present
## 0.4.15
### Enhancements
@ -199,3 +205,4 @@ of an email.
## 0.2.0
* Initial release of unstructured

View File

@ -5,6 +5,7 @@ import zipfile
import magic
import unstructured.file_utils.filetype as filetype
from unstructured.file_utils.filetype import (
detect_filetype,
FileType,
@ -36,6 +37,27 @@ def test_detect_filetype_from_filename(file, expected):
assert detect_filetype(filename) == expected
@pytest.mark.parametrize(
    ("file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", FileType.PDF),
        ("fake.docx", FileType.DOCX),
        ("example.jpg", FileType.JPG),
        ("fake-text.txt", FileType.TXT),
        ("fake-email.eml", FileType.EML),
        ("factbook.xml", FileType.XML),
        ("example-10k.html", FileType.HTML),
        ("fake-html.html", FileType.HTML),
        ("fake-excel.xlsx", FileType.XLSX),
        ("fake-power-point.pptx", FileType.PPTX),
    ],
)
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
    """Check filename-based detection when libmagic is flagged as unavailable.

    Each example document must resolve to the expected ``FileType`` with the
    ``LIBMAGIC_AVAILABLE`` flag forced to ``False``.
    """
    # Simulate a missing libmagic by flipping the module-level flag for this test only.
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
    detected = detect_filetype(path)
    assert detected == expected
@pytest.mark.parametrize(
"file, expected",
[
@ -60,6 +82,14 @@ def test_detect_filetype_from_file(file, expected):
assert detect_filetype(file=f) in expected
def test_detect_filetype_from_file_raises_without_libmagic(monkeypatch):
    """Check that detection from a file object raises ImportError without libmagic."""
    # Simulate a missing libmagic by flipping the module-level flag for this test only.
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    # Only the open file object is passed, never the filename.
    with open(text_path, "rb") as f, pytest.raises(ImportError):
        detect_filetype(file=f)
def test_detect_xml_application_xml(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")

View File

@ -1 +1 @@
__version__ = "0.4.15" # pragma: no cover
__version__ = "0.4.16-dev0" # pragma: no cover

View File

@ -3,7 +3,13 @@ import os
from typing import IO, Optional
import zipfile
import magic
try:
import magic
LIBMAGIC_AVAILABLE = True
except ImportError: # pragma: nocover
LIBMAGIC_AVAILABLE = False # pragma: nocover
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE
@ -91,7 +97,9 @@ EXT_TO_FILETYPE = {
".pdf": FileType.PDF,
".docx": FileType.DOCX,
".jpg": FileType.JPG,
".jpeg": FileType.JPG,
".txt": FileType.TXT,
".text": FileType.TXT,
".eml": FileType.EML,
".xml": FileType.XML,
".html": FileType.HTML,
@ -117,13 +125,24 @@ def detect_filetype(
if filename:
_, extension = os.path.splitext(filename)
extension = extension.lower()
if LIBMAGIC_AVAILABLE:
mime_type = None
mime_type = magic.from_file(filename, mime=True)
else:
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
elif file is not None:
extension = None
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
# ref: https://github.com/ahupp/python-magic#usage
if LIBMAGIC_AVAILABLE:
mime_type = magic.from_buffer(file.read(4096), mime=True)
else:
raise ImportError(
"libmagic is unavailable. "
"Filetype detection on file-like objects requires libmagic. "
"Please install libmagic and try again."
)
else:
raise ValueError("No filename nor file were specified.")