mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
feat: detect filetype with extension if libmagic is unavailable (#268)
* included the previous PR changes and verified black * resolved the issues mentioned * make tidy and add tests --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io> Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
e419ba1d33
commit
956f04d770
@ -1,3 +1,9 @@
|
||||
## 0.4.16-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Fall back to using file extensions for filetype detection if `libmagic` is not present
|
||||
|
||||
## 0.4.15
|
||||
|
||||
### Enhancements
|
||||
@ -199,3 +205,4 @@ of an email.
|
||||
## 0.2.0
|
||||
|
||||
* Initial release of unstructured
|
||||
|
||||
|
@ -5,6 +5,7 @@ import zipfile
|
||||
|
||||
import magic
|
||||
|
||||
import unstructured.file_utils.filetype as filetype
|
||||
from unstructured.file_utils.filetype import (
|
||||
detect_filetype,
|
||||
FileType,
|
||||
@ -36,6 +37,27 @@ def test_detect_filetype_from_filename(file, expected):
|
||||
assert detect_filetype(filename) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "file, expected",
    [
        ("layout-parser-paper-fast.pdf", FileType.PDF),
        ("fake.docx", FileType.DOCX),
        ("example.jpg", FileType.JPG),
        ("fake-text.txt", FileType.TXT),
        ("fake-email.eml", FileType.EML),
        ("factbook.xml", FileType.XML),
        ("example-10k.html", FileType.HTML),
        ("fake-html.html", FileType.HTML),
        ("fake-excel.xlsx", FileType.XLSX),
        ("fake-power-point.pptx", FileType.PPTX),
    ],
)
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
    """Check filename-based detection when libmagic is flagged as unavailable.

    Each example document is expected to resolve to the file type associated
    with its extension.
    """
    # Pretend the libmagic import failed so the non-libmagic path is exercised.
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)

    example_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
    detected = detect_filetype(example_path)

    assert detected == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"file, expected",
|
||||
[
|
||||
@ -60,6 +82,14 @@ def test_detect_filetype_from_file(file, expected):
|
||||
assert detect_filetype(file=f) in expected
|
||||
|
||||
|
||||
def test_detect_filetype_from_file_raises_without_libmagic(monkeypatch):
    """Check that detection on a file object raises ImportError without libmagic."""
    # Pretend the libmagic import failed; a file-like object carries no
    # filename, so there is no extension to fall back on.
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)

    example_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(example_path, "rb") as stream, pytest.raises(ImportError):
        detect_filetype(file=stream)
|
||||
|
||||
|
||||
def test_detect_xml_application_xml(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.4.15" # pragma: no cover
|
||||
__version__ = "0.4.16-dev0" # pragma: no cover
|
||||
|
@ -3,7 +3,13 @@ import os
|
||||
from typing import IO, Optional
|
||||
import zipfile
|
||||
|
||||
import magic
|
||||
try:
|
||||
import magic
|
||||
|
||||
LIBMAGIC_AVAILABLE = True
|
||||
except ImportError: # pragma: nocover
|
||||
LIBMAGIC_AVAILABLE = False # pragma: nocover
|
||||
|
||||
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
||||
@ -91,7 +97,9 @@ EXT_TO_FILETYPE = {
|
||||
".pdf": FileType.PDF,
|
||||
".docx": FileType.DOCX,
|
||||
".jpg": FileType.JPG,
|
||||
".jpeg": FileType.JPG,
|
||||
".txt": FileType.TXT,
|
||||
".text": FileType.TXT,
|
||||
".eml": FileType.EML,
|
||||
".xml": FileType.XML,
|
||||
".html": FileType.HTML,
|
||||
@ -117,13 +125,24 @@ def detect_filetype(
|
||||
if filename:
|
||||
_, extension = os.path.splitext(filename)
|
||||
extension = extension.lower()
|
||||
if LIBMAGIC_AVAILABLE:
|
||||
mime_type = None
|
||||
mime_type = magic.from_file(filename, mime=True)
|
||||
else:
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
|
||||
elif file is not None:
|
||||
extension = None
|
||||
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
|
||||
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
|
||||
# ref: https://github.com/ahupp/python-magic#usage
|
||||
if LIBMAGIC_AVAILABLE:
|
||||
mime_type = magic.from_buffer(file.read(4096), mime=True)
|
||||
else:
|
||||
raise ImportError(
|
||||
"libmagic is unavailable. "
|
||||
"Filetype detection on file-like objects requires libmagic. "
|
||||
"Please install libmagic and try again."
|
||||
)
|
||||
else:
|
||||
raise ValueError("No filename nor file were specified.")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user