diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6c2f8011..972d53a9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.4.16-dev0
+
+### Enhancements
+
+* Fall back to using file extensions for filetype detection if `libmagic` is not present
+
 ## 0.4.15
 
 ### Enhancements
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 13afa0c2d..60809215d 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -5,6 +5,7 @@ import zipfile
 
 import magic
 
+import unstructured.file_utils.filetype as filetype
 from unstructured.file_utils.filetype import (
     detect_filetype,
     FileType,
@@ -36,6 +37,27 @@ def test_detect_filetype_from_filename(file, expected):
     assert detect_filetype(filename) == expected
 
 
+@pytest.mark.parametrize(
+    "file, expected",
+    [
+        ("layout-parser-paper-fast.pdf", FileType.PDF),
+        ("fake.docx", FileType.DOCX),
+        ("example.jpg", FileType.JPG),
+        ("fake-text.txt", FileType.TXT),
+        ("fake-email.eml", FileType.EML),
+        ("factbook.xml", FileType.XML),
+        ("example-10k.html", FileType.HTML),
+        ("fake-html.html", FileType.HTML),
+        ("fake-excel.xlsx", FileType.XLSX),
+        ("fake-power-point.pptx", FileType.PPTX),
+    ],
+)
+def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
+    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
+    assert detect_filetype(filename) == expected
+
+
 @pytest.mark.parametrize(
     "file, expected",
     [
@@ -60,6 +82,13 @@ def test_detect_filetype_from_file(file, expected):
         assert detect_filetype(file=f) in expected
 
 
+def test_detect_filetype_from_file_raises_without_libmagic(monkeypatch):
+    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
+    with open(filename, "rb") as f, pytest.raises(ImportError):
+        detect_filetype(file=f)
+
+
 def test_detect_xml_application_xml(monkeypatch):
     monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index f1526787e..deb205f76 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.15"  # pragma: no cover
+__version__ = "0.4.16-dev0"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index 39cd49dc9..68909f274 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -3,7 +3,13 @@ import os
 from typing import IO, Optional
 import zipfile
 
-import magic
+try:
+    import magic
+
+    LIBMAGIC_AVAILABLE = True
+except ImportError:  # pragma: nocover
+    LIBMAGIC_AVAILABLE = False  # pragma: nocover
+
 
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_HEAD_RE
@@ -91,7 +97,9 @@ EXT_TO_FILETYPE = {
     ".pdf": FileType.PDF,
     ".docx": FileType.DOCX,
     ".jpg": FileType.JPG,
+    ".jpeg": FileType.JPG,
     ".txt": FileType.TXT,
+    ".text": FileType.TXT,
     ".eml": FileType.EML,
     ".xml": FileType.XML,
     ".html": FileType.HTML,
@@ -117,13 +125,24 @@ def detect_filetype(
     if filename:
         _, extension = os.path.splitext(filename)
         extension = extension.lower()
-        mime_type = magic.from_file(filename, mime=True)
+        if LIBMAGIC_AVAILABLE:
+            mime_type = magic.from_file(filename, mime=True)
+        else:
+            # Without libmagic the (already lowercased) extension is the only signal used.
+            return EXT_TO_FILETYPE.get(extension, FileType.UNK)
     elif file is not None:
         extension = None
         # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
         # Increased to 4096 because otherwise .xlsx files get detected as a zip file
         # ref: https://github.com/ahupp/python-magic#usage
-        mime_type = magic.from_buffer(file.read(4096), mime=True)
+        if LIBMAGIC_AVAILABLE:
+            mime_type = magic.from_buffer(file.read(4096), mime=True)
+        else:
+            raise ImportError(
+                "libmagic is unavailable. "
+                "Filetype detection on file-like objects requires libmagic. "
+                "Please install libmagic and try again."
+            )
     else:
         raise ValueError("No filename nor file were specified.")
 