feat: detect filetype with extension if libmagic is unavailable (#268)

* included the previous PR changes and verified black
* resolved the issues mentioned
* make tidy and add tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
2025-11-28 00:05:55 +00:00 · 2023-02-24 20:53:29 +05:30 · 2023-02-24 20:53:29 +05:30 · 956f04d770
commit 956f04d770
parent e419ba1d33
4 changed files with 60 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.4.16-dev0
+
+### Enhancements
+
+* Fallback to using file extensions for filetype detection if `libmagic` is not present
+
 ## 0.4.15

 ### Enhancements
@@ -199,3 +205,4 @@ of an email.
 ## 0.2.0

 * Initial release of unstructured
+
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -5,6 +5,7 @@ import zipfile

 import magic

+import unstructured.file_utils.filetype as filetype
 from unstructured.file_utils.filetype import (
    detect_filetype,
    FileType,
@@ -36,6 +37,27 @@ def test_detect_filetype_from_filename(file, expected):
    assert detect_filetype(filename) == expected


+@pytest.mark.parametrize(
+    "file, expected",
+    [
+        ("layout-parser-paper-fast.pdf", FileType.PDF),
+        ("fake.docx", FileType.DOCX),
+        ("example.jpg", FileType.JPG),
+        ("fake-text.txt", FileType.TXT),
+        ("fake-email.eml", FileType.EML),
+        ("factbook.xml", FileType.XML),
+        ("example-10k.html", FileType.HTML),
+        ("fake-html.html", FileType.HTML),
+        ("fake-excel.xlsx", FileType.XLSX),
+        ("fake-power-point.pptx", FileType.PPTX),
+    ],
+)
+def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
+    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
+    assert detect_filetype(filename) == expected
+
+
@pytest.mark.parametrize(
    "file, expected",
    [
@@ -60,6 +82,14 @@ def test_detect_filetype_from_file(file, expected):
        assert detect_filetype(file=f) in expected


+def test_detect_filetype_from_file_raises_without_libmagic(monkeypatch):
+    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
+    with open(filename, "rb") as f:
+        with pytest.raises(ImportError):
+            detect_filetype(file=f)
+
+
 def test_detect_xml_application_xml(monkeypatch):
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
--- a/unstructured/version.py
+++ b/unstructured/version.py
@@ -1 +1 @@
-__version__ = "0.4.15"  # pragma: no cover
+__version__ = "0.4.16-dev0"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -3,7 +3,13 @@ import os
 from typing import IO, Optional
 import zipfile

-import magic
+try:
+    import magic
+
+    LIBMAGIC_AVAILABLE = True
+except ImportError:  # pragma: nocover
+    LIBMAGIC_AVAILABLE = False  # pragma: nocover
+

 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_HEAD_RE
@@ -91,7 +97,9 @@ EXT_TO_FILETYPE = {
    ".pdf": FileType.PDF,
    ".docx": FileType.DOCX,
    ".jpg": FileType.JPG,
+    ".jpeg": FileType.JPG,
    ".txt": FileType.TXT,
+    ".text": FileType.TXT,
    ".eml": FileType.EML,
    ".xml": FileType.XML,
    ".html": FileType.HTML,
@@ -117,13 +125,24 @@ def detect_filetype(
    if filename:
        _, extension = os.path.splitext(filename)
        extension = extension.lower()
+        if LIBMAGIC_AVAILABLE:
+            mime_type = None
            mime_type = magic.from_file(filename, mime=True)
+        else:
+            return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
    elif file is not None:
        extension = None
        # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
        # Increased to 4096 because otherwise .xlsx files get detected as a zip file
        # ref: https://github.com/ahupp/python-magic#usage
+        if LIBMAGIC_AVAILABLE:
            mime_type = magic.from_buffer(file.read(4096), mime=True)
+        else:
+            raise ImportError(
+                "libmagic is unavailable. "
+                "Filetype detection on file-like objects requires libmagic. "
+                "Please install libmagic and try again."
+            )
    else:
        raise ValueError("No filename nor file were specified.")