mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
feat: detect filetype with extension if libmagic is unavailable (#268)
* included the previous PR changes and verified black
* resolved the issues mentioned
* make tidy and add tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
e419ba1d33
commit
956f04d770
@ -1,3 +1,9 @@
|
|||||||
|
## 0.4.16-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
* Fall back to using file extensions for filetype detection if `libmagic` is not present
|
||||||
|
|
||||||
## 0.4.15
|
## 0.4.15
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
@ -199,3 +205,4 @@ of an email.
|
|||||||
## 0.2.0
|
## 0.2.0
|
||||||
|
|
||||||
* Initial release of unstructured
|
* Initial release of unstructured
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ import zipfile
|
|||||||
|
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
|
import unstructured.file_utils.filetype as filetype
|
||||||
from unstructured.file_utils.filetype import (
|
from unstructured.file_utils.filetype import (
|
||||||
detect_filetype,
|
detect_filetype,
|
||||||
FileType,
|
FileType,
|
||||||
@ -36,6 +37,27 @@ def test_detect_filetype_from_filename(file, expected):
|
|||||||
assert detect_filetype(filename) == expected
|
assert detect_filetype(filename) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"file, expected",
|
||||||
|
[
|
||||||
|
("layout-parser-paper-fast.pdf", FileType.PDF),
|
||||||
|
("fake.docx", FileType.DOCX),
|
||||||
|
("example.jpg", FileType.JPG),
|
||||||
|
("fake-text.txt", FileType.TXT),
|
||||||
|
("fake-email.eml", FileType.EML),
|
||||||
|
("factbook.xml", FileType.XML),
|
||||||
|
("example-10k.html", FileType.HTML),
|
||||||
|
("fake-html.html", FileType.HTML),
|
||||||
|
("fake-excel.xlsx", FileType.XLSX),
|
||||||
|
("fake-power-point.pptx", FileType.PPTX),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
|
||||||
|
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
|
||||||
|
assert detect_filetype(filename) == expected
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"file, expected",
|
"file, expected",
|
||||||
[
|
[
|
||||||
@ -60,6 +82,14 @@ def test_detect_filetype_from_file(file, expected):
|
|||||||
assert detect_filetype(file=f) in expected
|
assert detect_filetype(file=f) in expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_filetype_from_file_raises_without_libmagic(monkeypatch):
|
||||||
|
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
with pytest.raises(ImportError):
|
||||||
|
detect_filetype(file=f)
|
||||||
|
|
||||||
|
|
||||||
def test_detect_xml_application_xml(monkeypatch):
|
def test_detect_xml_application_xml(monkeypatch):
|
||||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
|
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.4.15" # pragma: no cover
|
__version__ = "0.4.16-dev0" # pragma: no cover
|
||||||
|
@ -3,8 +3,14 @@ import os
|
|||||||
from typing import IO, Optional
|
from typing import IO, Optional
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
|
try:
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
|
LIBMAGIC_AVAILABLE = True
|
||||||
|
except ImportError: # pragma: nocover
|
||||||
|
LIBMAGIC_AVAILABLE = False # pragma: nocover
|
||||||
|
|
||||||
|
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
||||||
|
|
||||||
@ -91,7 +97,9 @@ EXT_TO_FILETYPE = {
|
|||||||
".pdf": FileType.PDF,
|
".pdf": FileType.PDF,
|
||||||
".docx": FileType.DOCX,
|
".docx": FileType.DOCX,
|
||||||
".jpg": FileType.JPG,
|
".jpg": FileType.JPG,
|
||||||
|
".jpeg": FileType.JPG,
|
||||||
".txt": FileType.TXT,
|
".txt": FileType.TXT,
|
||||||
|
".text": FileType.TXT,
|
||||||
".eml": FileType.EML,
|
".eml": FileType.EML,
|
||||||
".xml": FileType.XML,
|
".xml": FileType.XML,
|
||||||
".html": FileType.HTML,
|
".html": FileType.HTML,
|
||||||
@ -117,13 +125,24 @@ def detect_filetype(
|
|||||||
if filename:
|
if filename:
|
||||||
_, extension = os.path.splitext(filename)
|
_, extension = os.path.splitext(filename)
|
||||||
extension = extension.lower()
|
extension = extension.lower()
|
||||||
|
if LIBMAGIC_AVAILABLE:
|
||||||
|
mime_type = None
|
||||||
mime_type = magic.from_file(filename, mime=True)
|
mime_type = magic.from_file(filename, mime=True)
|
||||||
|
else:
|
||||||
|
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
|
||||||
elif file is not None:
|
elif file is not None:
|
||||||
extension = None
|
extension = None
|
||||||
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
|
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
|
||||||
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
|
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
|
||||||
# ref: https://github.com/ahupp/python-magic#usage
|
# ref: https://github.com/ahupp/python-magic#usage
|
||||||
|
if LIBMAGIC_AVAILABLE:
|
||||||
mime_type = magic.from_buffer(file.read(4096), mime=True)
|
mime_type = magic.from_buffer(file.read(4096), mime=True)
|
||||||
|
else:
|
||||||
|
raise ImportError(
|
||||||
|
"libmagic is unavailable. "
|
||||||
|
"Filetype detection on file-like objects requires libmagic. "
|
||||||
|
"Please install libmagic and try again."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError("No filename nor file were specified.")
|
raise ValueError("No filename nor file were specified.")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user