mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix(file): fix OLE-based file-type auto-detection (#3437)
**Summary** A DOC, PPT, or XLS file sent to partition() as a file-like object is misidentified as a MSG file and raises an exception in python-oxmsg (which is used to process MSG files). **Fix** DOC, PPT, XLS, and MSG are all Microsoft OLE-based files, aka. Compound File Binary Format (CFBF). These can be reliably distinguished by inspecting magic bytes in certain locations. `libmagic` is unreliable at this or doesn't try, reporting the generic `"application/x-ole-storage"` which corresponds to the "container" CFBF format (vaguely like a Microsoft Zip format) that all these document types are stored in. Unconditionally use `filetype.guess_mime()` provided by the `filetype` package that is part of the base unstructured install. Unlike `libmagic`, this package reliably detects the distinguished MIME-type (e.g. `"application/msword"`) for OLE file subtypes. Fixes #3364
This commit is contained in:
parent
432d209c36
commit
4e61acc1c6
@ -1,4 +1,4 @@
|
|||||||
## 0.15.1-dev4
|
## 0.15.1-dev5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -14,6 +14,7 @@
|
|||||||
* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
|
* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
|
||||||
* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
|
* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
|
||||||
* **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is for a different MS-Office 2007+ type.
|
* **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is for a different MS-Office 2007+ type.
|
||||||
|
* **DOC, PPT, XLS, and MSG files are now auto-detected correctly.** Resolves a bug where DOC, PPT, and XLS files were auto-detected as MSG files under certain circumstances.
|
||||||
|
|
||||||
## 0.15.0
|
## 0.15.0
|
||||||
|
|
||||||
|
5
example-docs/not-unstructured-payload.json
Normal file
5
example-docs/not-unstructured-payload.json
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"id": "Sample-1",
|
||||||
|
"name": "Sample 1",
|
||||||
|
"description": "This is sample data #1"
|
||||||
|
}
|
@ -19,9 +19,11 @@ from test_unstructured.unit_utils import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import (
|
from unstructured.file_utils.filetype import (
|
||||||
_FileTypeDetectionContext,
|
_FileTypeDetectionContext,
|
||||||
|
_OleFileDifferentiator,
|
||||||
_TextFileDifferentiator,
|
_TextFileDifferentiator,
|
||||||
_ZipFileDifferentiator,
|
_ZipFileDifferentiator,
|
||||||
detect_filetype,
|
detect_filetype,
|
||||||
|
is_json_processable,
|
||||||
)
|
)
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
|
|
||||||
@ -185,6 +187,46 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c
|
|||||||
assert file_type is expected_value
|
assert file_type is expected_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("expected_value", "file_name"),
|
||||||
|
[
|
||||||
|
(FileType.DOC, "simple.doc"),
|
||||||
|
(FileType.PPT, "fake-power-point.ppt"),
|
||||||
|
(FileType.XLS, "tests-example.xls"),
|
||||||
|
(FileType.MSG, "fake-email-multiple-attachments.msg"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"content_type",
|
||||||
|
[
|
||||||
|
"application/msword",
|
||||||
|
"application/vnd.ms-outlook",
|
||||||
|
"application/vnd.ms-powerpoint",
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
"anything/else",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
|
||||||
|
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
|
||||||
|
):
|
||||||
|
"""Fixes wrong XLS asserted as DOC, PPT, etc.
|
||||||
|
|
||||||
|
Asserted content-type can be anything except `None` and differentiator will fix it if the file
|
||||||
|
is DOC, PPT, XLS, or MSG type.
|
||||||
|
"""
|
||||||
|
# -- disable strategies 2 & 3, content-type strategy should get this on its own --
|
||||||
|
ctx_mime_type_.return_value = None
|
||||||
|
with open(example_doc_path(file_name), "rb") as f:
|
||||||
|
file = io.BytesIO(f.read())
|
||||||
|
|
||||||
|
file_type = detect_filetype(file=file, content_type=content_type)
|
||||||
|
|
||||||
|
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
|
||||||
|
# -- fall-back to strategy 2 for any of these test cases.
|
||||||
|
ctx_mime_type_.assert_not_called()
|
||||||
|
assert file_type is expected_value
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
|
# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
@ -264,6 +306,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
|
|||||||
[
|
[
|
||||||
(FileType.BMP, "img/bmp_24.bmp"),
|
(FileType.BMP, "img/bmp_24.bmp"),
|
||||||
(FileType.CSV, "stanley-cups.csv"),
|
(FileType.CSV, "stanley-cups.csv"),
|
||||||
|
(FileType.DOC, "simple.doc"),
|
||||||
(FileType.DOCX, "simple.docx"),
|
(FileType.DOCX, "simple.docx"),
|
||||||
(FileType.EML, "eml/fake-email.eml"),
|
(FileType.EML, "eml/fake-email.eml"),
|
||||||
(FileType.EPUB, "winter-sports.epub"),
|
(FileType.EPUB, "winter-sports.epub"),
|
||||||
@ -271,14 +314,17 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
|
|||||||
(FileType.HTML, "ideas-page.html"),
|
(FileType.HTML, "ideas-page.html"),
|
||||||
(FileType.JPG, "img/example.jpg"),
|
(FileType.JPG, "img/example.jpg"),
|
||||||
(FileType.JSON, "spring-weather.html.json"),
|
(FileType.JSON, "spring-weather.html.json"),
|
||||||
|
(FileType.MSG, "fake-email.msg"),
|
||||||
(FileType.ODT, "simple.odt"),
|
(FileType.ODT, "simple.odt"),
|
||||||
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
|
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
|
||||||
(FileType.PNG, "img/DA-1p.png"),
|
(FileType.PNG, "img/DA-1p.png"),
|
||||||
|
(FileType.PPT, "fake-power-point.ppt"),
|
||||||
(FileType.PPTX, "fake-power-point.pptx"),
|
(FileType.PPTX, "fake-power-point.pptx"),
|
||||||
(FileType.RTF, "fake-doc.rtf"),
|
(FileType.RTF, "fake-doc.rtf"),
|
||||||
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
|
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
|
||||||
(FileType.TXT, "norwich-city.txt"),
|
(FileType.TXT, "norwich-city.txt"),
|
||||||
(FileType.WAV, "CantinaBand3.wav"),
|
(FileType.WAV, "CantinaBand3.wav"),
|
||||||
|
(FileType.XLS, "tests-example.xls"),
|
||||||
(FileType.XLSX, "stanley-cups.xlsx"),
|
(FileType.XLSX, "stanley-cups.xlsx"),
|
||||||
(FileType.XML, "factbook.xml"),
|
(FileType.XML, "factbook.xml"),
|
||||||
(FileType.ZIP, "simple.zip"),
|
(FileType.ZIP, "simple.zip"),
|
||||||
@ -290,11 +336,7 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
|
|||||||
"""Does not work for all types, in particular:
|
"""Does not work for all types, in particular:
|
||||||
|
|
||||||
TODOs:
|
TODOs:
|
||||||
- DOC is misidentified as MSG, TODO on that below.
|
|
||||||
- MSG is misidentified as UNK, but only on CI.
|
|
||||||
- PPT is misidentified as MSG, same fix as DOC.
|
|
||||||
- TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator`
|
- TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator`
|
||||||
- XLS is misidentified as MSG, same fix as DOC.
|
|
||||||
|
|
||||||
NOCANDOs: w/o an extension I think these are the best we can do.
|
NOCANDOs: w/o an extension I think these are the best we can do.
|
||||||
- MD is identified as TXT
|
- MD is identified as TXT
|
||||||
@ -309,25 +351,44 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
|
|||||||
assert detect_filetype(file=file) is expected_value
|
assert detect_filetype(file=file) is expected_value
|
||||||
|
|
||||||
|
|
||||||
# NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right
|
|
||||||
# ("application/msword"). Need a differentiator for "application/x-ole-storage".
|
|
||||||
@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True)
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("expected_value", "file_name"),
|
("expected_value", "file_name"),
|
||||||
[
|
[
|
||||||
(FileType.DOC, "simple.doc"),
|
(FileType.DOC, "simple.doc"),
|
||||||
(FileType.PPT, "fake-power-point.ppt"),
|
(FileType.PPT, "fake-power-point.ppt"),
|
||||||
(FileType.XLS, "tests-example.xls"),
|
(FileType.XLS, "tests-example.xls"),
|
||||||
# -- only fails on CI, maybe different libmagic version or "magic-files" --
|
(FileType.MSG, "fake-email-multiple-attachments.msg"),
|
||||||
# (FileType.MSG, "fake-email.msg"),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type(
|
@pytest.mark.parametrize(
|
||||||
file_name: str, expected_value: FileType
|
"guessed_mime_type",
|
||||||
|
[
|
||||||
|
"application/msword",
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
"application/vnd.ms-outlook",
|
||||||
|
"application/vnd.ms-powerpoint",
|
||||||
|
"application/x-ole-storage",
|
||||||
|
"anything/else",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
|
||||||
|
file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
|
||||||
):
|
):
|
||||||
|
"""Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc.
|
||||||
|
|
||||||
|
It's better than that actually, the OLE differentiator will get the right file-type for any DOC,
|
||||||
|
PPT, XLS, or MSG file, regardless of guessed MIME-type.
|
||||||
|
"""
|
||||||
|
ctx_mime_type_.return_value = guessed_mime_type
|
||||||
|
# -- disable strategy 3 by not providing a file-name source --
|
||||||
with open(example_doc_path(file_name), "rb") as f:
|
with open(example_doc_path(file_name), "rb") as f:
|
||||||
file = io.BytesIO(f.read())
|
file = io.BytesIO(f.read())
|
||||||
assert detect_filetype(file=file) is expected_value
|
|
||||||
|
# -- disable strategy 1 by not asserting a content-type --
|
||||||
|
file_type = detect_filetype(file=file)
|
||||||
|
|
||||||
|
ctx_mime_type_.assert_called_with()
|
||||||
|
assert file_type is expected_value
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -454,6 +515,7 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
|
|||||||
[
|
[
|
||||||
(FileType.BMP, "img/bmp_24.bmp", "application/zip"),
|
(FileType.BMP, "img/bmp_24.bmp", "application/zip"),
|
||||||
(FileType.DOC, "simple.doc", None),
|
(FileType.DOC, "simple.doc", None),
|
||||||
|
(FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
|
||||||
(FileType.MSG, "fake-email.msg", "application/octet-stream"),
|
(FileType.MSG, "fake-email.msg", "application/octet-stream"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -575,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
|
|||||||
assert detect_filetype(file=f) == FileType.CSV
|
assert detect_filetype(file=f) == FileType.CSV
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# Describe `is_json_processable()`
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def it_affirms_JSON_is_array_of_objects_from_a_file_path():
|
||||||
|
assert is_json_processable(example_doc_path("simple.json")) is True
|
||||||
|
|
||||||
|
|
||||||
|
def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path():
|
||||||
|
assert is_json_processable(example_doc_path("not-unstructured-payload.json")) is False
|
||||||
|
|
||||||
|
|
||||||
|
def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
|
||||||
|
with open(example_doc_path("simple.json"), "rb") as f:
|
||||||
|
assert is_json_processable(file=f) is True
|
||||||
|
|
||||||
|
|
||||||
|
def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
|
||||||
|
with open(example_doc_path("not-unstructured-payload.json"), "rb") as f:
|
||||||
|
assert is_json_processable(file=f) is False
|
||||||
|
|
||||||
|
|
||||||
|
def it_affirms_JSON_is_array_of_objects_from_text():
|
||||||
|
with open(example_doc_path("simple.json")) as f:
|
||||||
|
text = f.read()
|
||||||
|
assert is_json_processable(file_text=text) is True
|
||||||
|
|
||||||
|
|
||||||
|
def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
|
||||||
|
with open(example_doc_path("not-unstructured-payload.json")) as f:
|
||||||
|
text = f.read()
|
||||||
|
assert is_json_processable(file_text=text) is False
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
# MODULE-LEVEL FIXTURES
|
# MODULE-LEVEL FIXTURES
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
@ -891,6 +988,52 @@ class Describe_FileTypeDetectionContext:
|
|||||||
return property_mock(request, _FileTypeDetectionContext, "mime_type")
|
return property_mock(request, _FileTypeDetectionContext, "mime_type")
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_OleFileDifferentiator:
|
||||||
|
"""Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`."""
|
||||||
|
|
||||||
|
# -- .applies() ---------------------------------------------
|
||||||
|
|
||||||
|
def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
|
||||||
|
"""The constructor determines whether this differentiator is applicable.
|
||||||
|
|
||||||
|
It returns an instance only when differentiating a CFBF file-type is required, which it
|
||||||
|
judges by inspecting the initial bytes of the file for the CFBF magic-bytes.
|
||||||
|
"""
|
||||||
|
ctx = _FileTypeDetectionContext(example_doc_path("simple.doc"))
|
||||||
|
|
||||||
|
differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar")
|
||||||
|
|
||||||
|
assert differentiator is not None
|
||||||
|
assert isinstance(differentiator, _OleFileDifferentiator)
|
||||||
|
|
||||||
|
def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
|
||||||
|
ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub"))
|
||||||
|
assert _OleFileDifferentiator.applies(ctx, "application/epub") is None
|
||||||
|
|
||||||
|
# -- .file_type ---------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_name", "expected_value"),
|
||||||
|
[
|
||||||
|
("simple.doc", FileType.DOC),
|
||||||
|
("fake-power-point.ppt", FileType.PPT),
|
||||||
|
("tests-example.xls", FileType.XLS),
|
||||||
|
("fake-email.msg", FileType.MSG),
|
||||||
|
("README.org", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_distinguishes_the_file_type_of_applicable_zip_files(
|
||||||
|
self, file_name: str, expected_value: FileType | None
|
||||||
|
):
|
||||||
|
# -- no file-name available, just to make sure we're not relying on an extension --
|
||||||
|
with open(example_doc_path(file_name), "rb") as f:
|
||||||
|
file = io.BytesIO(f.read())
|
||||||
|
ctx = _FileTypeDetectionContext(file=file)
|
||||||
|
differentiator = _OleFileDifferentiator(ctx)
|
||||||
|
|
||||||
|
assert differentiator.file_type is expected_value
|
||||||
|
|
||||||
|
|
||||||
class Describe_TextFileDifferentiator:
|
class Describe_TextFileDifferentiator:
|
||||||
"""Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`."""
|
"""Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`."""
|
||||||
|
|
||||||
|
@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"filetype",
|
"file_type",
|
||||||
[
|
[
|
||||||
t
|
t
|
||||||
for t in FileType
|
for t in FileType
|
||||||
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
|
if t
|
||||||
and t.partitioner_function_name != "partition_image"
|
not in (
|
||||||
|
FileType.EMPTY,
|
||||||
|
FileType.JSON,
|
||||||
|
FileType.UNK,
|
||||||
|
FileType.WAV,
|
||||||
|
FileType.XLS,
|
||||||
|
FileType.ZIP,
|
||||||
|
)
|
||||||
|
and t.partitioner_shortname != "image"
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
|
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType):
|
||||||
extension = filetype.name.lower()
|
partition_fn_name = file_type.partitioner_function_name
|
||||||
# -- except for two oddballs, the shortname is the extension --
|
module = import_module(file_type.partitioner_module_qname)
|
||||||
partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension)
|
|
||||||
partition_fn_name = f"partition_{partitioner_shortname}"
|
|
||||||
module = import_module(f"unstructured.partition.{partitioner_shortname}")
|
|
||||||
partition_fn = getattr(module, partition_fn_name)
|
partition_fn = getattr(module, partition_fn_name)
|
||||||
|
|
||||||
# -- partition the first example-doc with the extension for this filetype --
|
# -- partition the first example-doc with the extension for this filetype --
|
||||||
elements: list[Element] = []
|
elements: list[Element] = []
|
||||||
doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
|
doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("")
|
||||||
|
extensions = file_type._extensions
|
||||||
for file in pathlib.Path(doc_path).iterdir():
|
for file in pathlib.Path(doc_path).iterdir():
|
||||||
if file.is_file() and file.suffix == f".{extension}":
|
if file.is_file() and file.suffix in extensions:
|
||||||
elements = partition_fn(str(file))
|
elements = partition_fn(str(file))
|
||||||
break
|
break
|
||||||
|
|
||||||
assert elements
|
assert elements
|
||||||
assert all(
|
assert all(
|
||||||
e.metadata.filetype == filetype.mime_type
|
e.metadata.filetype == file_type.mime_type
|
||||||
for e in elements
|
for e in elements
|
||||||
if e.metadata.filetype is not None
|
if e.metadata.filetype is not None
|
||||||
)
|
)
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.15.1-dev4" # pragma: no cover
|
__version__ = "0.15.1-dev5" # pragma: no cover
|
||||||
|
@ -112,12 +112,12 @@ def is_json_processable(
|
|||||||
file is JSON.
|
file is JSON.
|
||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file, file_text=file_text)
|
exactly_one(filename=filename, file=file, file_text=file_text)
|
||||||
|
|
||||||
if file_text is None:
|
if file_text is None:
|
||||||
file_text = _read_file_start_for_type_check(
|
file_text = _FileTypeDetectionContext.new(
|
||||||
file=file,
|
file_path=filename, file=file, encoding=encoding
|
||||||
filename=filename,
|
).text_head
|
||||||
encoding=encoding,
|
|
||||||
)
|
|
||||||
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
||||||
|
|
||||||
|
|
||||||
@ -161,6 +161,11 @@ class _FileTypeDetector:
|
|||||||
if not content_type:
|
if not content_type:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# -- OLE-based file-format content_type values are sometimes unreliable. These are
|
||||||
|
# -- DOC, PPT, XLS, and MSG.
|
||||||
|
if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type):
|
||||||
|
return differentiator.file_type
|
||||||
|
|
||||||
# -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
|
# -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
|
||||||
if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
|
if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
|
||||||
return differentiator.file_type
|
return differentiator.file_type
|
||||||
@ -185,9 +190,8 @@ class _FileTypeDetector:
|
|||||||
if mime_type is None:
|
if mime_type is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# NOTE(Crag): older magic lib does not differentiate between xls and doc
|
if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type):
|
||||||
if mime_type == "application/msword" and extension == ".xls":
|
return differentiator.file_type
|
||||||
return FileType.XLS
|
|
||||||
|
|
||||||
if mime_type.endswith("xml"):
|
if mime_type.endswith("xml"):
|
||||||
return FileType.HTML if extension in (".html", ".htm") else FileType.XML
|
return FileType.HTML if extension in (".html", ".htm") else FileType.XML
|
||||||
@ -248,7 +252,7 @@ class _FileTypeDetectionContext:
|
|||||||
content_type: str | None = None,
|
content_type: str | None = None,
|
||||||
metadata_file_path: str | None = None,
|
metadata_file_path: str | None = None,
|
||||||
):
|
):
|
||||||
self._file_path = file_path
|
self._file_path_arg = file_path
|
||||||
self._file_arg = file
|
self._file_arg = file
|
||||||
self._encoding_arg = encoding
|
self._encoding_arg = encoding
|
||||||
self._content_type = content_type
|
self._content_type = content_type
|
||||||
@ -261,9 +265,9 @@ class _FileTypeDetectionContext:
|
|||||||
file_path: str | None,
|
file_path: str | None,
|
||||||
file: IO[bytes] | None,
|
file: IO[bytes] | None,
|
||||||
encoding: str | None,
|
encoding: str | None,
|
||||||
content_type: str | None,
|
content_type: str | None = None,
|
||||||
metadata_file_path: str | None,
|
metadata_file_path: str | None = None,
|
||||||
):
|
) -> _FileTypeDetectionContext:
|
||||||
self = cls(
|
self = cls(
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
file=file,
|
file=file,
|
||||||
@ -320,7 +324,10 @@ class _FileTypeDetectionContext:
|
|||||||
None when the caller specified the source as a file-like object instead. Useful for user
|
None when the caller specified the source as a file-like object instead. Useful for user
|
||||||
feedback on an error, but users of context should have little use for it otherwise.
|
feedback on an error, but users of context should have little use for it otherwise.
|
||||||
"""
|
"""
|
||||||
return self._file_path
|
if (file_path := self._file_path_arg) is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def is_zipfile(self) -> bool:
|
def is_zipfile(self) -> bool:
|
||||||
@ -351,19 +358,19 @@ class _FileTypeDetectionContext:
|
|||||||
|
|
||||||
A `str` return value is always in lower-case.
|
A `str` return value is always in lower-case.
|
||||||
"""
|
"""
|
||||||
|
file_path = self.file_path
|
||||||
|
|
||||||
if LIBMAGIC_AVAILABLE:
|
if LIBMAGIC_AVAILABLE:
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
mime_type = (
|
mime_type = (
|
||||||
magic.from_file(_resolve_symlink(self._file_path), mime=True)
|
magic.from_file(file_path, mime=True)
|
||||||
if self._file_path
|
if file_path
|
||||||
else magic.from_buffer(self.file_head, mime=True)
|
else magic.from_buffer(self.file_head, mime=True)
|
||||||
)
|
)
|
||||||
return mime_type.lower() if mime_type else None
|
return mime_type.lower() if mime_type else None
|
||||||
|
|
||||||
mime_type = (
|
mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)
|
||||||
ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head)
|
|
||||||
)
|
|
||||||
|
|
||||||
if mime_type is None:
|
if mime_type is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@ -387,8 +394,8 @@ class _FileTypeDetectionContext:
|
|||||||
|
|
||||||
File is guaranteed to be at read position 0 when called.
|
File is guaranteed to be at read position 0 when called.
|
||||||
"""
|
"""
|
||||||
if self._file_path:
|
if self.file_path:
|
||||||
with open(self._file_path, "rb") as f:
|
with open(self.file_path, "rb") as f:
|
||||||
yield f
|
yield f
|
||||||
else:
|
else:
|
||||||
file = self._file_arg
|
file = self._file_arg
|
||||||
@ -416,7 +423,7 @@ class _FileTypeDetectionContext:
|
|||||||
else content.decode(encoding=self.encoding, errors="ignore")
|
else content.decode(encoding=self.encoding, errors="ignore")
|
||||||
)
|
)
|
||||||
|
|
||||||
file_path = self._file_path
|
file_path = self.file_path
|
||||||
assert file_path is not None # -- guaranteed by `._validate` --
|
assert file_path is not None # -- guaranteed by `._validate` --
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -429,12 +436,61 @@ class _FileTypeDetectionContext:
|
|||||||
|
|
||||||
def _validate(self) -> None:
|
def _validate(self) -> None:
|
||||||
"""Raise if the context is invalid."""
|
"""Raise if the context is invalid."""
|
||||||
if self._file_path and not os.path.isfile(self._file_path):
|
if self.file_path and not os.path.isfile(self.file_path):
|
||||||
raise FileNotFoundError(f"no such file {self._file_path}")
|
raise FileNotFoundError(f"no such file {self._file_path_arg}")
|
||||||
if not self._file_path and not self._file_arg:
|
if not self.file_path and not self._file_arg:
|
||||||
raise ValueError("either `file_path` or `file` argument must be provided")
|
raise ValueError("either `file_path` or `file` argument must be provided")
|
||||||
|
|
||||||
|
|
||||||
|
class _OleFileDifferentiator:
|
||||||
|
"""Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be.
|
||||||
|
|
||||||
|
Compound File Binary Format (CFBF), aka. OLE file, is used by Microsoft for legacy MS Office
|
||||||
|
files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as
|
||||||
|
`"application/x-ole-storage"` which is true but not specific enough for partitioning
|
||||||
|
purposes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, ctx: _FileTypeDetectionContext):
|
||||||
|
self._ctx = ctx
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def applies(
|
||||||
|
cls, ctx: _FileTypeDetectionContext, mime_type: str
|
||||||
|
) -> _OleFileDifferentiator | None:
|
||||||
|
"""Constructs an instance, but only if the file has the CFBF magic bytes, whatever `mime_type`."""
|
||||||
|
return cls(ctx) if cls._is_ole_file(ctx) else None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def file_type(self) -> FileType | None:
|
||||||
|
"""Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
|
||||||
|
|
||||||
|
Returns `None` when the file is not a CFBF file, otherwise one of:
|
||||||
|
- `FileType.DOC`
|
||||||
|
- `FileType.PPT`
|
||||||
|
- `FileType.XLS`
|
||||||
|
- `FileType.MSG`
|
||||||
|
"""
|
||||||
|
# -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return
|
||||||
|
# -- `None` to trigger fall-back to next strategy.
|
||||||
|
if not self._is_ole_file(self._ctx):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so rely on it to
|
||||||
|
# -- differentiate those. Note it doesn't detect MSG type though, so we assume any OLE file
|
||||||
|
# -- that is not a legacy MS-Office type to be a MSG file.
|
||||||
|
with self._ctx.open() as file:
|
||||||
|
mime_type = ft.guess_mime(file)
|
||||||
|
|
||||||
|
return FileType.from_mime_type(mime_type or "application/vnd.ms-outlook")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
|
||||||
|
"""True when the first 8 bytes of the file are the CFBF magic bytes."""
|
||||||
|
with ctx.open() as file:
|
||||||
|
return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
|
||||||
|
|
||||||
|
|
||||||
class _TextFileDifferentiator:
|
class _TextFileDifferentiator:
|
||||||
"""Refine a textual file-type that may not be as specific as it could be."""
|
"""Refine a textual file-type that may not be as specific as it could be."""
|
||||||
|
|
||||||
@ -597,45 +653,6 @@ class _ZipFileDifferentiator:
|
|||||||
return FileType.ZIP
|
return FileType.ZIP
|
||||||
|
|
||||||
|
|
||||||
def _read_file_start_for_type_check(
|
|
||||||
filename: Optional[str] = None,
|
|
||||||
file: Optional[IO[bytes]] = None,
|
|
||||||
encoding: Optional[str] = "utf-8",
|
|
||||||
) -> str:
|
|
||||||
"""Reads the start of the file and returns the text content."""
|
|
||||||
exactly_one(filename=filename, file=file)
|
|
||||||
|
|
||||||
if file is not None:
|
|
||||||
file.seek(0)
|
|
||||||
file_content = file.read(4096)
|
|
||||||
if isinstance(file_content, str):
|
|
||||||
file_text = file_content
|
|
||||||
else:
|
|
||||||
file_text = file_content.decode(errors="ignore")
|
|
||||||
file.seek(0)
|
|
||||||
return file_text
|
|
||||||
|
|
||||||
# -- guaranteed by `exactly_one()` call --
|
|
||||||
assert filename is not None
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open(filename, encoding=encoding) as f:
|
|
||||||
file_text = f.read(4096)
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
formatted_encoding, _ = detect_file_encoding(filename=filename)
|
|
||||||
with open(filename, encoding=formatted_encoding) as f:
|
|
||||||
file_text = f.read(4096)
|
|
||||||
|
|
||||||
return file_text
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_symlink(file_path: str) -> str:
|
|
||||||
"""Resolve `file_path` containing symlink to the actual file path."""
|
|
||||||
if os.path.islink(file_path):
|
|
||||||
file_path = os.path.realpath(file_path)
|
|
||||||
return file_path
|
|
||||||
|
|
||||||
|
|
||||||
_P = ParamSpec("_P")
|
_P = ParamSpec("_P")
|
||||||
|
|
||||||
|
|
||||||
|
@ -286,7 +286,7 @@ class FileType(enum.Enum):
|
|||||||
"msg",
|
"msg",
|
||||||
[".msg"],
|
[".msg"],
|
||||||
"application/vnd.ms-outlook",
|
"application/vnd.ms-outlook",
|
||||||
["application/x-ole-storage"],
|
cast(list[str], []),
|
||||||
)
|
)
|
||||||
ODT = (
|
ODT = (
|
||||||
"odt",
|
"odt",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user