fix: refine filetype detection (#3828)

**Summary**
Fixes a bug where a CSV file with asserted content-type
`application/vnd.ms-excel` was incorrectly identified as an XLS file and
failed partitioning.

**Additional Context**
The `content_type` argument to partitioning is often authored by the
client system (e.g. Unstructured SDK) and is both unreliable and outside
the control of the user. In this case the `.csv -> XLS` mapping is
correct for certain purposes (Excel is often used to load and edit CSV
files) but not for partitioning, and the user has no readily available
way to override the mapping.

XLS files as well as seven other common binary file types can be
efficiently and reliably detected (correct at least 99.999% of the time)
using code we already have in the file detector.

- Promote this direct-inspection strategy to be tried first.
- When DOC, DOCX, EPUB, ODT, PPT, PPTX, XLS, or XLSX is detected, use
that file-type.
- When one of those types is NOT detected, clear the asserted
`content_type` when it matches any of those types. This prevents the
problem seen in the bug where the asserted content type was used to
determine the file-type.
- The remaining content_type, guessed MIME-type, and filename-extension
mapping strategies are tried, in that order, only when direct inspection
fails. This is largely the same as it was before.
- Fix #3781 while we were in the neighborhood.
- Fix #3596 as well, essentially an earlier report of #3781.
This commit is contained in:
Steve Canny 2024-12-16 16:56:21 -08:00 committed by GitHub
parent 10f0d54ac2
commit b5ff79d8db
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 224 additions and 479 deletions

View File

@ -1,4 +1,4 @@
## 0.16.12-dev2 ## 0.16.12-dev3
### Enhancements ### Enhancements
@ -9,6 +9,7 @@
### Fixes ### Fixes
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
## 0.16.11 ## 0.16.11

View File

@ -14,15 +14,14 @@ from test_unstructured.unit_utils import (
LogCaptureFixture, LogCaptureFixture,
Mock, Mock,
example_doc_path, example_doc_path,
function_mock,
patch, patch,
property_mock, property_mock,
) )
from unstructured.file_utils.filetype import ( from unstructured.file_utils.filetype import (
_FileTypeDetectionContext, _FileTypeDetectionContext,
_OleFileDifferentiator, _OleFileDetector,
_TextFileDifferentiator, _TextFileDifferentiator,
_ZipFileDifferentiator, _ZipFileDetector,
detect_filetype, detect_filetype,
is_json_processable, is_json_processable,
) )
@ -31,7 +30,41 @@ from unstructured.file_utils.model import FileType
is_in_docker = os.path.exists("/.dockerenv") is_in_docker = os.path.exists("/.dockerenv")
# ================================================================================================ # ================================================================================================
# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
# ================================================================================================
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOC, "simple.doc"),
(FileType.DOCX, "simple.docx"),
(FileType.EPUB, "winter-sports.epub"),
(FileType.ODT, "simple.odt"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.XLS, "tests-example.xls"),
(FileType.XLSX, "stanley-cups.xlsx"),
],
)
def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direct_inspection(
file_name: str, expected_value: FileType, ctx_mime_type_: Mock
):
# -- disable other strategies; no content-type, guessed MIME-type or extension --
ctx_mime_type_.return_value = None
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
file_type = detect_filetype(file=file)
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
# -- fall back to MIME-type guessing for any of these test cases.
ctx_mime_type_.assert_not_called()
assert file_type == expected_value
# ================================================================================================
# STRATEGY #2 - CONTENT-TYPE ASSERTED IN CALL
# ================================================================================================ # ================================================================================================
@ -40,41 +73,21 @@ is_in_docker = os.path.exists("/.dockerenv")
[ [
(FileType.BMP, "img/bmp_24.bmp", "image/bmp"), (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
(FileType.CSV, "stanley-cups.csv", "text/csv"), (FileType.CSV, "stanley-cups.csv", "text/csv"),
(FileType.DOC, "simple.doc", "application/msword"),
(
FileType.DOCX,
"simple.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
(FileType.EML, "eml/fake-email.eml", "message/rfc822"), (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
(FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
(FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
(FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.HTML, "example-10k-1p.html", "text/html"),
(FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JPG, "img/example.jpg", "image/jpeg"),
(FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.JSON, "spring-weather.html.json", "application/json"),
(FileType.MD, "README.md", "text/markdown"), (FileType.MD, "README.md", "text/markdown"),
(FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
(FileType.ORG, "README.org", "text/org"), (FileType.ORG, "README.org", "text/org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
(FileType.PNG, "img/DA-1p.png", "image/png"), (FileType.PNG, "img/DA-1p.png", "image/png"),
(FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(
FileType.PPTX,
"fake-power-point.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
(FileType.RST, "README.rst", "text/x-rst"), (FileType.RST, "README.rst", "text/x-rst"),
(FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.RTF, "fake-doc.rtf", "text/rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"),
(FileType.TSV, "stanley-cups.tsv", "text/tsv"), (FileType.TSV, "stanley-cups.tsv", "text/tsv"),
(FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.TXT, "norwich-city.txt", "text/plain"),
(FileType.WAV, "CantinaBand3.wav", "audio/wav"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
(FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
(
FileType.XLSX,
"stanley-cups.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
(FileType.XML, "factbook.xml", "application/xml"), (FileType.XML, "factbook.xml", "application/xml"),
(FileType.ZIP, "simple.zip", "application/zip"), (FileType.ZIP, "simple.zip", "application/zip"),
], ],
@ -82,13 +95,13 @@ is_in_docker = os.path.exists("/.dockerenv")
def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type(
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
): ):
# -- disable strategy #2, leaving only asserted content-type and extension -- # -- disable mime-guessing leaving only asserted content-type and extension --
ctx_mime_type_.return_value = None ctx_mime_type_.return_value = None
file_type = detect_filetype(example_doc_path(file_name), content_type=content_type) file_type = detect_filetype(example_doc_path(file_name), content_type=content_type)
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not # -- Content-type strategy should not need to refer to guessed MIME-type and detection should
# -- fall back to strategy 2 for any of these test cases. # -- not fall back to strategy 2 for any of these test cases.
ctx_mime_type_.assert_not_called() ctx_mime_type_.assert_not_called()
assert file_type == expected_value assert file_type == expected_value
@ -98,41 +111,21 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
[ [
(FileType.BMP, "img/bmp_24.bmp", "image/bmp"), (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
(FileType.CSV, "stanley-cups.csv", "text/csv"), (FileType.CSV, "stanley-cups.csv", "text/csv"),
(FileType.DOC, "simple.doc", "application/msword"),
(
FileType.DOCX,
"simple.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
(FileType.EML, "eml/fake-email.eml", "message/rfc822"), (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
(FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
(FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
(FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.HTML, "example-10k-1p.html", "text/html"),
(FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JPG, "img/example.jpg", "image/jpeg"),
(FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.JSON, "spring-weather.html.json", "application/json"),
(FileType.MD, "README.md", "text/markdown"), (FileType.MD, "README.md", "text/markdown"),
(FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
(FileType.ORG, "README.org", "text/org"), (FileType.ORG, "README.org", "text/org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
(FileType.PNG, "img/DA-1p.png", "image/png"), (FileType.PNG, "img/DA-1p.png", "image/png"),
(FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(
FileType.PPTX,
"fake-power-point.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
(FileType.RST, "README.rst", "text/x-rst"), (FileType.RST, "README.rst", "text/x-rst"),
(FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.RTF, "fake-doc.rtf", "text/rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"),
(FileType.TSV, "stanley-cups.tsv", "text/tsv"), (FileType.TSV, "stanley-cups.tsv", "text/tsv"),
(FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.TXT, "norwich-city.txt", "text/plain"),
(FileType.WAV, "CantinaBand3.wav", "audio/wav"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
(FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
(
FileType.XLSX,
"stanley-cups.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
(FileType.XML, "factbook.xml", "application/xml"), (FileType.XML, "factbook.xml", "application/xml"),
(FileType.ZIP, "simple.zip", "application/zip"), (FileType.ZIP, "simple.zip", "application/zip"),
], ],
@ -140,93 +133,22 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type( def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type(
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
): ):
# -- disable strategy #2 (guessed mime-type) -- # -- disable mime-guessing --
ctx_mime_type_.return_value = None ctx_mime_type_.return_value = None
# -- disable strategy #3 (filename extension) by supplying no source of file name -- # -- disable filename extension mapping by supplying no source of file name --
with open(example_doc_path(file_name), "rb") as f: with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read()) file = io.BytesIO(f.read())
file_type = detect_filetype(file=file, content_type=content_type) file_type = detect_filetype(file=file, content_type=content_type)
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not # -- Content-type strategy should not need to refer to guessed MIME-type and detection should
# -- fall-back to strategy 2 for any of these test cases. # -- not fall-back to strategy 2 for any of these test cases.
ctx_mime_type_.assert_not_called()
assert file_type is expected_value
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOCX, "simple.docx"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.XLSX, "stanley-cups.xlsx"),
],
)
@pytest.mark.parametrize(
"content_type",
[
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
],
)
def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type(
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
):
# -- disable strategies 2 & 3, content-type strategy should get this on its own --
ctx_mime_type_.return_value = None
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
file_type = detect_filetype(file=file, content_type=content_type)
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
# -- fall-back to strategy 2 for any of these test cases.
ctx_mime_type_.assert_not_called()
assert file_type is expected_value
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
],
)
@pytest.mark.parametrize(
"content_type",
[
"application/msword",
"application/vnd.ms-outlook",
"application/vnd.ms-powerpoint",
"application/vnd.ms-excel",
"anything/else",
],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
):
"""Fixes wrong XLS asserted as DOC, PPT, etc.
Asserted content-type can be anything except `None` and differentiator will fix it if the file
is DOC, PPT, or XLS type.
"""
# -- disable strategies 2 & 3, content-type strategy should get this on its own --
ctx_mime_type_.return_value = None
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
file_type = detect_filetype(file=file, content_type=content_type)
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
# -- fall-back to strategy 2 for any of these test cases.
ctx_mime_type_.assert_not_called() ctx_mime_type_.assert_not_called()
assert file_type is expected_value assert file_type is expected_value
# ================================================================================================ # ================================================================================================
# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC # STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
# ================================================================================================ # ================================================================================================
@ -237,31 +159,16 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
(FileType.CSV, "stanley-cups.csv", "text/csv"), (FileType.CSV, "stanley-cups.csv", "text/csv"),
(FileType.CSV, "stanley-cups.csv", "application/csv"), (FileType.CSV, "stanley-cups.csv", "application/csv"),
(FileType.CSV, "stanley-cups.csv", "application/x-csv"), (FileType.CSV, "stanley-cups.csv", "application/x-csv"),
(FileType.DOC, "simple.doc", "application/msword"),
(
FileType.DOCX,
"simple.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
(FileType.EML, "eml/fake-email.eml", "message/rfc822"), (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
(FileType.EPUB, "winter-sports.epub", "application/epub"),
(FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
(FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
(FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.HTML, "example-10k-1p.html", "text/html"),
(FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JPG, "img/example.jpg", "image/jpeg"),
(FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.JSON, "spring-weather.html.json", "application/json"),
(FileType.MD, "README.md", "text/markdown"), (FileType.MD, "README.md", "text/markdown"),
(FileType.MD, "README.md", "text/x-markdown"), (FileType.MD, "README.md", "text/x-markdown"),
(FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
(FileType.ORG, "README.org", "text/org"), (FileType.ORG, "README.org", "text/org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
(FileType.PNG, "img/DA-1p.png", "image/png"), (FileType.PNG, "img/DA-1p.png", "image/png"),
(FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(
FileType.PPTX,
"fake-power-point.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
(FileType.RST, "README.rst", "text/x-rst"), (FileType.RST, "README.rst", "text/x-rst"),
(FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.RTF, "fake-doc.rtf", "text/rtf"),
(FileType.RTF, "fake-doc.rtf", "application/rtf"), (FileType.RTF, "fake-doc.rtf", "application/rtf"),
@ -270,18 +177,11 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
(FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.TXT, "norwich-city.txt", "text/plain"),
(FileType.TXT, "simple.yaml", "text/yaml"), (FileType.TXT, "simple.yaml", "text/yaml"),
(FileType.WAV, "CantinaBand3.wav", "audio/wav"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
(FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
(
FileType.XLSX,
"stanley-cups.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
(FileType.XML, "factbook.xml", "application/xml"), (FileType.XML, "factbook.xml", "application/xml"),
(FileType.XML, "factbook.xml", "text/xml"), (FileType.XML, "factbook.xml", "text/xml"),
(FileType.ZIP, "simple.zip", "application/zip"),
], ],
) )
def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type( def test_it_detects_correct_file_type_by_guessed_MIME_when_libmagic_guesses_recognized_mime_type(
file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
): ):
# -- libmagic guesses a MIME-type mapped to a `FileType` -- # -- libmagic guesses a MIME-type mapped to a `FileType` --
@ -290,7 +190,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
with open(example_doc_path(file_name), "rb") as f: with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read()) file = io.BytesIO(f.read())
# -- disable strategy #1 by not asserting a content_type in the call -- # -- disable content-type strategy by not asserting a content_type in the call --
file_type = detect_filetype(file=file) file_type = detect_filetype(file=file)
# -- ctx.mime_type may be referenced multiple times, but at least once -- # -- ctx.mime_type may be referenced multiple times, but at least once --
@ -303,30 +203,22 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
[ [
(FileType.BMP, "img/bmp_24.bmp"), (FileType.BMP, "img/bmp_24.bmp"),
(FileType.CSV, "stanley-cups.csv"), (FileType.CSV, "stanley-cups.csv"),
(FileType.DOC, "simple.doc"),
(FileType.DOCX, "simple.docx"),
(FileType.EML, "eml/fake-email.eml"), (FileType.EML, "eml/fake-email.eml"),
(FileType.EPUB, "winter-sports.epub"),
(FileType.HEIC, "img/DA-1p.heic"), (FileType.HEIC, "img/DA-1p.heic"),
(FileType.HTML, "ideas-page.html"), (FileType.HTML, "ideas-page.html"),
(FileType.JPG, "img/example.jpg"), (FileType.JPG, "img/example.jpg"),
(FileType.JSON, "spring-weather.html.json"), (FileType.JSON, "spring-weather.html.json"),
(FileType.ODT, "simple.odt"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
(FileType.PNG, "img/DA-1p.png"), (FileType.PNG, "img/DA-1p.png"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.RTF, "fake-doc.rtf"), (FileType.RTF, "fake-doc.rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
(FileType.TXT, "norwich-city.txt"), (FileType.TXT, "norwich-city.txt"),
(FileType.WAV, "CantinaBand3.wav"), (FileType.WAV, "CantinaBand3.wav"),
(FileType.XLS, "tests-example.xls"),
(FileType.XLSX, "stanley-cups.xlsx"),
(FileType.XML, "factbook.xml"), (FileType.XML, "factbook.xml"),
(FileType.ZIP, "simple.zip"), (FileType.ZIP, "simple.zip"),
], ],
) )
def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself( def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mime_type_for_itself(
file_name: str, expected_value: FileType file_name: str, expected_value: FileType
): ):
"""Does not work for all types, in particular: """Does not work for all types, in particular:
@ -339,90 +231,26 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
- ORG is identified as TXT - ORG is identified as TXT
- RST is identified as TXT - RST is identified as TXT
""" """
# -- disable strategy #1 by not asserting a content_type in the call -- # -- disable content-type strategy by not asserting a content_type in the call --
# -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- # -- disable extension-mapping strategy by passing file-like object with no `.name` attribute --
with open(example_doc_path(file_name), "rb") as f: with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read()) file = io.BytesIO(f.read())
assert detect_filetype(file=file) is expected_value assert detect_filetype(file=file) is expected_value
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
],
)
@pytest.mark.parametrize(
"guessed_mime_type",
[
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-outlook",
"application/vnd.ms-powerpoint",
"application/x-ole-storage",
"anything/else",
],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
):
"""Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc.
It's better than that actually, the OLE differentiator will get the right file-type for any DOC,
PPT, XLS, or MSG file, regardless of guessed MIME-type.
"""
ctx_mime_type_.return_value = guessed_mime_type
# -- disable strategy 3 by not providing a file-name source --
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
# -- disable strategy 1 by not asserting a content-type --
file_type = detect_filetype(file=file)
ctx_mime_type_.assert_called_with()
assert file_type is expected_value
@pytest.mark.parametrize(
("filename", "mime_type", "expected"),
[
("fake.doc", "application/vnd.ms-excel", FileType.DOC),
("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT),
("tests-example.xls", "application/msword", FileType.XLS),
("fake-email.msg", "application/vnd.ms-excel", FileType.MSG),
],
)
def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected):
def _guess_mime(*args, **kwargs):
return mime_type
with patch("filetype.guess_mime", _guess_mime):
detect_filetype(example_doc_path(filename)) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
("expected_value", "file_name"), ("expected_value", "file_name"),
[ [
# -- `filetype` lib recognizes all these binary file-types -- # -- `filetype` lib recognizes all these binary file-types --
(FileType.BMP, "img/bmp_24.bmp"), (FileType.BMP, "img/bmp_24.bmp"),
(FileType.DOC, "simple.doc"),
(FileType.DOCX, "simple.docx"),
(FileType.EPUB, "winter-sports.epub"),
(FileType.HEIC, "img/DA-1p.heic"), (FileType.HEIC, "img/DA-1p.heic"),
(FileType.JPG, "img/example.jpg"), (FileType.JPG, "img/example.jpg"),
(FileType.ODT, "simple.odt"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
(FileType.PNG, "img/DA-1p.png"), (FileType.PNG, "img/DA-1p.png"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.RTF, "fake-doc.rtf"), (FileType.RTF, "fake-doc.rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
(FileType.WAV, "CantinaBand3.wav"), (FileType.WAV, "CantinaBand3.wav"),
(FileType.XLS, "tests-example.xls"),
(FileType.XLSX, "stanley-cups.xlsx"),
(FileType.ZIP, "simple.zip"), (FileType.ZIP, "simple.zip"),
# -- but it doesn't recognize textual file-types at all -- # -- but it doesn't recognize textual file-types at all --
(FileType.UNK, "stanley-cups.csv"), (FileType.UNK, "stanley-cups.csv"),
@ -435,11 +263,9 @@ def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, ex
(FileType.UNK, "stanley-cups.tsv"), (FileType.UNK, "stanley-cups.tsv"),
(FileType.UNK, "norwich-city.txt"), (FileType.UNK, "norwich-city.txt"),
(FileType.UNK, "factbook.xml"), (FileType.UNK, "factbook.xml"),
# -- and it doesn't recognize MSG files --
(FileType.UNK, "fake-email.msg"),
], ],
) )
def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable( def test_strategy_mime_guessing_can_detect_only_binary_file_types_when_libmagic_is_unavailable(
file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool
): ):
"""File-type is detected using `filetype` library when libmagic is not available. """File-type is detected using `filetype` library when libmagic is not available.
@ -447,7 +273,7 @@ def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailab
`filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office), `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office),
but doesn't even try to guess textual file-types. but doesn't even try to guess textual file-types.
""" """
# -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- # -- disable detection by extension by passing file-like object with no `.name` attribute --
with open(example_doc_path(file_name), "rb") as f: with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read()) file = io.BytesIO(f.read())
# -- simulate libmagic is not available -- # -- simulate libmagic is not available --
@ -470,7 +296,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
# ================================================================================================ # ================================================================================================
# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE # STRATEGY #4 - MAP FILENAME EXTENSION TO FILETYPE
# ================================================================================================ # ================================================================================================
@ -479,35 +305,25 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
[ [
(FileType.BMP, "img/bmp_24.bmp"), (FileType.BMP, "img/bmp_24.bmp"),
(FileType.CSV, "stanley-cups.csv"), (FileType.CSV, "stanley-cups.csv"),
(FileType.DOC, "simple.doc"),
(FileType.DOCX, "simple.docx"),
(FileType.EML, "eml/fake-email.eml"), (FileType.EML, "eml/fake-email.eml"),
(FileType.EPUB, "winter-sports.epub"),
(FileType.HEIC, "img/DA-1p.heic"), (FileType.HEIC, "img/DA-1p.heic"),
(FileType.HTML, "example-10k-1p.html"), (FileType.HTML, "example-10k-1p.html"),
(FileType.JPG, "img/example.jpg"), (FileType.JPG, "img/example.jpg"),
(FileType.JSON, "spring-weather.html.json"), (FileType.JSON, "spring-weather.html.json"),
(FileType.MD, "README.md"), (FileType.MD, "README.md"),
(FileType.MSG, "fake-email.msg"),
(FileType.ODT, "simple.odt"),
(FileType.ORG, "README.org"), (FileType.ORG, "README.org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
(FileType.PNG, "img/DA-1p.png"), (FileType.PNG, "img/DA-1p.png"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.RST, "README.rst"), (FileType.RST, "README.rst"),
(FileType.RTF, "fake-doc.rtf"), (FileType.RTF, "fake-doc.rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
(FileType.TSV, "stanley-cups.tsv"), (FileType.TSV, "stanley-cups.tsv"),
(FileType.TXT, "norwich-city.txt"), (FileType.TXT, "norwich-city.txt"),
(FileType.WAV, "CantinaBand3.wav"), (FileType.WAV, "CantinaBand3.wav"),
(FileType.XLS, "tests-example.xls"),
(FileType.XLSX, "stanley-cups.xlsx"),
(FileType.XML, "factbook.xml"), (FileType.XML, "factbook.xml"),
(FileType.ZIP, "simple.zip"),
], ],
) )
def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type( def test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type(
file_name: str, expected_value: FileType, ctx_mime_type_: Mock file_name: str, expected_value: FileType, ctx_mime_type_: Mock
): ):
# -- disable strategy #2 by making libmagic always guess `None` -- # -- disable strategy #2 by making libmagic always guess `None` --
@ -525,10 +341,8 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
@pytest.mark.parametrize( @pytest.mark.parametrize(
("expected_value", "file_name", "mime_type"), ("expected_value", "file_name", "mime_type"),
[ [
(FileType.BMP, "img/bmp_24.bmp", "application/zip"), (FileType.BMP, "img/bmp_24.bmp", "application/octet-stream"),
(FileType.DOC, "simple.doc", None), (FileType.HEIC, "img/DA-1p.heic", "application/octet-stream"),
(FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
(FileType.MSG, "fake-email.msg", "application/octet-stream"),
], ],
) )
def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
@ -547,6 +361,12 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
# ================================================================================================ # ================================================================================================
@pytest.mark.parametrize("mime_type", [FileType.XLS.mime_type, FileType.XLSX.mime_type])
def test_it_ignores_asserted_XLS_content_type_when_file_is_CSV(mime_type: str):
file_path = example_doc_path("stanley-cups.csv")
assert detect_filetype(file_path, content_type=mime_type) == FileType.CSV
@pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) @pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"])
@pytest.mark.parametrize("extension", [".html", ".htm"]) @pytest.mark.parametrize("extension", [".html", ".htm"])
def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension(
@ -563,39 +383,6 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi
assert file_type is FileType.HTML assert file_type is FileType.HTML
@pytest.mark.parametrize(
"mime_type",
[
"application/octet-stream",
"application/zip",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
],
)
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOCX, "simple.docx"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.XLSX, "stanley-cups.xlsx"),
(FileType.ZIP, "simple.zip"),
],
)
def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office(
mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock
):
ctx_mime_type_.return_value = mime_type
# -- disable extension-based strategy #3 --
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
file_type = detect_filetype(file=file)
ctx_mime_type_.assert_called_with()
assert file_type is expected_value
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mime_type", "file_name"), ("mime_type", "file_name"),
[ [
@ -1000,29 +787,8 @@ class Describe_FileTypeDetectionContext:
return property_mock(request, _FileTypeDetectionContext, "mime_type") return property_mock(request, _FileTypeDetectionContext, "mime_type")
class Describe_OleFileDifferentiator: class Describe_OleFileDetector:
"""Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`.""" """Unit-test suite for `unstructured.file_utils.filetype._OleFileDetector`."""
# -- .applies() ---------------------------------------------
def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
"""The constructor determines whether this differentiator is applicable.
It returns an instance only when differentiating a CFBF file-type is required, which it
judges by inspecting the initial bytes of the file for the CFBF magic-bytes.
"""
ctx = _FileTypeDetectionContext(example_doc_path("simple.doc"))
differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar")
assert differentiator is not None
assert isinstance(differentiator, _OleFileDifferentiator)
def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub"))
assert _OleFileDifferentiator.applies(ctx, "application/epub") is None
# -- .file_type ---------------------------------------------
@pytest.mark.parametrize( @pytest.mark.parametrize(
("file_name", "expected_value"), ("file_name", "expected_value"),
@ -1034,59 +800,15 @@ class Describe_OleFileDifferentiator:
("README.org", None), ("README.org", None),
], ],
) )
def it_distinguishes_the_file_type_of_applicable_OLE_files( def it_distinguishes_the_file_type_of_applicable_CFB_files(
self, file_name: str, expected_value: FileType | None self, file_name: str, expected_value: FileType | None
): ):
# -- no file-name available, just to make sure we're not relying on an extension -- # -- no file-name available, just to make sure we're not relying on an extension --
with open(example_doc_path(file_name), "rb") as f: with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read()) file = io.BytesIO(f.read())
ctx = _FileTypeDetectionContext(file=file) ctx = _FileTypeDetectionContext(file=file)
differentiator = _OleFileDifferentiator(ctx)
assert differentiator.file_type is expected_value assert _OleFileDetector.file_type(ctx) is expected_value
@pytest.mark.parametrize(
("file_name", "expected_value"),
[
("simple.doc", FileType.DOC),
("fake-power-point.ppt", FileType.PPT),
("tests-example.xls", FileType.XLS),
("fake-email.msg", FileType.MSG),
],
)
def it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content(
self, file_name: str, expected_value: FileType | None
):
# -- no file-name available, just to make sure we're not relying on an extension --
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
ctx = _FileTypeDetectionContext(file=file)
differentiator = _OleFileDifferentiator(ctx)
assert differentiator._check_ole_file_type(ctx) is expected_value
def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime(
self, guess_mime_: Mock
):
guess_mime_.return_value = None
# -- no file-name available, just to make sure we're not relying on an extension --
with open(example_doc_path("fake-email.msg"), "rb") as f:
file = io.BytesIO(f.read())
ctx = _FileTypeDetectionContext(file=file)
differentiator = _OleFileDifferentiator(ctx)
# -- force method to return None to trigger the mime type being guessed
differentiator._check_ole_file_type = lambda ctx: None
file_type = differentiator.file_type
guess_mime_.assert_called_once_with(file)
assert file_type is None
# -- fixtures --------------------------------------------------------------------------------
@pytest.fixture
def guess_mime_(self, request: FixtureRequest):
return function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime")
class Describe_TextFileDifferentiator: class Describe_TextFileDifferentiator:
@ -1164,33 +886,15 @@ class Describe_TextFileDifferentiator:
assert differentiator._is_json is expected_value assert differentiator._is_json is expected_value
class Describe_ZipFileDifferentiator: class Describe_ZipFileDetector:
"""Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`.""" """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDetector`."""
# -- .applies() ---------------------------------------------
def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
"""The constructor determines whether this differentiator is applicable.
It returns an instance only when differentiating a zip file-type is required, which it can
judge from the mime-type provided by the context (`ctx`).
"""
ctx = _FileTypeDetectionContext(example_doc_path("simple.docx"))
differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip")
assert isinstance(differentiator, _ZipFileDifferentiator)
def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self):
ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt"))
assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None
# -- .file_type ---------------------------------------------
@pytest.mark.parametrize( @pytest.mark.parametrize(
("file_name", "expected_value"), ("file_name", "expected_value"),
[ [
("simple.docx", FileType.DOCX), ("simple.docx", FileType.DOCX),
("winter-sports.epub", FileType.EPUB),
("simple.odt", FileType.ODT),
("picture.pptx", FileType.PPTX), ("picture.pptx", FileType.PPTX),
("vodafone.xlsx", FileType.XLSX), ("vodafone.xlsx", FileType.XLSX),
("simple.zip", FileType.ZIP), ("simple.zip", FileType.ZIP),
@ -1201,6 +905,4 @@ class Describe_ZipFileDifferentiator:
self, file_name: str, expected_value: FileType | None self, file_name: str, expected_value: FileType | None
): ):
ctx = _FileTypeDetectionContext(example_doc_path(file_name)) ctx = _FileTypeDetectionContext(example_doc_path(file_name))
differentiator = _ZipFileDifferentiator(ctx) assert _ZipFileDetector.file_type(ctx) is expected_value
assert differentiator.file_type is expected_value

View File

@ -1 +1 @@
__version__ = "0.16.12-dev2" # pragma: no cover __version__ = "0.16.12-dev3" # pragma: no cover

View File

@ -51,7 +51,11 @@ from unstructured.partition.common.common import add_element_metadata, exactly_o
from unstructured.partition.common.metadata import set_element_hierarchy from unstructured.partition.common.metadata import set_element_hierarchy
from unstructured.utils import get_call_args_applying_defaults, lazyproperty from unstructured.utils import get_call_args_applying_defaults, lazyproperty
LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) try:
importlib.import_module("magic")
LIBMAGIC_AVAILABLE = True
except ImportError:
LIBMAGIC_AVAILABLE = False # pyright: ignore[reportConstantRedefinition]
def detect_filetype( def detect_filetype(
@ -133,43 +137,57 @@ class _FileTypeDetector:
@property @property
def _file_type(self) -> FileType: def _file_type(self) -> FileType:
"""FileType member corresponding to this document source.""" """FileType member corresponding to this document source."""
# -- strategy 1: use content-type asserted by caller -- # -- An explicit content-type is most commonly asserted by the client/SDK and is therefore
# -- inherently unreliable. On the other hand, binary file-types can be detected with 100%
# -- accuracy. So start with binary types and only then consider an asserted content-type,
# -- generally as a last resort.
# -- strategy 1: most binary types can be detected with 100% accuracy --
if file_type := self._known_binary_file_type:
return file_type
# -- strategy 2: use content-type asserted by caller --
if file_type := self._file_type_from_content_type: if file_type := self._file_type_from_content_type:
return file_type return file_type
# -- strategy 2: guess MIME-type using libmagic and use that -- # -- strategy 3: guess MIME-type using libmagic and use that --
if file_type := self._file_type_from_guessed_mime_type: if file_type := self._file_type_from_guessed_mime_type:
return file_type return file_type
# -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX -- # -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
if file_type := self._file_type_from_file_extension: if file_type := self._file_type_from_file_extension:
return file_type return file_type
# -- strategy 4: give up and report FileType.UNK -- # -- strategy 5: give up and report FileType.UNK --
return FileType.UNK return FileType.UNK
# == STRATEGIES ============================================================ # == STRATEGIES ============================================================
@property @property
def _file_type_from_content_type(self) -> FileType | None: def _known_binary_file_type(self) -> FileType | None:
"""Map passed content-type argument to a file-type, subject to certain rules.""" """Detect file-type for binary types we can positively detect."""
content_type = self._ctx.content_type if file_type := _OleFileDetector.file_type(self._ctx):
return file_type
self._ctx.rule_out_cfb_content_types()
if file_type := _ZipFileDetector.file_type(self._ctx):
return file_type
self._ctx.rule_out_zip_content_types()
# -- when no content-type was asserted by caller, this strategy is not applicable --
if not content_type:
return None return None
# -- OLE-based file-format content_type values are sometimes unreliable. These are @property
# -- DOC, PPT, XLS, and MSG. def _file_type_from_content_type(self) -> FileType | None:
if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type): """Map passed content-type argument to a file-type, subject to certain rules."""
return differentiator.file_type
# -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable -- # -- when no content-type was asserted by caller, this strategy is not applicable --
if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type): if not self._ctx.content_type:
return differentiator.file_type return None
# -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
return FileType.from_mime_type(content_type) return FileType.from_mime_type(self._ctx.content_type)
@property @property
def _file_type_from_guessed_mime_type(self) -> FileType | None: def _file_type_from_guessed_mime_type(self) -> FileType | None:
@ -188,24 +206,12 @@ class _FileTypeDetector:
if mime_type is None: if mime_type is None:
return None return None
if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type):
return differentiator.file_type
if mime_type.endswith("xml"): if mime_type.endswith("xml"):
return FileType.HTML if extension in (".html", ".htm") else FileType.XML return FileType.HTML if extension in (".html", ".htm") else FileType.XML
if differentiator := _TextFileDifferentiator.applies(self._ctx): if differentiator := _TextFileDifferentiator.applies(self._ctx):
return differentiator.file_type return differentiator.file_type
# -- applicable to "application/octet-stream", "application/zip", and all Office 2007+
# -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT
# -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and
# -- octet-stream MIME-types are fed in because they are ambiguous. The MS-Office types are
# -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type
# -- is actually a PPTX file etc.
if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type):
return differentiator.file_type
# -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment -- # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment --
if self._ctx.has_code_mime_type: if self._ctx.has_code_mime_type:
return FileType.TXT return FileType.TXT
@ -214,14 +220,8 @@ class _FileTypeDetector:
return FileType.EMPTY return FileType.EMPTY
# -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present -- # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present --
if file_type := FileType.from_mime_type(mime_type): file_type = FileType.from_mime_type(mime_type)
return file_type return file_type if file_type != FileType.UNK else None
logger.warning(
f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is"
f" {mime_type!r}. This file type is not currently supported in unstructured.",
)
return None
@lazyproperty @lazyproperty
def _file_type_from_file_extension(self) -> FileType | None: def _file_type_from_file_extension(self) -> FileType | None:
@ -236,6 +236,9 @@ class _FileTypeDetector:
class _FileTypeDetectionContext: class _FileTypeDetectionContext:
"""Provides all arguments to auto-file detection and values derived from them. """Provides all arguments to auto-file detection and values derived from them.
NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should
not be assumed to be a constant value across those calls.
This keeps computation of derived values out of the file-detection code but more importantly This keeps computation of derived values out of the file-detection code but more importantly
allows the main filetype-detector to pass the full context to any delegates without coupling allows the main filetype-detector to pass the full context to any delegates without coupling
itself to which values it might need. itself to which values it might need.
@ -276,7 +279,7 @@ class _FileTypeDetectionContext:
self._validate() self._validate()
return self return self
@lazyproperty @property
def content_type(self) -> str | None: def content_type(self) -> str | None:
"""MIME-type asserted by caller; not based on inspection of file by this process. """MIME-type asserted by caller; not based on inspection of file by this process.
@ -284,6 +287,8 @@ class _FileTypeDetectionContext:
present on the response. These are often ambiguous and sometimes just wrong so get some present on the response. These are often ambiguous and sometimes just wrong so get some
further verification. All lower-case when not `None`. further verification. All lower-case when not `None`.
""" """
# -- Note `._content_type` is mutable via the `.rule_out_*_content_types()` methods so this cannot be a
# -- `@lazyproperty`.
return self._content_type.lower() if self._content_type else None return self._content_type.lower() if self._content_type else None
@lazyproperty @lazyproperty
@ -327,12 +332,6 @@ class _FileTypeDetectionContext:
return os.path.realpath(file_path) if os.path.islink(file_path) else file_path return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
@lazyproperty
def is_zipfile(self) -> bool:
"""True when file is a Zip archive."""
with self.open() as file:
return zipfile.is_zipfile(file)
@lazyproperty @lazyproperty
def has_code_mime_type(self) -> bool: def has_code_mime_type(self) -> bool:
"""True when `mime_type` plausibly indicates a programming language source-code file.""" """True when `mime_type` plausibly indicates a programming language source-code file."""
@ -347,9 +346,27 @@ class _FileTypeDetectionContext:
return any( return any(
lang in mime_type lang in mime_type
for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split() for lang in [
"c#",
"c++",
"cpp",
"csharp",
"java",
"javascript",
"php",
"python",
"ruby",
"swift",
"typescript",
]
) )
@lazyproperty
def is_zipfile(self) -> bool:
"""True when file is a Zip archive."""
with self.open() as file:
return zipfile.is_zipfile(file)
@lazyproperty @lazyproperty
def mime_type(self) -> str | None: def mime_type(self) -> str | None:
"""The best MIME-type we can get from `magic` (or `filetype` package). """The best MIME-type we can get from `magic` (or `filetype` package).
@ -401,6 +418,38 @@ class _FileTypeDetectionContext:
file.seek(0) file.seek(0)
yield file yield file
def rule_out_cfb_content_types(self) -> None:
"""Invalidate content-type when a legacy MS-Office file-type is asserted.
Used before returning `None`; at that point we know the file is not one of these formats
so if the asserted `content-type` is a legacy MS-Office type we know it's wrong and should
not be used as a fallback later in the detection process.
"""
if FileType.from_mime_type(self._content_type) in (
FileType.DOC,
FileType.MSG,
FileType.PPT,
FileType.XLS,
):
self._content_type = None
def rule_out_zip_content_types(self) -> None:
"""Invalidate content-type when a Zip-based file-type is asserted.
Used before returning `None`; at that point we know the file is not one of these formats
so if the asserted `content-type` is a Zip-based type (DOCX, EPUB, ODT, PPTX, XLSX, or ZIP) we
know it's wrong and should not be used as a fallback later in the detection process.
"""
if FileType.from_mime_type(self._content_type) in (
FileType.DOCX,
FileType.EPUB,
FileType.ODT,
FileType.PPTX,
FileType.XLSX,
FileType.ZIP,
):
self._content_type = None
@lazyproperty @lazyproperty
def text_head(self) -> str: def text_head(self) -> str:
"""The initial characters of the text file for use with text-format differentiation. """The initial characters of the text file for use with text-format differentiation.
@ -440,27 +489,23 @@ class _FileTypeDetectionContext:
raise ValueError("either `file_path` or `file` argument must be provided") raise ValueError("either `file_path` or `file` argument must be provided")
class _OleFileDifferentiator: class _OleFileDetector:
"""Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be. """Detect and differentiate a CFB file, aka. "OLE" file.
Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office Compound File Binary Format (CFB), aka. OLE file, is used by Microsoft for legacy MS Office
files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as files (DOC, PPT, XLS) as well as for Outlook MSG files.
`"application/x-ole-storage"` which is true but too not specific enough for partitioning
purposes.
""" """
def __init__(self, ctx: _FileTypeDetectionContext): def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx self._ctx = ctx
@classmethod @classmethod
def applies( def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
cls, ctx: _FileTypeDetectionContext, mime_type: str """Specific file-type when file is a CFB file, `None` otherwise."""
) -> _OleFileDifferentiator | None: return cls(ctx)._file_type
"""Constructs an instance, but only if this differentiator applies for `mime_type`."""
return cls(ctx) if cls._is_ole_file(ctx) else None
@property @property
def file_type(self) -> FileType | None: def _file_type(self) -> FileType | None:
"""Differentiated file-type for Microsoft Compound File Binary Format (CFBF). """Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
Returns one of: Returns one of:
@ -468,34 +513,27 @@ class _OleFileDifferentiator:
- `FileType.PPT` - `FileType.PPT`
- `FileType.XLS` - `FileType.XLS`
- `FileType.MSG` - `FileType.MSG`
- `None` when the file is not one of these.
""" """
# -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return # -- all CFB files share common magic number, start with that --
# -- `None` to trigger fall-back to next strategy. if not self._is_ole_file:
if not self._is_ole_file(self._ctx):
return None return None
# -- check storage contents of the ole file for file type markers # -- check storage contents of the ole file for file-type specific stream names --
if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None: if (ole_file_type := self._ole_file_type) is not None:
return ole_file_type return ole_file_type
# -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it return None
# -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always
# -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we @lazyproperty
# -- rely on filename-extension to identify those. def _is_ole_file(self) -> bool:
"""True when file has CFB magic first 8 bytes."""
with self._ctx.open() as file: with self._ctx.open() as file:
mime_type = ft.guess_mime(file)
return FileType.from_mime_type(mime_type) if mime_type else None
@staticmethod
def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
"""True when file has CFBF magic first 8 bytes."""
with ctx.open() as file:
return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
@staticmethod @lazyproperty
def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None: def _ole_file_type(self) -> FileType | None:
with ctx.open() as f: with self._ctx.open() as f:
ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType] ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType]
root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType] root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType]
@ -537,7 +575,20 @@ class _TextFileDifferentiator:
""" """
extension = self._ctx.extension extension = self._ctx.extension
if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split(): if extension in [
".csv",
".eml",
".html",
".json",
".markdown",
".md",
".org",
".p7s",
".rst",
".rtf",
".tab",
".tsv",
]:
return FileType.from_extension(extension) or FileType.TXT return FileType.from_extension(extension) or FileType.TXT
# NOTE(crag): for older versions of the OS libmagic package, such as is currently # NOTE(crag): for older versions of the OS libmagic package, such as is currently
@ -616,40 +667,28 @@ class _TextFileDifferentiator:
return False return False
class _ZipFileDifferentiator: class _ZipFileDetector:
"""Refine a Zip-packaged file-type that may be ambiguous or swapped.""" """Detect and differentiate a Zip-archive file."""
def __init__(self, ctx: _FileTypeDetectionContext): def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx self._ctx = ctx
@classmethod @classmethod
def applies( def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
cls, ctx: _FileTypeDetectionContext, mime_type: str """Most specific file-type available when file is a Zip file, `None` otherwise.
) -> _ZipFileDifferentiator | None:
"""Constructs an instance, but only if this differentiator applies for `mime_type`.
Separate `mime_type` argument allows it to be applied to either asserted content-type or MS-Office 2007+ files are detected with 100% accuracy. ODT and EPUB files are identified from
guessed mime-type. the `mimetype` member in the archive root. Any other Zip archive is reported as
`FileType.ZIP`. `None` is returned when the file is not a Zip archive.
""" """
return ( return cls(ctx)._file_type
cls(ctx)
if mime_type
in (
"application/octet-stream",
"application/zip",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)
else None
)
@lazyproperty @lazyproperty
def file_type(self) -> FileType | None: def _file_type(self) -> FileType | None:
"""Differentiated file-type for a Zip archive. """Differentiated file-type for a Zip archive.
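Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, Returns `FileType.DOCX`, `FileType.PPTX`, or `FileType.XLSX` when one of those applies,
`FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise. the type named by a root `mimetype` member (e.g. ODT, EPUB) when present, `FileType.ZIP` for any other Zip archive, and `None` when the file is not a Zip archive.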
""" """
if not self._ctx.is_zipfile: if not self._ctx.is_zipfile:
return None return None
@ -657,20 +696,23 @@ class _ZipFileDifferentiator:
with self._ctx.open() as file: with self._ctx.open() as file:
zip = zipfile.ZipFile(file) zip = zipfile.ZipFile(file)
# NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xslx filenames = zip.namelist()
# extension. If the MIME type is application/octet-stream, we check if it's a
# .docx/.xlsx file by looking for expected filenames within the zip file.
filenames = [f.filename for f in zip.filelist]
if all(f in filenames for f in ("word/document.xml",)): if "word/document.xml" in filenames:
return FileType.DOCX return FileType.DOCX
if all(f in filenames for f in ("xl/workbook.xml",)): if "xl/workbook.xml" in filenames:
return FileType.XLSX return FileType.XLSX
if all(f in filenames for f in ("ppt/presentation.xml",)): if "ppt/presentation.xml" in filenames:
return FileType.PPTX return FileType.PPTX
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
if "mimetype" in filenames:
with zip.open("mimetype") as f:
mime_type = f.read().decode("utf-8").strip()
return FileType.from_mime_type(mime_type)
return FileType.ZIP return FileType.ZIP