diff --git a/CHANGELOG.md b/CHANGELOG.md index dbe51c5ba..4999a4975 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.1-dev1 +## 0.15.1-dev2 ### Enhancements @@ -7,6 +7,10 @@ ### Fixes * **Update import of Pinecone exception** Adds compatibility for pinecone-client>=5.0.0 +* **File-type detection catches non-existent file-path.** `detect_filetype()` no longer silently falls back to detecting a file-type based on the extension when no file exists at the path provided. Instead, `FileNotFoundError` is raised. This provides consistent user notification of a mis-typed path rather than an unpredictable exception from a file-type-specific partitioner when the file cannot be opened. +* **EML files specified as a file-path are detected correctly.** Resolves a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments. +* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension-based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object. +* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server. 
## 0.15.0 diff --git a/example-docs/simple.pptx b/example-docs/simple.pptx new file mode 100644 index 000000000..ab165bb5b Binary files /dev/null and b/example-docs/simple.pptx differ diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 545f441a0..fac88ab2d 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -6,550 +6,971 @@ from __future__ import annotations import io import os -import pathlib -import magic import pytest from test_unstructured.unit_utils import ( FixtureRequest, LogCaptureFixture, Mock, - MonkeyPatch, - call, example_doc_path, - method_mock, + patch, + property_mock, ) -from unstructured.file_utils import filetype from unstructured.file_utils.filetype import ( - _detect_filetype_from_octet_stream, - _is_code_mime_type, - _is_text_file_a_csv, - _is_text_file_a_json, + _FileTypeDetectionContext, + _TextFileDifferentiator, + _ZipFileDifferentiator, detect_filetype, ) from unstructured.file_utils.model import FileType is_in_docker = os.path.exists("/.dockerenv") - -@pytest.mark.parametrize( - ("file_name", "expected_value"), - [ - ("layout-parser-paper-fast.pdf", FileType.PDF), - ("fake.docx", FileType.DOCX), - ("example.jpg", FileType.JPG), - ("fake-text.txt", FileType.TXT), - ("eml/fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), - ("example-10k.html", FileType.HTML), - ("fake-html.html", FileType.HTML), - ("stanley-cups.xlsx", FileType.XLSX), - ("stanley-cups.csv", FileType.CSV), - ("stanley-cups.tsv", FileType.TSV), - ("fake-power-point.pptx", FileType.PPTX), - ("winter-sports.epub", FileType.EPUB), - ("spring-weather.html.json", FileType.JSON), - ("README.org", FileType.ORG), - ("README.rst", FileType.RST), - ("README.md", FileType.MD), - ("fake.odt", FileType.ODT), - ("fake-incomplete-json.txt", FileType.TXT), - ], -) -def test_detect_filetype_from_filename(file_name: str, expected_value: FileType): - assert 
detect_filetype(example_doc_path(file_name)) == expected_value +# ================================================================================================ +# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL +# ================================================================================================ @pytest.mark.parametrize( - ("file_name", "expected_value"), + ("expected_value", "file_name", "content_type"), [ - ("layout-parser-paper-fast.pdf", FileType.PDF), - ("fake.docx", FileType.DOCX), - ("example.jpg", FileType.JPG), - ("fake-text.txt", FileType.TXT), - ("eml/fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), - ("example-10k.html", FileType.HTML), - ("fake-html.html", FileType.HTML), - ("stanley-cups.xlsx", FileType.XLSX), - ("stanley-cups.csv", FileType.CSV), - ("stanley-cups.tsv", FileType.TSV), - ("fake-power-point.pptx", FileType.PPTX), - ("winter-sports.epub", FileType.EPUB), - ("fake-doc.rtf", FileType.RTF), - ("spring-weather.html.json", FileType.JSON), - ("fake.odt", FileType.ODT), - ("fake-incomplete-json.txt", FileType.TXT), + (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), + (FileType.CSV, "stanley-cups.csv", "text/csv"), + (FileType.DOC, "simple.doc", "application/msword"), + ( + FileType.DOCX, + "simple.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + (FileType.EML, "eml/fake-email.eml", "message/rfc822"), + (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), + (FileType.HEIC, "img/DA-1p.heic", "image/heic"), + (FileType.HTML, "example-10k-1p.html", "text/html"), + (FileType.JPG, "img/example.jpg", "image/jpeg"), + (FileType.JSON, "spring-weather.html.json", "application/json"), + (FileType.MD, "README.md", "text/markdown"), + (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), + (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), + (FileType.ORG, "README.org", "text/org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", 
"application/pdf"), + (FileType.PNG, "img/DA-1p.png", "image/png"), + (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), + ( + FileType.PPTX, + "fake-power-point.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (FileType.RST, "README.rst", "text/x-rst"), + (FileType.RTF, "fake-doc.rtf", "text/rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), + (FileType.TSV, "stanley-cups.tsv", "text/tsv"), + (FileType.TXT, "norwich-city.txt", "text/plain"), + (FileType.WAV, "CantinaBand3.wav", "audio/wav"), + (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), + ( + FileType.XLSX, + "stanley-cups.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + (FileType.XML, "factbook.xml", "application/xml"), + (FileType.ZIP, "simple.zip", "application/zip"), ], ) -def test_detect_filetype_from_filename_with_extension( - file_name: str, expected_value: FileType, monkeypatch: MonkeyPatch +def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( + file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - """File-type is detected from extension when libmagic not available or file does not exist.""" - # -- when libmagic is not available -- - monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False) - assert detect_filetype(example_doc_path(file_name)) == expected_value - # -- when file does not exist -- - monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", True) - extension = pathlib.Path(file_name).suffix - assert detect_filetype(example_doc_path("not-on-disk" + extension)) == expected_value + # -- disable strategy #2, leaving only asserted content-type and extension -- + ctx_mime_type_.return_value = None + + file_type = detect_filetype(example_doc_path(file_name), content_type=content_type) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall back 
to strategy 2 for any of these test cases. + ctx_mime_type_.assert_not_called() + assert file_type == expected_value @pytest.mark.parametrize( - ("file_name", "expected_value"), + ("expected_value", "file_name", "content_type"), [ - ("pdf/layout-parser-paper-fast.pdf", [FileType.PDF]), - ("fake.docx", [FileType.DOCX]), - ("img/example.jpg", [FileType.JPG]), - ("fake-text.txt", [FileType.TXT]), - ("eml/fake-email.eml", [FileType.EML]), - ("factbook.xml", [FileType.XML]), - # NOTE(robinson]) - For the document, some operating systems return - # */xml and some return */html. Either could be acceptable depending on the OS - ("example-10k.html", [FileType.HTML, FileType.XML]), - ("fake-html.html", [FileType.HTML]), - ("stanley-cups.xlsx", [FileType.XLSX]), - ("stanley-cups.csv", [FileType.CSV]), - ("stanley-cups.tsv", [FileType.TSV]), - ("fake-power-point.pptx", [FileType.PPTX]), - ("winter-sports.epub", [FileType.EPUB]), - ("fake-incomplete-json.txt", [FileType.TXT]), + (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), + (FileType.CSV, "stanley-cups.csv", "text/csv"), + (FileType.DOC, "simple.doc", "application/msword"), + ( + FileType.DOCX, + "simple.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + (FileType.EML, "eml/fake-email.eml", "message/rfc822"), + (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), + (FileType.HEIC, "img/DA-1p.heic", "image/heic"), + (FileType.HTML, "example-10k-1p.html", "text/html"), + (FileType.JPG, "img/example.jpg", "image/jpeg"), + (FileType.JSON, "spring-weather.html.json", "application/json"), + (FileType.MD, "README.md", "text/markdown"), + (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), + (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), + (FileType.ORG, "README.org", "text/org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), + (FileType.PNG, "img/DA-1p.png", "image/png"), + (FileType.PPT, "fake-power-point.ppt", 
"application/vnd.ms-powerpoint"), + ( + FileType.PPTX, + "fake-power-point.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (FileType.RST, "README.rst", "text/x-rst"), + (FileType.RTF, "fake-doc.rtf", "text/rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), + (FileType.TSV, "stanley-cups.tsv", "text/tsv"), + (FileType.TXT, "norwich-city.txt", "text/plain"), + (FileType.WAV, "CantinaBand3.wav", "audio/wav"), + (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), + ( + FileType.XLSX, + "stanley-cups.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + (FileType.XML, "factbook.xml", "application/xml"), + (FileType.ZIP, "simple.zip", "application/zip"), ], ) -def test_detect_filetype_from_file(file_name: str, expected_value: list[FileType]): +def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type( + file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- disable strategy #2 (guessed mime-type) -- + ctx_mime_type_.return_value = None + # -- disable strategy #3 (filename extension) by supplying no source of file name -- with open(example_doc_path(file_name), "rb") as f: - assert detect_filetype(file=f) in expected_value + file = io.BytesIO(f.read()) + + file_type = detect_filetype(file=file, content_type=content_type) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall-back to strategy 2 for any of these test cases. 
+ ctx_mime_type_.assert_not_called() + assert file_type is expected_value + + +# ================================================================================================ +# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC +# ================================================================================================ + + +@pytest.mark.parametrize( + ("expected_value", "file_name", "mime_type"), + [ + (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), + (FileType.CSV, "stanley-cups.csv", "text/csv"), + (FileType.CSV, "stanley-cups.csv", "application/csv"), + (FileType.CSV, "stanley-cups.csv", "application/x-csv"), + (FileType.DOC, "simple.doc", "application/msword"), + ( + FileType.DOCX, + "simple.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + (FileType.EML, "eml/fake-email.eml", "message/rfc822"), + (FileType.EPUB, "winter-sports.epub", "application/epub"), + (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), + (FileType.HEIC, "img/DA-1p.heic", "image/heic"), + (FileType.HTML, "example-10k-1p.html", "text/html"), + (FileType.JPG, "img/example.jpg", "image/jpeg"), + (FileType.JSON, "spring-weather.html.json", "application/json"), + (FileType.MD, "README.md", "text/markdown"), + (FileType.MD, "README.md", "text/x-markdown"), + (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), + (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), + (FileType.ORG, "README.org", "text/org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), + (FileType.PNG, "img/DA-1p.png", "image/png"), + (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), + ( + FileType.PPTX, + "fake-power-point.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (FileType.RST, "README.rst", "text/x-rst"), + (FileType.RTF, "fake-doc.rtf", "text/rtf"), + (FileType.RTF, "fake-doc.rtf", "application/rtf"), + (FileType.TIFF, 
"img/layout-parser-paper-fast.tiff", "image/tiff"), + (FileType.TSV, "stanley-cups.tsv", "text/tsv"), + (FileType.TXT, "norwich-city.txt", "text/plain"), + (FileType.TXT, "simple.yaml", "text/yaml"), + (FileType.WAV, "CantinaBand3.wav", "audio/wav"), + (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), + ( + FileType.XLSX, + "stanley-cups.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + (FileType.XML, "factbook.xml", "application/xml"), + (FileType.XML, "factbook.xml", "text/xml"), + (FileType.ZIP, "simple.zip", "application/zip"), + ], +) +def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type( + file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- libmagic guesses a MIME-type mapped to a `FileType` -- + ctx_mime_type_.return_value = mime_type + # -- disable strategy #3 (filename extension) by not providing filename -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + + # -- disable strategy #1 by not asserting a content_type in the call -- + file_type = detect_filetype(file=file) + + # -- ctx.mime_type may be referenced multiple times, but at least once -- + ctx_mime_type_.assert_called_with() + assert file_type is expected_value + + +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.BMP, "img/bmp_24.bmp"), + (FileType.CSV, "stanley-cups.csv"), + (FileType.DOCX, "simple.docx"), + (FileType.EML, "eml/fake-email.eml"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.HEIC, "img/DA-1p.heic"), + (FileType.HTML, "ideas-page.html"), + (FileType.JPG, "img/example.jpg"), + (FileType.JSON, "spring-weather.html.json"), + (FileType.ODT, "simple.odt"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), + (FileType.PNG, "img/DA-1p.png"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.RTF, "fake-doc.rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), + 
(FileType.TXT, "norwich-city.txt"), + (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.XML, "factbook.xml"), + (FileType.ZIP, "simple.zip"), + ], +) +def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself( + file_name: str, expected_value: FileType +): + """Does not work for all types, in particular: + + TODOs: + - DOC is misidentified as MSG, TODO on that below. + - MSG is misidentified as UNK, but only on CI. + - PPT is misidentified as MSG, same fix as DOC. + - TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator` + - XLS is misidentified as MSG, same fix as DOC. + + NOCANDOs: w/o an extension I think these are the best we can do. + - MD is identified as TXT + - ORG is identified as TXT + - RST is identified as TXT + """ + # -- disable strategy #1 by not asserting a content_type in the call -- + # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + + assert detect_filetype(file=file) is expected_value + + +# NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right +# ("application/msword"). Need a differentiator for "application/x-ole-storage". 
+@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True) +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOC, "simple.doc"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.XLS, "tests-example.xls"), + # -- only fails on CI, maybe different libmagic version or "magic-files" -- + # (FileType.MSG, "fake-email.msg"), + ], +) +def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type( + file_name: str, expected_value: FileType +): + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + assert detect_filetype(file=file) is expected_value + + +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + # -- `filetype` lib recognizes all these binary file-types -- + (FileType.BMP, "img/bmp_24.bmp"), + (FileType.DOC, "simple.doc"), + (FileType.DOCX, "simple.docx"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.HEIC, "img/DA-1p.heic"), + (FileType.JPG, "img/example.jpg"), + (FileType.ODT, "simple.odt"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), + (FileType.PNG, "img/DA-1p.png"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.RTF, "fake-doc.rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), + (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLS, "tests-example.xls"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.ZIP, "simple.zip"), + # -- but it doesn't recognize textual file-types at all -- + (FileType.UNK, "stanley-cups.csv"), + (FileType.UNK, "eml/fake-email.eml"), + (FileType.UNK, "example-10k-1p.html"), + (FileType.UNK, "spring-weather.html.json"), + (FileType.UNK, "README.md"), + (FileType.UNK, "README.org"), + (FileType.UNK, "README.rst"), + (FileType.UNK, "stanley-cups.tsv"), + (FileType.UNK, "norwich-city.txt"), + (FileType.UNK, "factbook.xml"), + # -- and it doesn't recognize MSG files -- + (FileType.UNK, "fake-email.msg"), + ], +) +def 
test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable( + file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool +): + """File-type is detected using `filetype` library when libmagic is not available. + + `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office), + but doesn't even try to guess textual file-types. + """ + # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + # -- simulate libmagic is not available -- + assert LIBMAGIC_AVAILABLE_False is False + + # -- disable strategy #1 by not asserting a content_type in the call -- + file_type = detect_filetype(file=file) + + assert file_type is expected_value def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( - monkeypatch: MonkeyPatch, caplog: LogCaptureFixture + caplog: LogCaptureFixture, LIBMAGIC_AVAILABLE_False: bool ): - monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False) with open(example_doc_path("fake-text.txt"), "rb") as f: detect_filetype(file=f) assert "WARNING" in caplog.text + assert "libmagic is unavailable but assists in filetype detection. 
Please cons" in caplog.text -def test_detect_XML_from_application_xml_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/xml" - file_path = example_doc_path("factbook.xml") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XML +# ================================================================================================ +# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE +# ================================================================================================ -def test_detect_CSV_from_text_csv_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "text/csv" - file_path = example_doc_path("stanley-cups.csv") +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.BMP, "img/bmp_24.bmp"), + (FileType.CSV, "stanley-cups.csv"), + (FileType.DOC, "simple.doc"), + (FileType.DOCX, "simple.docx"), + (FileType.EML, "eml/fake-email.eml"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.HEIC, "img/DA-1p.heic"), + (FileType.HTML, "example-10k-1p.html"), + (FileType.JPG, "img/example.jpg"), + (FileType.JSON, "spring-weather.html.json"), + (FileType.MD, "README.md"), + (FileType.MSG, "fake-email.msg"), + (FileType.ODT, "simple.odt"), + (FileType.ORG, "README.org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), + (FileType.PNG, "img/DA-1p.png"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.RST, "README.rst"), + (FileType.RTF, "fake-doc.rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), + (FileType.TSV, "stanley-cups.tsv"), + (FileType.TXT, "norwich-city.txt"), + (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLS, "tests-example.xls"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.XML, "factbook.xml"), + (FileType.ZIP, "simple.zip"), + ], +) +def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type( + 
file_name: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- disable strategy #2 by making libmagic always guess `None` -- + ctx_mime_type_.return_value = None - filetype = detect_filetype(file_path) + # -- disable strategy #1 by not asserting a content_type in the call -- + # -- enable strategy #3 by passing filename as source for extension -- + file_type = detect_filetype(example_doc_path(file_name)) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.CSV + # -- ctx.mime_type may be referenced multiple times, but at least once -- + ctx_mime_type_.assert_called_with() + assert file_type is expected_value -def test_detect_TXT_from_text_x_script_python_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "text/x-script.python" - file_path = example_doc_path("logger.py") +@pytest.mark.parametrize( + ("expected_value", "file_name", "mime_type"), + [ + (FileType.BMP, "img/bmp_24.bmp", "application/zip"), + (FileType.DOC, "simple.doc", None), + (FileType.MSG, "fake-email.msg", "application/octet-stream"), + ], +) +def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( + file_name: str, mime_type: str | None, expected_value: FileType, ctx_mime_type_: Mock +): + ctx_mime_type_.return_value = mime_type - filetype = detect_filetype(file_path) + file_type = detect_filetype(example_doc_path(file_name)) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.TXT + ctx_mime_type_.assert_called_with() + assert file_type is expected_value -def test_detect_TXT_from_text_x_script_python_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/x-script.python" - file_path = example_doc_path("logger.py") - - with open(file_path, "rb") as f: - head = f.read(4096) - f.seek(0) - filetype = detect_filetype(file=f) - - magic_from_buffer_.assert_called_once_with(head, mime=True) - assert filetype == FileType.TXT +# 
================================================================================================ +# SPECIAL CASES +# ================================================================================================ -def test_is_code_mime_type_for_Go(): - assert _is_code_mime_type("text/x-go") is True +@pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) +@pytest.mark.parametrize("extension", [".html", ".htm"]) +def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( + mime_type: str, extension: str, ctx_mime_type_: Mock +): + ctx_mime_type_.return_value = mime_type + with open(example_doc_path("example-10k-1p.html"), "rb") as f: + file = io.BytesIO(f.read()) + file.name = f"a/b/page{extension}" + + file_type = detect_filetype(file=file) + + ctx_mime_type_.assert_called_with() + assert file_type is FileType.HTML -def test_detect_TXT_from_text_go_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/x-go" - file_path = example_doc_path("fake.go") - - with open(file_path, "rb") as f: - head = f.read(4096) - f.seek(0) - filetype = detect_filetype(file=f) - - magic_from_buffer_.assert_called_once_with(head, mime=True) - assert filetype == FileType.TXT - - -def test_detect_RTF_from_application_rtf_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/rtf" - file_path = example_doc_path("fake-doc.rtf") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.RTF - - -def test_detect_XML_from_text_xml_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "text/xml" - file_path = example_doc_path("factbook.xml") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XML - - -def test_detect_HTML_from_application_xml_file_path_with_html_extension(magic_from_file_: Mock): - magic_from_file_.return_value = 
"application/xml" - file_path = example_doc_path("fake-html.html") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.HTML - - -def test_detect_HTML_from_text_xml_file_path_with_html_extension(magic_from_file_: Mock): - magic_from_file_.return_value = "text/xml" - file_path = example_doc_path("fake-html.html") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.HTML - - -def test_detect_DOCX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with open(example_doc_path("simple.docx"), "rb") as f: +@pytest.mark.parametrize( + "mime_type", + [ + "application/octet-stream", + "application/zip", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ], +) +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOCX, "simple.docx"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.ZIP, "simple.zip"), + ], +) +def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office( + mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock +): + ctx_mime_type_.return_value = mime_type + # -- disable extension-based strategy #3 -- + with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) - filetype = detect_filetype(file=file) + file_type = detect_filetype(file=file) - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.DOCX + ctx_mime_type_.assert_called_with() + assert file_type is expected_value -def 
test_detect_DOCX_from_application_octet_stream_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/octet-stream" - file_path = example_doc_path("simple.docx") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.DOCX - - -def test_detect_DOCX_from_application_zip_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/zip" - file_path = example_doc_path("simple.docx") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.DOCX - - -def test_detect_ZIP_from_application_zip_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/zip" - file_path = example_doc_path("simple.zip") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.ZIP - - -def test_detect_DOC_from_application_msword_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/msword" - file_path = example_doc_path("fake.doc") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.DOC - - -def test_detect_PPT_from_application_vnd_ms_powerpoint_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/vnd.ms-powerpoint" - file_path = example_doc_path("fake-power-point.ppt") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.PPT - - -def test_detect_XLS_from_application_vnd_ms_excel_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/vnd.ms-excel" - file_path = example_doc_path("tests-example.xls") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == 
FileType.XLS - - -def test_detect_XLSX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with open(example_doc_path("stanley-cups.xlsx"), "rb") as f: +@pytest.mark.parametrize( + ("mime_type", "file_name"), + [ + ("text/x-script.python", "logger.py"), + ("text/x-go", "fake.go"), + ("application/x-javascript", "fake-text.txt"), + ], +) +def test_it_detects_TXT_for_source_code_files(mime_type: str, file_name: str, ctx_mime_type_: Mock): + ctx_mime_type_.return_value = mime_type + # -- disable extension-based strategy #3 -- + with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) - filetype = detect_filetype(file=file) + file_type = detect_filetype(file=file) - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.XLSX + ctx_mime_type_.assert_called_with() + assert file_type is FileType.TXT -def test_detect_XLSX_from_application_octet_stream_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/octet-stream" - file_path = example_doc_path("stanley-cups.xlsx") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XLSX - - -def test_detect_PPTX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with open(example_doc_path("fake-power-point.pptx"), "rb") as f: - file = io.BytesIO(f.read()) - - filetype = detect_filetype(file=file) - - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.PPTX - - -def test_detect_PPTX_from_application_octet_stream_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/octet-stream" - file_path = example_doc_path("fake-power-point.pptx") - - filetype = detect_filetype(file_path) - - 
magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.PPTX - - -def test_detect_UNK_from_application_octet_stream_text_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" +def test_detects_TXT_from_an_unknown_guessed_text_subtype(ctx_mime_type_: Mock): + ctx_mime_type_.return_value = "text/new-type" with open(example_doc_path("fake-text.txt"), "rb") as f: file = io.BytesIO(f.read()) filetype = detect_filetype(file=file) - assert magic_from_buffer_.call_args_list == [ - call(file.getvalue()[:4096], mime=True), - call(b"", mime=True), - ] - assert filetype == FileType.UNK - - -def test_detect_ZIP_from_application_zip_not_a_zip_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/zip" - - with open(example_doc_path("fake-text.txt"), "rb") as f: - head = f.read(4096) - f.seek(0) - filetype = detect_filetype(file=f) - - assert magic_from_buffer_.call_args_list == [ - call(head, mime=True), - call(b"", mime=True), - ] - assert filetype == FileType.ZIP - - -def test_detect_DOCX_from_docx_mime_type_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = ( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ) - with open(example_doc_path("simple.docx"), "rb") as f: - file = io.BytesIO(f.read()) - - filetype = detect_filetype(file=file) - - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.DOCX - - -def test_detect_XLSX_from_xlsx_mime_type_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = ( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ) - with open(example_doc_path("stanley-cups.xlsx"), "rb") as f: - file = io.BytesIO(f.read()) - - filetype = detect_filetype(file=file) - - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.XLSX - - 
-def test_detect_UNK_from_extension_of_non_existent_file_path(): - assert detect_filetype(example_doc_path("made_up.fake")) == FileType.UNK - - -def test_detect_PNG_from_extension_of_non_existent_file_path(): - assert detect_filetype(example_doc_path("made_up.png")) == FileType.PNG - - -def test_detect_TXT_from_unknown_text_subtype_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/new-type" - with open(example_doc_path("fake-text.txt"), "rb") as f: - file = io.BytesIO(f.read()) - - filetype = detect_filetype(file=file) - - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) + ctx_mime_type_.assert_called_with() assert filetype == FileType.TXT -def test_detect_BMP_from_file_path(): - assert detect_filetype(example_doc_path("bmp_24.bmp")) == FileType.BMP - - -def test_detect_BMP_from_file_no_extension(): - with open(example_doc_path("img/bmp_24.bmp"), "rb") as f: - file = io.BytesIO(f.read()) - assert detect_filetype(file=file) == FileType.BMP - - -def test_detect_filetype_raises_when_both_path_and_file_like_object_are_specified(): - file_path = example_doc_path("fake-email.eml") - with open(example_doc_path(file_path), "rb") as f: - file = io.BytesIO(f.read()) - - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."): - detect_filetype(filename=file_path, file=file) - - def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified(): - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."): + with pytest.raises(ValueError, match="either `file_path` or `file` argument must be provided"): detect_filetype() -def test_FileType_is_ordererd_by_name(): - """FileType is a total order on name, e.g. 
FileType.A < FileType.B.""" - assert FileType.EML < FileType.HTML < FileType.XML - - -@pytest.mark.parametrize( - ("content", "expected_value"), - [ - (b"d\xe2\x80", False), # Invalid JSON - (b'[{"key": "value"}]', True), # Valid JSON - (b"", False), # Empty content - (b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt - ], -) -def test_is_text_file_a_json_distinguishes_JSON_from_text(content: bytes, expected_value: bool): - with io.BytesIO(content) as f: - assert _is_text_file_a_json(file=f) == expected_value - - -@pytest.mark.parametrize( - ("content", "expected_value"), - [ - (b"d\xe2\x80", False), # Invalid CSV - (b'[{"key": "value"}]', False), # Invalid CSV - (b"column1,column2,column3\nvalue1,value2,value3\n", True), # Valid CSV - (b"", False), # Empty content - ], -) -def test_is_text_file_a_csv_distinguishes_CSV_from_text(content: bytes, expected_value: bool): - with io.BytesIO(content) as f: - assert _is_text_file_a_csv(file=f) == expected_value - - -def test_csv_and_json_checks_with_filename_accommodate_utf_32_encoded_file(): - file_path = example_doc_path("fake-text-utf-32.txt") - assert _is_text_file_a_csv(filename=file_path) is False - assert _is_text_file_a_json(filename=file_path) is False - - -def test_csv_and_json_checks_with_file_accommodate_utf_32_encoded_content(): - with open(example_doc_path("fake-text-utf-32.txt"), "rb") as f: - file = io.BytesIO(f.read()) - - assert _is_text_file_a_csv(file=file) is False - file.seek(0) - assert _is_text_file_a_json(file=file) is False - - -def test_detect_EMPTY_from_file_path_to_empty_file(): +def test_it_detects_EMPTY_from_file_path_to_empty_file(): assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY -def test_detect_EMPTY_from_file_that_is_empty(): +def test_it_detects_EMPTY_from_empty_file_like_object(): with open(example_doc_path("empty.txt"), "rb") as f: assert detect_filetype(file=f) == FileType.EMPTY -def 
test_detect_CSV_from_path_and_file_when_content_contains_escaped_commas(): +def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas(): file_path = example_doc_path("csv-with-escaped-commas.csv") - assert detect_filetype(filename=file_path) == FileType.CSV + assert detect_filetype(file_path) == FileType.CSV with open(file_path, "rb") as f: assert detect_filetype(file=f) == FileType.CSV -def test_detect_filetype_from_octet_stream(): - with open(example_doc_path("emoji.xlsx"), "rb") as f: - assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX - - -def test_detect_WAV_from_filename(): - assert detect_filetype(example_doc_path("CantinaBand3.wav")) == FileType.WAV - - -def test_detect_wav_from_file(): - with open(example_doc_path("CantinaBand3.wav"), "rb") as f: - assert detect_filetype(file=f) == FileType.WAV - - -def test_detect_TXT_from_file_path_to_yaml(): - assert detect_filetype(example_doc_path("simple.yaml")) == FileType.TXT - - -def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/yaml" - - with open(example_doc_path("simple.yaml"), "rb") as f: - head = f.read(4096) - f.seek(0) - file_type = detect_filetype(file=f) - - magic_from_buffer_.assert_called_once_with(head, mime=True) - assert file_type == FileType.TXT - - # ================================================================================================ # MODULE-LEVEL FIXTURES # ================================================================================================ -# -- `from_buffer()` and `from_file()` are not "methods" on `magic` per-se (`magic` is a module) -# -- but they behave like methods for mocking purposes. 
@pytest.fixture() -def magic_from_buffer_(request: FixtureRequest): - return method_mock(request, magic, "from_buffer") +def LIBMAGIC_AVAILABLE_False(): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False) as m: + yield m @pytest.fixture() -def magic_from_file_(request: FixtureRequest): - return method_mock(request, magic, "from_file") +def ctx_mime_type_(request: FixtureRequest): + return property_mock(request, _FileTypeDetectionContext, "mime_type") + + +# ================================================================================================ +# UNIT-TESTS +# ================================================================================================ + + +class Describe_FileTypeDetectionContext: + """Unit-test suite for `unstructured.file_utils.filetype._FileTypeDetectionContext`.""" + + # -- .new() ------------------------------------------------- + + def it_provides_a_validating_alternate_constructor(self): + ctx = _FileTypeDetectionContext.new( + file_path=example_doc_path("simple.docx"), + file=None, + encoding="utf-8", + content_type="text/plain", + metadata_file_path="a/b/foo.bar", + ) + assert isinstance(ctx, _FileTypeDetectionContext) + + def and_the_validating_constructor_raises_on_an_invalid_context(self): + with pytest.raises(ValueError, match="either `file_path` or `file` argument must be pro"): + _FileTypeDetectionContext.new( + file_path=None, + file=None, + encoding=None, + content_type=None, + metadata_file_path=None, + ) + + # -- .content_type ------------------------------------------ + + def it_knows_the_content_type_asserted_by_the_caller(self): + assert _FileTypeDetectionContext(content_type="TEXT/hTmL").content_type == "text/html" + + # -- .encoding ---------------------------------------------- + + @pytest.mark.parametrize( + ("encoding", "expected_value"), + [ + ("utf-8", "utf-8"), + ("UTF_8", "utf-8"), + ("UTF_16LE", "utf-16le"), + ("ISO_8859_6_I", "iso-8859-6"), + # -- default value is utf-8 -- + (None, 
"utf-8"), + ], + ) + def it_knows_the_encoding_asserted_by_the_caller_and_normalizes_it( + self, encoding: str | None, expected_value: str + ): + assert _FileTypeDetectionContext(encoding=encoding).encoding == expected_value + + # -- .extension --------------------------------------------- + + def it_derives_the_filename_extension_from_the_file_path_when_one_is_provided(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("simple.docx")) + assert ctx.extension == ".docx" + + def and_it_derives_the_extension_from_a_file_opened_from_a_path(self): + with open(example_doc_path("picture.pptx"), "rb") as f: + assert _FileTypeDetectionContext(file=f).extension == ".pptx" + + @pytest.mark.parametrize( + "file_name", + [ + # -- case 1: file-like object has no `.name` attribute + None, + # -- case 2: file-like object has `.name` attribute but it's value is the empty string + "", + ], + ) + def and_it_derives_the_extension_from_metadata_file_path_when_file_object_has_no_name( + self, file_name: str | None + ): + with open(example_doc_path("ideas-page.html"), "rb") as f: + file = io.BytesIO(f.read()) + if file_name is not None: + file.name = file_name + + ctx = _FileTypeDetectionContext(file=file, metadata_file_path="a/b/c.html") + + assert ctx.extension == ".html" + + @pytest.mark.parametrize( + "file_name", + [ + # -- case 1: file-like object has no `.name` attribute + None, + # -- case 2: file-like object has `.name` attribute but it's value is the empty string + "", + ], + ) + def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_sources( + self, file_name: str | None + ): + with open(example_doc_path("ideas-page.html"), "rb") as f: + file = io.BytesIO(f.read()) + if file_name is not None: + file.name = file_name + + assert _FileTypeDetectionContext(file=file).extension == "" + + # -- .file_head --------------------------------------------- + + def it_grabs_the_first_4k_bytes_of_the_file_for_use_by_magic(self): + ctx = 
_FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + + head = ctx.file_head + + assert isinstance(head, bytes) + assert len(head) == 4096 + assert head.startswith(b"Iwan Roberts\nRoberts celebrating after") + + # -- .file_path --------------------------------------------- + + @pytest.mark.parametrize("file_path", [None, "a/b/c.pdf"]) + def it_knows_the_file_path_provided_by_the_caller(self, file_path: str | None): + assert _FileTypeDetectionContext(file_path=file_path).file_path == file_path + + # -- .has_code_mime_type ------------------------------------ + + @pytest.mark.parametrize( + ("mime_type", "expected_value"), + [ + ("text/plain", False), + ("text/x-csharp", True), + ("text/x-go", True), + ("text/x-java", True), + ("text/x-python", True), + ("application/xml", False), + (None, False), + ], + ) + def it_knows_whether_its_mime_type_indicates_programming_language_source_code( + self, mime_type_prop_: Mock, mime_type: str | None, expected_value: bool + ): + mime_type_prop_.return_value = mime_type + assert _FileTypeDetectionContext().has_code_mime_type is expected_value + + # -- .is_zipfile -------------------------------------------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), + [ + ("README.md", False), + ("emoji.xlsx", True), + ("simple.doc", False), + ("simple.docx", True), + ("simple.odt", True), + ("simple.zip", True), + ("winter-sports.epub", True), + ], + ) + def it_knows_whether_it_is_a_zipfile(self, file_name: str, expected_value: bool): + assert _FileTypeDetectionContext(example_doc_path(file_name)).is_zipfile is expected_value + + # -- .mime_type --------------------------------------------- + + def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_path(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + assert ctx.mime_type == "text/plain" + + def and_it_provides_the_MIME_type_from_path_using_filetype_lib_when_magic_is_unavailable(self): + with 
patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("simple.doc")) + assert ctx.mime_type == "application/msword" + + def but_it_warns_to_install_libmagic_when_the_filetype_lib_cannot_detect_the_MIME_type( + self, caplog: LogCaptureFixture + ): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + assert ctx.mime_type is None + assert "WARNING" in caplog.text + assert "libmagic is unavailable" in caplog.text + assert "consider installing libmagic" in caplog.text + + def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_like_object(self): + with open(example_doc_path("norwich-city.txt"), "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + assert ctx.mime_type == "text/plain" + + def and_it_provides_the_MIME_type_from_file_using_filetype_lib_when_magic_is_unavailable(self): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False): + file_path = example_doc_path("simple.doc") + with open(file_path, "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + assert ctx.mime_type == "application/msword" + + # -- .open() ------------------------------------------------ + + def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object(self): + with open(example_doc_path("norwich-city.txt"), "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + with ctx.open() as file: + assert file is f + assert file.read(38) == b"Iwan Roberts\nRoberts celebrating after" + + def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + with ctx.open() as file: + assert file.read(38) == b"Iwan Roberts\nRoberts celebrating after" + + # -- .text_head --------------------------------------------- + + def 
it_grabs_the_first_4k_chars_from_file_path_for_textual_type_differentiation(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + + text_head = ctx.text_head + + assert isinstance(text_head, str) + assert len(text_head) == 4096 + assert text_head.startswith("Iwan Roberts\nRoberts celebrating after") + + def and_it_uses_character_detection_to_correct_a_wrong_encoding_arg_for_file_path(self): + ctx = _FileTypeDetectionContext( + file_path=example_doc_path("norwich-city.txt"), encoding="utf_32_be" + ) + + text_head = ctx.text_head + + assert isinstance(text_head, str) + assert len(text_head) == 4096 + assert text_head.startswith("Iwan Roberts\nRoberts celebrating after") + + def but_not_to_correct_a_wrong_encoding_arg_for_a_file_like_object_open_in_binary_mode(self): + """Fails silently in this case, returning empty string.""" + with open(example_doc_path("norwich-city.txt"), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file, encoding="utf_32_be") + + text_head = ctx.text_head + + assert text_head == "" + + def and_it_grabs_the_first_4k_chars_from_binary_file_for_textual_type_differentiation(self): + with open(example_doc_path("norwich-city.txt"), "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + + text_head = ctx.text_head + + assert isinstance(text_head, str) + # -- some characters consume multiple bytes, so shorter than 4096 -- + assert len(text_head) == 4063 + assert text_head.startswith("Iwan Roberts\nRoberts celebrating after") + + def and_it_grabs_the_first_4k_chars_from_text_file_for_textual_type_differentiation(self): + """Not a documented behavior to accept IO[str], but support is implemented.""" + with open(example_doc_path("norwich-city.txt")) as f: + ctx = _FileTypeDetectionContext(file=f) # pyright: ignore[reportArgumentType] + + text_head = ctx.text_head + + assert isinstance(text_head, str) + assert len(text_head) == 4096 + assert text_head.startswith("Iwan Roberts\nRoberts 
celebrating after") + + def it_accommodates_a_utf_32_encoded_file_path(self): + ctx = _FileTypeDetectionContext(example_doc_path("fake-text-utf-32.txt")) + + text_head = ctx.text_head + + assert isinstance(text_head, str) + # -- test document is short -- + assert len(text_head) == 188 + assert text_head.startswith("This is a test document to use for unit tests.\n\n Doyle") + + # TODO: this fails because `.text_head` ignores decoding errors on a file open for binary + # reading. Probably better if it used chardet in that case as it does for a file-path. + @pytest.mark.xfail(reason="WIP", raises=AssertionError, strict=True) + def and_it_accommodates_a_utf_32_encoded_file_like_object(self): + with open(example_doc_path("fake-text-utf-32.txt"), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file) + + text_head = ctx.text_head + + assert isinstance(text_head, str) + # -- test document is short -- + assert len(text_head) == 188 + assert text_head.startswith("This is a test document to use for unit tests.\n\n Doyle") + + # -- .validate() -------------------------------------------- + + def it_raises_when_no_file_exists_at_the_specified_file_path(self): + with pytest.raises(FileNotFoundError, match="no such file a/b/c.foo"): + _FileTypeDetectionContext(file_path="a/b/c.foo")._validate() + + def it_raises_when_neither_file_path_nor_file_is_provided(self): + with pytest.raises(ValueError, match="either `file_path` or `file` argument must be pro"): + _FileTypeDetectionContext()._validate() + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def mime_type_prop_(self, request: FixtureRequest): + return property_mock(request, _FileTypeDetectionContext, "mime_type") + + +class Describe_TextFileDifferentiator: + """Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`.""" + + # -- .applies() --------------------------------------------- + + def 
it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): + """The constructor determines whether this differentiator is applicable. + + It returns an instance only when differentiating a text file-type is required, which it can + judge from the context (`ctx`). + """ + ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) + + differentiator = _TextFileDifferentiator.applies(ctx) + + assert isinstance(differentiator, _TextFileDifferentiator) + + def and_it_returns_None_when_text_differentiation_does_not_apply_to_the_detection_context(self): + ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) + assert _TextFileDifferentiator.applies(ctx) is None + + # -- ._is_csv ----------------------------------------------- + + @pytest.mark.parametrize( + ("content", "expected_value"), + [ + # -- no commas, too few lines -- + (b"d\xe2\x80", False), + (b'[{"key": "value"}]', False), + # -- at least a header and one data row, at least two columns -- + (b"column1,column2,column3\nvalue1,value2,value3\n", True), + # -- no content -- + (b"", False), + ], + ) + def it_distinguishes_a_CSV_file_from_other_text_files( + self, content: bytes, expected_value: bool + ): + ctx = _FileTypeDetectionContext(file=io.BytesIO(content)) + differentiator = _TextFileDifferentiator(ctx) + + assert differentiator._is_csv is expected_value + + # -- ._is_eml ----------------------------------------------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), [("fake-email.eml", True), ("norwich-city.txt", False)] + ) + def it_distinguishes_an_EML_file_from_other_text_files( + self, file_name: str, expected_value: bool + ): + ctx = _FileTypeDetectionContext(example_doc_path(file_name)) + assert _TextFileDifferentiator(ctx)._is_eml is expected_value + + # -- ._is_json ---------------------------------------------- + + @pytest.mark.parametrize( + ("content", "expected_value"), + [ + (b"d\xe2\x80", False), + (b'[{"key": "value"}]', True), 
+ (b"", False), + # -- valid JSON, but not for our purposes -- + (b'"This is not a JSON"', False), + ], + ) + def it_distinguishes_a_JSON_file_from_other_text_files( + self, content: bytes, expected_value: bool + ): + ctx = _FileTypeDetectionContext(file=io.BytesIO(content)) + differentiator = _TextFileDifferentiator(ctx) + + assert differentiator._is_json is expected_value + + +class Describe_ZipFileDifferentiator: + """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`.""" + + # -- .applies() --------------------------------------------- + + def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): + """The constructor determines whether this differentiator is applicable. + + It returns an instance only when differentiating a zip file-type is required, which it can + judge from the mime-type provided by the context (`ctx`). + """ + ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) + + differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip") + + assert isinstance(differentiator, _ZipFileDifferentiator) + + def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self): + ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) + assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None + + # -- .file_type --------------------------------------------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), + [ + ("simple.docx", FileType.DOCX), + ("picture.pptx", FileType.PPTX), + ("vodafone.xlsx", FileType.XLSX), + ("simple.zip", FileType.ZIP), + ("README.org", None), + ], + ) + def it_distinguishes_the_file_type_of_applicable_zip_files( + self, file_name: str, expected_value: FileType | None + ): + ctx = _FileTypeDetectionContext(example_doc_path(file_name)) + differentiator = _ZipFileDifferentiator(ctx) + + assert differentiator.file_type is expected_value diff --git 
a/test_unstructured/file_utils/test_model.py b/test_unstructured/file_utils/test_model.py index 91d2b8bec..98088ee75 100644 --- a/test_unstructured/file_utils/test_model.py +++ b/test_unstructured/file_utils/test_model.py @@ -10,6 +10,14 @@ from unstructured.file_utils.model import FileType class DescribeFileType: """Unit-test suite for `unstructured.file_utils.model.Filetype`.""" + # -- .__lt__() ---------------------------------------------- + + def it_is_a_collection_ordered_by_name_and_can_be_sorted(self): + """FileType is a total order on name, e.g. FileType.A < FileType.B.""" + assert FileType.EML < FileType.HTML < FileType.XML + + # -- .from_extension() -------------------------------------- + @pytest.mark.parametrize( ("ext", "file_type"), [ @@ -23,10 +31,12 @@ class DescribeFileType: def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType | None): assert FileType.from_extension(ext) is file_type - @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."]) - def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str): + @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", ".", None]) + def but_not_when_that_extension_is_empty_or_None_or_not_registered(self, ext: str | None): assert FileType.from_extension(ext) is None + # -- .from_mime_type() -------------------------------------- + @pytest.mark.parametrize( ("mime_type", "file_type"), [ @@ -46,29 +56,13 @@ class DescribeFileType: ): assert FileType.from_mime_type(mime_type) is file_type - @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"]) - def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str): + @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None]) + def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None( + self, mime_type: str | None + ): assert FileType.from_mime_type(mime_type) is None - 
@pytest.mark.parametrize( - ("file_type", "expected_value"), - [ - (FileType.BMP, ("unstructured_inference",)), - (FileType.CSV, ("pandas",)), - (FileType.DOC, ("docx",)), - (FileType.EMPTY, ()), - (FileType.HTML, ()), - (FileType.ODT, ("docx", "pypandoc")), - (FileType.PDF, ("pdf2image", "pdfminer", "PIL")), - (FileType.UNK, ()), - (FileType.WAV, ()), - (FileType.ZIP, ()), - ], - ) - def it_knows_which_importable_packages_its_partitioner_depends_on( - self, file_type: FileType, expected_value: tuple[str, ...] - ): - assert file_type.importable_package_dependencies == expected_value + # -- .extra_name -------------------------------------------- @pytest.mark.parametrize( ("file_type", "expected_value"), @@ -91,6 +85,30 @@ class DescribeFileType: ): assert file_type.extra_name == expected_value + # -- .importable_package_dependencies ----------------------- + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, ("unstructured_inference",)), + (FileType.CSV, ("pandas",)), + (FileType.DOC, ("docx",)), + (FileType.EMPTY, ()), + (FileType.HTML, ()), + (FileType.ODT, ("docx", "pypandoc")), + (FileType.PDF, ("pdf2image", "pdfminer", "PIL")), + (FileType.UNK, ()), + (FileType.WAV, ()), + (FileType.ZIP, ()), + ], + ) + def it_knows_which_importable_packages_its_partitioner_depends_on( + self, file_type: FileType, expected_value: tuple[str, ...] 
+ ): + assert file_type.importable_package_dependencies == expected_value + + # -- .is_partitionable -------------------------------------- + @pytest.mark.parametrize( ("file_type", "expected_value"), [ @@ -112,6 +130,8 @@ class DescribeFileType: ): assert file_type.is_partitionable is expected_value + # -- .mime_type --------------------------------------------- + @pytest.mark.parametrize( ("file_type", "mime_type"), [ @@ -131,6 +151,8 @@ class DescribeFileType: def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str): assert file_type.mime_type == mime_type + # -- .partitioner_function_name ----------------------------- + @pytest.mark.parametrize( ("file_type", "expected_value"), [ @@ -155,6 +177,8 @@ class DescribeFileType: with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "): file_type.partitioner_function_name + # -- .partitioner_module_qname ------------------------------ + @pytest.mark.parametrize( ("file_type", "expected_value"), [ @@ -181,6 +205,8 @@ class DescribeFileType: with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "): file_type.partitioner_module_qname + # -- .partitioner_shortname --------------------------------- + @pytest.mark.parametrize( ("file_type", "expected_value"), [ diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index 54939af0b..183efb8c6 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -1,6 +1,10 @@ +from __future__ import annotations + import pytest +from test_unstructured.unit_utils import example_doc_path from unstructured.metrics.element_type import ( + FrequencyDict, calculate_element_type_percent_match, get_element_type_frequency, ) @@ -14,10 +18,9 @@ from unstructured.staging.base import elements_to_json ( "fake-email.txt", { - ("UncategorizedText", None): 6, + ("NarrativeText", None): 1, + ("Title", 
None): 1, ("ListItem", None): 2, - ("Title", None): 5, - ("NarrativeText", None): 2, }, ), ( @@ -34,8 +37,8 @@ from unstructured.staging.base import elements_to_json ), ], ) -def test_get_element_type_frequency(filename, frequency): - elements = partition(filename=f"example-docs/{filename}") +def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, int | None], int]): + elements = partition(example_doc_path(filename)) elements_freq = get_element_type_frequency(elements_to_json(elements)) assert elements_freq == frequency @@ -46,11 +49,11 @@ def test_get_element_type_frequency(filename, frequency): ( "fake-email.txt", { - ("UncategorizedText", None): 14, + ("Title", None): 1, ("ListItem", None): 2, ("NarrativeText", None): 2, }, - (0.56, 0.56, 0.56), + (0.8, 0.8, 0.80), ), ( "sample-presentation.pptx", @@ -92,8 +95,10 @@ def test_get_element_type_frequency(filename, frequency): ), ], ) -def test_calculate_element_type_percent_match(filename, expected_frequency, percent_matched): - elements = partition(filename=f"example-docs/{filename}") +def test_calculate_element_type_percent_match( + filename: str, expected_frequency: FrequencyDict, percent_matched: tuple[float, float, float] +): + elements = partition(example_doc_path(filename)) elements_frequency = get_element_type_frequency(elements_to_json(elements)) assert ( round(calculate_element_type_percent_match(elements_frequency, expected_frequency), 2) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index a09d45f2d..3e3d4c6b9 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -929,7 +929,11 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest): partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES) detect_filetype_.assert_called_once_with( - content_type=None, encoding=None, file=None, file_filename=None, filename="made-up.fake" + file_path="made-up.fake", + 
file=None, + encoding=None, + content_type=None, + metadata_file_path=None, ) @@ -1305,7 +1309,7 @@ def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_in ) match = r"partition_pdf\(\) is not available because one or more dependencies are not installed" with pytest.raises(ImportError, match=match): - partition(example_doc_path("layout-parser-paper-fast.pdf")) + partition(example_doc_path("pdf/layout-parser-paper-fast.pdf")) dependency_exists_.assert_called_once_with("pdf2image") diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 5b08a23e8..f5a5e0b56 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -9,8 +9,8 @@ import tempfile import pytest from pytest_mock import MockFixture +from test_unstructured.unit_utils import example_doc_path from unstructured.documents.elements import CompositeElement -from unstructured.file_utils.filetype import detect_filetype from unstructured.file_utils.model import FileType from unstructured.partition.email import partition_email from unstructured.partition.html import partition_html @@ -43,9 +43,9 @@ def test_it_chunks_elements_when_a_chunking_strategy_is_specified(): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_filename(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -72,9 +72,9 @@ def test_partition_json_from_filename(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_filename_with_metadata_filename(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] 
- filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -97,9 +97,9 @@ def test_partition_json_from_filename_with_metadata_filename(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_file(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -126,9 +126,9 @@ def test_partition_json_from_file(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_file_with_metadata_filename(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -150,9 +150,9 @@ def test_partition_json_from_file_with_metadata_filename(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_text(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -192,9 +192,9 @@ def test_partition_json_works_with_empty_list(): def test_partition_json_raises_with_too_many_specified(): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") + path = 
example_doc_path("fake-text.txt") elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -225,9 +225,9 @@ def test_partition_json_raises_with_too_many_specified(): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_filename_exclude_metadata(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -249,9 +249,9 @@ def test_partition_json_from_filename_exclude_metadata(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_file_exclude_metadata(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -274,9 +274,9 @@ def test_partition_json_from_file_exclude_metadata(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_text_exclude_metadata(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 648e21503..d9462c4cf 100644 --- a/unstructured/__version__.py +++ 
b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev1" # pragma: no cover +__version__ = "0.15.1-dev2" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index c11d81c6c..5e930c366 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -1,18 +1,48 @@ +"""Automatically detect file-type based on inspection of the file's contents. + +Auto-detection proceeds via a sequence of strategies. The first strategy to confidently determine a +file-type returns that value. A strategy that is not applicable, either because it lacks the input +required or fails to determine a file-type, returns `None` and execution continues with the next +strategy. + +`_FileTypeDetector` is the main object and implements the three strategies. + +The three strategies are: + +- Use MIME-type asserted by caller in the `content_type` argument. +- Guess a MIME-type using libmagic, falling back to the `filetype` package when libmagic is + unavailable. +- Map filename-extension to a `FileType` member. + +A file that fails all three strategies is assigned the value `FileType.UNK`, for "unknown". + +`_FileTypeDetectionContext` encapsulates the various arguments received by `detect_filetype()` and +provides values derived from them. This object is immutable and can be passed to delegates of +`_FileTypeDetector` to provide whatever context they need on the current detection instance. + +`_FileTypeDetector` delegates to _differentiator_ objects like `_ZipFileDifferentiator` for +specialized discrimination and/or confirmation of ambiguous or frequently mis-identified +MIME-types. Additional differentiators are planned, one for `application/x-ole-storage` +(DOC, PPT, XLS, and MSG file-types) and perhaps others. 
+""" + from __future__ import annotations +import contextlib import functools import importlib.util import json import os import re import zipfile -from typing import IO, Callable, List, Optional +from typing import IO, Callable, Iterator, Optional +import filetype as ft from typing_extensions import ParamSpec from unstructured.documents.elements import Element from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str -from unstructured.file_utils.model import PLAIN_TEXT_EXTENSIONS, FileType +from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( @@ -21,179 +51,53 @@ from unstructured.partition.common import ( remove_element_metadata, set_element_hierarchy, ) -from unstructured.utils import get_call_args_applying_defaults +from unstructured.utils import get_call_args_applying_defaults, lazyproperty LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) def detect_filetype( - filename: Optional[str] = None, - content_type: Optional[str] = None, - file: Optional[IO[bytes]] = None, - file_filename: Optional[str] = None, - encoding: Optional[str] = "utf-8", + file_path: str | None = None, + file: IO[bytes] | None = None, + encoding: str | None = None, + content_type: str | None = None, + metadata_file_path: Optional[str] = None, ) -> FileType: - """Use libmagic to determine a file's type. + """Determine file-type of specified file using libmagic and/or fallback methods. - Helps determine which partition brick to use for a given file. A return value of None indicates - a non-supported file type. + One of `file_path` or `file` must be specified. A `file_path` that does not + correspond to a file on the filesystem raises `FileNotFoundError`. + + Args: + content_type: MIME-type of document-source, when already known.
Providing + a value for this argument disables auto-detection unless it does not map + to a FileType member or is ambiguous, in which case it is ignored. + encoding: Only used for textual file-types. When omitted, `utf-8` is + assumed. Should generally be omitted except to resolve a problem with + textual file-types like HTML. + metadata_file_path: Only used when `file` is provided and then only as a + source for a filename-extension that may be needed as a secondary + content-type indicator. Ignored when the document is specified using + `file_path`. + + Returns: + A member of the `FileType` enumeration, `FileType.UNK` when the file type + could not be determined or is not supported. + + Raises: + FileNotFoundError: when `file_path` is specified but does not correspond to a + file on the filesystem. + ValueError: when neither `file_path` nor `file` + were specified. """ - mime_type = None - exactly_one(filename=filename, file=file) - - # first check (content_type) - if content_type: - file_type = FileType.from_mime_type(content_type) - if file_type: - return file_type - - # second check (filename/file_name/file) - # continue if successfully define mime_type - if filename or file_filename: - _filename = filename or file_filename or "" - _, extension = os.path.splitext(_filename) - extension = extension.lower() - if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE: - import magic - - mime_type = magic.from_file(_resolve_symlink(_filename), mime=True) - elif os.path.isfile(_filename): - import filetype as ft - - mime_type = ft.guess_mime(_filename) - if mime_type is None: - return FileType.from_extension(extension) or FileType.UNK - - elif file is not None: - if hasattr(file, "name"): - _, extension = os.path.splitext(file.name) - else: - extension = "" - extension = extension.lower() - # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes - # Increased to 4096 because otherwise .xlsx files get detected as a zip file - # ref:
https://github.com/ahupp/python-magic#usage - if LIBMAGIC_AVAILABLE: - import magic - - mime_type = magic.from_buffer(file.read(4096), mime=True) - else: - import filetype as ft - - mime_type = ft.guess_mime(file.read(4096)) - if mime_type is None: - logger.warning( - "libmagic is unavailable but assists in filetype detection on file-like objects. " - "Please consider installing libmagic for better results.", - ) - return FileType.from_extension(extension) or FileType.UNK - - else: - raise ValueError("No filename, file, nor file_filename were specified.") - - """Mime type special cases.""" - # third check (mime_type) - - # NOTE(Crag): older magic lib does not differentiate between xls and doc - if mime_type == "application/msword" and extension == ".xls": - return FileType.XLS - - elif mime_type.endswith("xml"): - if extension == ".html" or extension == ".htm": - return FileType.HTML - else: - return FileType.XML - - # -- ref: https://www.rfc-editor.org/rfc/rfc822 -- - elif mime_type == "message/rfc822" or mime_type.startswith("text"): - if not encoding: - encoding = "utf-8" - formatted_encoding = format_encoding_str(encoding) - - if extension in [ - ".eml", - ".p7s", - ".md", - ".rtf", - ".html", - ".rst", - ".org", - ".csv", - ".tsv", - ".json", - ]: - return FileType.from_extension(extension) or FileType.TXT - - # NOTE(crag): for older versions of the OS libmagic package, such as is currently - # installed on the Unstructured docker image, .json files resolve to "text/plain" - # rather than "application/json". this corrects for that case. 
- if _is_text_file_a_json( - file=file, - filename=filename, - encoding=formatted_encoding, - ): - return FileType.JSON - - if _is_text_file_a_csv( - file=file, - filename=filename, - encoding=formatted_encoding, - ): - return FileType.CSV - - if file and _check_eml_from_buffer(file=file) is True: - return FileType.EML - - if extension in PLAIN_TEXT_EXTENSIONS: - return FileType.from_extension(extension) or FileType.UNK - - # Safety catch - if file_type := FileType.from_mime_type(mime_type): - return file_type - - return FileType.TXT - - elif mime_type == "application/octet-stream": - if extension == ".docx": - return FileType.DOCX - elif file: - return _detect_filetype_from_octet_stream(file=file) - else: - return FileType.from_extension(extension) or FileType.UNK - - elif mime_type == "application/zip": - file_type = FileType.UNK - if file: - file_type = _detect_filetype_from_octet_stream(file=file) - elif filename is not None: - with open(filename, "rb") as f: - file_type = _detect_filetype_from_octet_stream(file=f) - - extension = extension if extension else "" - return ( - FileType.ZIP - if file_type in (FileType.UNK, FileType.ZIP) - else FileType.from_extension(extension) or file_type - ) - - elif _is_code_mime_type(mime_type): - # NOTE(robinson) - we'll treat all code files as plain text for now. - # we can update this logic and add filetypes for specific languages - # later if needed. - return FileType.TXT - - elif mime_type.endswith("empty"): - return FileType.EMPTY - - # For everything else - elif file_type := FileType.from_mime_type(mime_type): - return file_type - - logger.warning( - f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. 
" - "This file type is not currently supported in unstructured.", + ctx = _FileTypeDetectionContext.new( + file_path=file_path, + file=file, + encoding=encoding, + content_type=content_type, + metadata_file_path=metadata_file_path, ) - return FileType.from_extension(extension) or FileType.UNK + return _FileTypeDetector.file_type(ctx) def is_json_processable( @@ -217,124 +121,476 @@ def is_json_processable( return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None -def _check_eml_from_buffer(file: IO[bytes] | IO[str]) -> bool: - """Checks if a text/plain file is actually a .eml file. +class _FileTypeDetector: + """Determines file type from a variety of possible inputs.""" - Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml - file. + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx + + @classmethod + def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType: + """Detect file-type of document-source described by `ctx`.""" + return cls(ctx)._file_type + + @property + def _file_type(self) -> FileType: + """FileType member corresponding to this document source.""" + # -- strategy 1: use content-type asserted by caller -- + if file_type := self._file_type_from_content_type: + return file_type + + # -- strategy 2: guess MIME-type using libmagic and use that -- + if file_type := self._file_type_from_guessed_mime_type: + return file_type + + # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX -- + if file_type := self._file_type_from_file_extension: + return file_type + + # -- strategy 4: give up and report FileType.UNK -- + return FileType.UNK + + # == STRATEGIES ============================================================ + + @property + def _file_type_from_content_type(self) -> FileType | None: + """Map passed content-type argument to a file-type, subject to certain rules.""" + content_type = self._ctx.content_type + + # -- when no content-type was asserted by caller, this strategy 
is not applicable -- + if not content_type: + return None + + # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- + return FileType.from_mime_type(content_type) + + @property + def _file_type_from_guessed_mime_type(self) -> FileType | None: + """FileType based on auto-detection of MIME-type by libmagic. + + In some cases refinements are necessary on the magic-derived MIME-types. This process + includes applying those rules, most of which are accumulated through practical experience. + """ + mime_type = self._ctx.mime_type + extension = self._ctx.extension + + # -- when libmagic is not installed, the `filetype` package is used instead. + # -- `filetype.guess()` returns `None` for file-types it does not support, which + # -- unfortunately includes all the textual file-types like CSV, EML, HTML, MD, RST, RTF, + # -- TSV, and TXT. When we have no guessed MIME-type, this strategy is not applicable. + if mime_type is None: + return None + + # NOTE(Crag): older magic lib does not differentiate between xls and doc + if mime_type == "application/msword" and extension == ".xls": + return FileType.XLS + + if mime_type.endswith("xml"): + return FileType.HTML if extension in (".html", ".htm") else FileType.XML + + if differentiator := _TextFileDifferentiator.applies(self._ctx): + return differentiator.file_type + + # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+ + # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT + # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and + # -- octet-stream MIME-types are fed in because they are ambiguous. The MS-Office types are + # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type + # -- is actually a PPTX file etc. 
+ if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type): + return differentiator.file_type + + # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment -- + if self._ctx.has_code_mime_type: + return FileType.TXT + + if mime_type.endswith("empty"): + return FileType.EMPTY + + # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present -- + if file_type := FileType.from_mime_type(mime_type): + return file_type + + logger.warning( + f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is" + f" {mime_type!r}. This file type is not currently supported in unstructured.", + ) + return None + + @lazyproperty + def _file_type_from_file_extension(self) -> FileType | None: + """Determine file-type from filename extension. + + Returns `None` when no filename is available or when the extension does not map to a + supported file-type. + """ + return FileType.from_extension(self._ctx.extension) + + +class _FileTypeDetectionContext: + """Provides all arguments to auto-file detection and values derived from them. + + This keeps computation of derived values out of the file-detection code but more importantly + allows the main filetype-detector to pass the full context to any delegates without coupling + itself to which values it might need. 
""" - file.seek(0) - file_content = file.read(4096) - if isinstance(file_content, bytes): - file_head = file_content.decode("utf-8", errors="ignore") - else: - file_head = file_content - return EMAIL_HEAD_RE.match(file_head) is not None + + def __init__( + self, + file_path: str | None = None, + *, + file: IO[bytes] | None = None, + encoding: str | None = None, + content_type: str | None = None, + metadata_file_path: str | None = None, + ): + self._file_path = file_path + self._file_arg = file + self._encoding_arg = encoding + self._content_type = content_type + self._metadata_file_path = metadata_file_path + + @classmethod + def new( + cls, + *, + file_path: str | None, + file: IO[bytes] | None, + encoding: str | None, + content_type: str | None, + metadata_file_path: str | None, + ): + self = cls( + file_path=file_path, + file=file, + encoding=encoding, + content_type=content_type, + metadata_file_path=metadata_file_path, + ) + self._validate() + return self + + @lazyproperty + def content_type(self) -> str | None: + """MIME-type asserted by caller; not based on inspection of file by this process. + + Would commonly occur when the file was downloaded via HTTP and a `"Content-Type:` header was + present on the response. These are often ambiguous and sometimes just wrong so get some + further verification. All lower-case when not `None`. + """ + return self._content_type.lower() if self._content_type else None + + @lazyproperty + def encoding(self) -> str: + """Character-set used to encode text of this file. + + Relevant for textual file-types only, like HTML, TXT, JSON, etc. 
+ """ + return format_encoding_str(self._encoding_arg or "utf-8") + + @lazyproperty + def extension(self) -> str: + """Best filename-extension we can muster, "" when there is no available source.""" + # -- get from file_path, or file when it has a name (path) -- + with self.open() as file: + if hasattr(file, "name") and file.name: + return os.path.splitext(file.name)[1].lower() + + # -- otherwise use metadata file-path when provided -- + if file_path := self._metadata_file_path: + return os.path.splitext(file_path)[1].lower() + + # -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" -- + return "" + + @lazyproperty + def file_head(self) -> bytes: + """The initial bytes of the file to be recognized, for use with libmagic detection.""" + with self.open() as file: + return file.read(4096) + + @lazyproperty + def file_path(self) -> str | None: + """Filesystem path to file to be inspected, when provided on call. + + None when the caller specified the source as a file-like object instead. Useful for user + feedback on an error, but users of context should have little use for it otherwise. + """ + return self._file_path + + @lazyproperty + def is_zipfile(self) -> bool: + """True when file is a Zip archive.""" + with self.open() as file: + return zipfile.is_zipfile(file) + + @lazyproperty + def has_code_mime_type(self) -> bool: + """True when `mime_type` plausibly indicates a programming language source-code file.""" + mime_type = self.mime_type + + if mime_type is None: + return False + + # -- check Go separately to avoid matching other MIME type containing "go" -- + if mime_type == "text/x-go": + return True + + return any( + lang in mime_type + for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split() + ) + + @lazyproperty + def mime_type(self) -> str | None: + """The best MIME-type we can get from `magic` (or `filetype` package). + + A `str` return value is always in lower-case. 
+ """ + if LIBMAGIC_AVAILABLE: + import magic + + mime_type = ( + magic.from_file(_resolve_symlink(self._file_path), mime=True) + if self._file_path + else magic.from_buffer(self.file_head, mime=True) + ) + return mime_type.lower() if mime_type else None + + mime_type = ( + ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head) + ) + + if mime_type is None: + logger.warning( + "libmagic is unavailable but assists in filetype detection. Please consider" + " installing libmagic for better results." + ) + return None + + return mime_type.lower() + + @contextlib.contextmanager + def open(self) -> Iterator[IO[bytes]]: + """Encapsulates complexity of dealing with file-path or file-like-object. + + Provides an `IO[bytes]` object as the "common-denominator" document source. + + Must be used as a context manager using a `with` statement: + + with self.open() as file: + do things with file + + File is guaranteed to be at read position 0 when called. + """ + if self._file_path: + with open(self._file_path, "rb") as f: + yield f + else: + file = self._file_arg + assert file is not None # -- guaranteed by `._validate()` -- + file.seek(0) + yield file + + @lazyproperty + def text_head(self) -> str: + """The initial characters of the text file for use with text-format differentiation. + + Raises: + UnicodeDecodeError if file cannot be read as text. + """ + # TODO: only attempts fallback character-set detection for file-path case, not for + # file-like object case. Seems like we should do both.
+ + if file := self._file_arg: + file.seek(0) + content = file.read(4096) + file.seek(0) + return ( + content + if isinstance(content, str) + else content.decode(encoding=self.encoding, errors="ignore") + ) + + file_path = self._file_path + assert file_path is not None # -- guaranteed by `._validate` -- + + try: + with open(file_path, encoding=self.encoding) as f: + return f.read(4096) + except UnicodeDecodeError: + encoding, _ = detect_file_encoding(filename=file_path) + with open(file_path, encoding=encoding) as f: + return f.read(4096) + + def _validate(self) -> None: + """Raise if the context is invalid.""" + if self._file_path and not os.path.isfile(self._file_path): + raise FileNotFoundError(f"no such file {self._file_path}") + if not self._file_path and not self._file_arg: + raise ValueError("either `file_path` or `file` argument must be provided") -def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType: - """Detects the filetype, given a file with an application/octet-stream MIME type.""" - file.seek(0) - if zipfile.is_zipfile(file): - file.seek(0) - archive = zipfile.ZipFile(file) +class _TextFileDifferentiator: + """Refine a textual file-type that may not be as specific as it could be.""" - # NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension. - # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by - # looking for expected filenames within the zip file. 
- archive_filenames = [f.filename for f in archive.filelist] - if all(f in archive_filenames for f in ("docProps/core.xml", "word/document.xml")): - return FileType.DOCX - elif all(f in archive_filenames for f in ("xl/workbook.xml",)): - return FileType.XLSX - elif all(f in archive_filenames for f in ("docProps/core.xml", "ppt/presentation.xml")): - return FileType.PPTX + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx - if LIBMAGIC_AVAILABLE: - import magic + @classmethod + def applies(cls, ctx: _FileTypeDetectionContext) -> _TextFileDifferentiator | None: + """Constructs an instance, but only if this differentiator applies in `ctx`.""" + mime_type = ctx.mime_type + return ( + cls(ctx) + if mime_type and (mime_type == "message/rfc822" or mime_type.startswith("text")) + else None + ) - # Infer mime type using magic if octet-stream is not zip file - mime_type = magic.from_buffer(file.read(4096), mime=True) - return FileType.from_mime_type(mime_type) or FileType.UNK - logger.warning( - "Could not detect the filetype from application/octet-stream MIME type.", - ) - return FileType.UNK + @lazyproperty + def file_type(self) -> FileType: + """Differentiated file-type for textual content. + + Always produces a file-type, worst case that's `FileType.TXT` when nothing more specific + applies. + """ + extension = self._ctx.extension + + if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split(): + return FileType.from_extension(extension) or FileType.TXT + + # NOTE(crag): for older versions of the OS libmagic package, such as is currently + # installed on the Unstructured docker image, .json files resolve to "text/plain" + # rather than "application/json". this corrects for that case. 
+ if self._is_json: + return FileType.JSON + + if self._is_csv: + return FileType.CSV + + if self._is_eml: + return FileType.EML + + if extension in (".text", ".txt"): + return FileType.TXT + + # Safety catch + if file_type := FileType.from_mime_type(self._ctx.mime_type): + return file_type + + return FileType.TXT + + @lazyproperty + def _is_csv(self) -> bool: + """True when file is plausibly in Comma Separated Values (CSV) format.""" + + def count_commas(text: str): + """Counts the number of commas in a line, excluding commas in quotes.""" + pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)," + matches = re.findall(pattern, text) + return len(matches) + + lines = self._ctx.text_head.strip().splitlines() + if len(lines) < 2: + return False + # -- check at most the first 10 lines -- + lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] + # -- any lines without at least one comma disqualifies the file -- + if any("," not in line for line in lines): + return False + header_count = count_commas(lines[0]) + return all(count_commas(line) == header_count for line in lines[1:]) + + @lazyproperty + def _is_eml(self) -> bool: + """Checks if a text/plain file is actually a .eml file. + + Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml + file. + """ + return EMAIL_HEAD_RE.match(self._ctx.text_head) is not None + + @lazyproperty + def _is_json(self) -> bool: + """True when file is JSON collection. + + A JSON file that contains only a string, number, or boolean, while valid JSON, will fail + this test since it is not partitionable. 
+ """ + text_head = self._ctx.text_head + + # -- an empty file is not JSON -- + if not text_head: + return False + + # -- has to be a list or object, no string, number, or bool -- + if text_head.lstrip()[0] not in "[{": + return False + + try: + with self._ctx.open() as file: + json.load(file) + return True + except json.JSONDecodeError: + return False -def _is_code_mime_type(mime_type: str) -> bool: - """True when `mime_type` plausibly indicates a programming language source-code file.""" - PROGRAMMING_LANGUAGES = [ - "javascript", - "python", - "java", - "c++", - "cpp", - "csharp", - "c#", - "php", - "ruby", - "swift", - "typescript", - ] - mime_type = mime_type.lower() - # NOTE(robinson) - check this one explicitly to avoid conflicts with other - # MIME types that contain "go" - if mime_type == "text/x-go": - return True - return any(language in mime_type for language in PROGRAMMING_LANGUAGES) +class _ZipFileDifferentiator: + """Refine a Zip-packaged file-type that may be ambiguous or swapped.""" + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx -def _is_text_file_a_csv( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - encoding: Optional[str] = "utf-8", -): - """Detects if a file that has a text/plain MIME type is a CSV file.""" + @classmethod + def applies( + cls, ctx: _FileTypeDetectionContext, mime_type: str + ) -> _ZipFileDifferentiator | None: + """Constructs an instance, but only if this differentiator applies for `mime_type`. - def count_commas(text: str): - """Counts the number of commas in a line, excluding commas in quotes.""" - pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)," - matches = re.findall(pattern, text) - return len(matches) + Separate `mime_type` argument allows it to be applied to either asserted content-type or + guessed mime-type. 
+ """ + return ( + cls(ctx) + if mime_type + in ( + "application/octet-stream", + "application/zip", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ) + else None + ) - file_text = _read_file_start_for_type_check( - file=file, - filename=filename, - encoding=encoding, - ) - lines = file_text.strip().splitlines() - if len(lines) < 2: - return False - lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] - header_count = count_commas(lines[0]) - if any("," not in line for line in lines): - return False - return all(count_commas(line) == header_count for line in lines[1:]) + @lazyproperty + def file_type(self) -> FileType | None: + """Differentiated file-type for a Zip archive. + Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, + `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise. + """ + if not self._ctx.is_zipfile: + return None -def _is_text_file_a_json( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - encoding: Optional[str] = "utf-8", -): - """Detects if a file that has a text/plain MIME type is a JSON file.""" - file_text = _read_file_start_for_type_check( - file=file, - filename=filename, - encoding=encoding, - ) - try: - output = json.loads(file_text) - # NOTE(robinson) - Per RFC 4627 which defines the application/json media type, - # a string is a valid JSON. For our purposes, however, we want to treat that - # as a text file even if it is serializable as json. 
- # References: - # https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json - # https://www.ietf.org/rfc/rfc4627.txt - return not isinstance(output, str) - except json.JSONDecodeError: - return False + with self._ctx.open() as file: + zip = zipfile.ZipFile(file) + + # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xlsx + # extension. If the MIME type is application/octet-stream, we check if it's a + # .docx/.xlsx file by looking for expected filenames within the zip file. + filenames = [f.filename for f in zip.filelist] + + if all(f in filenames for f in ("word/document.xml",)): + return FileType.DOCX + + if all(f in filenames for f in ("xl/workbook.xml",)): + return FileType.XLSX + + if all(f in filenames for f in ("ppt/presentation.xml",)): + return FileType.PPTX + + return FileType.ZIP def _read_file_start_for_type_check( @@ -379,9 +635,9 @@ def _resolve_symlink(file_path: str) -> str: _P = ParamSpec("_P") -def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: +def add_metadata(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: @functools.wraps(func) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: elements = func(*args, **kwargs) call_args = get_call_args_applying_defaults(func, *args, **kwargs) include_metadata = call_args.get("include_metadata", True) @@ -412,7 +668,7 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element def add_filetype( filetype: FileType, -) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: +) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: """Post-process element-metadata for list[Element] from partitioning. This decorator adds a post-processing step to a document partitioner.
@@ -423,9 +679,9 @@ def add_filetype( """ - def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: + def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: @functools.wraps(func) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: elements = func(*args, **kwargs) params = get_call_args_applying_defaults(func, *args, **kwargs) include_metadata = params.get("include_metadata", True) @@ -447,10 +703,10 @@ def add_filetype( def add_metadata_with_filetype( filetype: FileType, -) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: +) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: """...""" - def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: + def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: return add_filetype(filetype=filetype)(add_metadata(func)) return decorator diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py index 6c285a704..0fe0caa63 100644 --- a/unstructured/file_utils/model.py +++ b/unstructured/file_utils/model.py @@ -76,12 +76,14 @@ class FileType(enum.Enum): return None @classmethod - def from_mime_type(cls, mime_type: str) -> FileType | None: + def from_mime_type(cls, mime_type: str | None) -> FileType | None: """Select a FileType member based on a MIME-type. Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a `FileType` member or one of its alias MIME-types. """ + if mime_type is None: + return None # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids # -- limitations on defining a class variable on an Enum. 
for m in cls.__members__.values(): @@ -434,6 +436,3 @@ class FileType(enum.Enum): "inode/x-empty", cast(list[str], []), ) - - -PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split() diff --git a/unstructured/metrics/element_type.py b/unstructured/metrics/element_type.py index 3e4e8cbf8..6511900a4 100644 --- a/unstructured/metrics/element_type.py +++ b/unstructured/metrics/element_type.py @@ -1,10 +1,23 @@ +from __future__ import annotations + import json -from typing import Dict, Optional, Tuple, Union + +from typing_extensions import TypeAlias + +FrequencyDict: TypeAlias = "dict[tuple[str, int | None], int]" +"""Like: + { + ("ListItem", 0): 2, + ("NarrativeText", None): 2, + ("Title", 0): 5, + ("UncategorizedText", None): 6, + } +""" def get_element_type_frequency( elements: str, -) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]: +) -> FrequencyDict: """ Calculate the frequency of Element Types from a list of elements. @@ -13,7 +26,7 @@ def get_element_type_frequency( Returns: Element type and its frequency in dictionary format. """ - frequency: Dict = {} + frequency: dict[tuple[str, int | None], int] = {} if len(elements) == 0: return frequency for element in json.loads(elements): @@ -28,14 +41,14 @@ def get_element_type_frequency( def calculate_element_type_percent_match( - output: Dict, - source: Dict, + output: FrequencyDict, + source: FrequencyDict, category_depth_weight: float = 0.5, ) -> float: - """ - Calculate the percent match between two frequency dictionary. Intended to use with - `get_element_type_frequency` function. The function counts the absolute exact match - (type and depth), and counts the weighted match (correct type but different depth), + """Calculate the percent match between two frequency dictionaries. + + Intended to use with `get_element_type_frequency` function. 
The function counts the absolute + exact match (type and depth), and counts the weighted match (correct type but different depth), then normalized with source's total elements. """ if len(output) == 0 or len(source) == 0: @@ -46,8 +59,8 @@ def calculate_element_type_percent_match( total_source_element_count = 0 total_match_element_count = 0 - unmatched_depth_output = {} - unmatched_depth_source = {} + unmatched_depth_output: dict[str, int] = {} + unmatched_depth_source: dict[str, int] = {} # loop through the output list to find match with source for k, _ in output_copy.items(): @@ -80,12 +93,12 @@ def calculate_element_type_percent_match( return min(max(total_match_element_count / total_source_element_count, 0.0), 1.0) -def _convert_to_frequency_without_depth(d: Dict) -> Dict: +def _convert_to_frequency_without_depth(d: FrequencyDict) -> dict[str, int]: """ Takes in element frequency with depth of format (type, depth): value and converts to dictionary without depth of format type: value """ - res = {} + res: dict[str, int] = {} for k, v in d.items(): element_type = k[0] if element_type not in res: diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index b7cad8055..21c15d2f4 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -184,11 +184,11 @@ def partition( "The headers kwarg will be ignored.", ) file_type = detect_filetype( - filename=filename, + file_path=filename, file=file, - file_filename=metadata_filename, - content_type=content_type, encoding=encoding, + content_type=content_type, + metadata_file_path=metadata_filename, ) if file is not None: @@ -471,12 +471,13 @@ def file_and_type_from_url( response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout) file = io.BytesIO(response.content) - content_type = ( - content_type or response.headers.get("Content-Type", "").split(";")[0].strip().lower() - ) - encoding = response.headers.get("Content-Encoding", "utf-8") + if 
content_type := content_type or response.headers.get("Content-Type", None): + content_type = content_type.split(";")[0].strip().lower() - filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding) + # -- non-None when response is textual -- + encoding = response.encoding + + filetype = detect_filetype(file=file, encoding=encoding, content_type=content_type) return file, filetype diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 331c860a9..429195f68 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -133,10 +133,12 @@ def elements_to_json( filename: Optional[str] = None, indent: int = 4, encoding: str = "utf-8", -) -> Optional[str]: - """Saves a list of elements to a JSON file if filename is specified. +) -> str: + """Serialize `elements` to a JSON array. - Otherwise, return the list of elements as a string. + Also writes the JSON to `filename` if it is provided, encoded using `encoding`. + + The JSON is returned as a string. """ # -- serialize `elements` as a JSON array (str) -- precision_adjusted_elements = _fix_metadata_field_precision(elements) @@ -146,7 +148,6 @@ def elements_to_json( if filename is not None: with open(filename, "w", encoding=encoding) as f: f.write(json_str) - return None return json_str