From 30b5a4da658497bd36e2b63fa28e1eb94b71d27b Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 10 Mar 2023 18:10:39 -0500 Subject: [PATCH] fix: parsing for files with `message/rfc822` MIME type; dir for unsupported files (#358) Adds the ability to process files with a message/rfc822 MIME type, which previously caused failures for example-docs/fake-email-header.eml. --- CHANGELOG.md | 4 +++- example-docs/{ => unsupported}/factbook.xml | 0 example-docs/{ => unsupported}/factbook.xsl | 0 .../{ => unsupported}/fake-excel.xlsx | Bin test_unstructured/documents/test_xml.py | 8 ++++---- test_unstructured/file_utils/test_filetype.py | 18 +++++++++--------- test_unstructured/partition/test_email.py | 6 ++++++ unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 7 ++++++- 9 files changed, 29 insertions(+), 16 deletions(-) rename example-docs/{ => unsupported}/factbook.xml (100%) rename example-docs/{ => unsupported}/factbook.xsl (100%) rename example-docs/{ => unsupported}/fake-excel.xlsx (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bbd0ba36..bde01ae7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.4-dev2 +## 0.5.4-dev3 ### Enhancements @@ -16,6 +16,8 @@ ### Fixes +* Fixes processing for text files with `message/rfc822` MIME type. + ## 0.5.3 ### Enhancements diff --git a/example-docs/factbook.xml b/example-docs/unsupported/factbook.xml similarity index 100% rename from example-docs/factbook.xml rename to example-docs/unsupported/factbook.xml diff --git a/example-docs/factbook.xsl b/example-docs/unsupported/factbook.xsl similarity index 100% rename from example-docs/factbook.xsl rename to example-docs/unsupported/factbook.xsl diff --git a/example-docs/fake-excel.xlsx b/example-docs/unsupported/fake-excel.xlsx similarity index 100% rename from example-docs/fake-excel.xlsx rename to example-docs/unsupported/fake-excel.xlsx diff --git a/test_unstructured/documents/test_xml.py b/test_unstructured/documents/test_xml.py index 7dddc6369..e5af54d12 100644 --- a/test_unstructured/documents/test_xml.py +++ b/test_unstructured/documents/test_xml.py @@ -45,8 +45,8 @@ def test_from_string(sample_document): def test_read_with_stylesheet(): - filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml") - stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xsl") + filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml") + stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl") xml_document = XMLDocument.from_file(filename=filename, stylesheet=stylesheet) doc_tree = xml_document.document_tree @@ -57,8 +57,8 @@ def test_read_with_stylesheet(): def test_read_with_stylesheet_warns_with_html_parser(caplog): - filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml") - stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xsl") + filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml") + stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl") XMLDocument.from_file(filename=filename, stylesheet=stylesheet, parser=etree.HTMLParser()) assert "WARNING" in caplog.text diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 8f3f1b515..2210ea9ca 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -25,10 +25,10 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs" ("example.jpg", FileType.JPG), ("fake-text.txt", FileType.TXT), ("fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), + ("unsupported/factbook.xml", FileType.XML), ("example-10k.html", FileType.HTML), ("fake-html.html", FileType.HTML), - ("fake-excel.xlsx", FileType.XLSX), + ("unsupported/fake-excel.xlsx", FileType.XLSX), ("fake-power-point.pptx", FileType.PPTX), ], ) @@ -45,10 +45,10 @@ def test_detect_filetype_from_filename(file, expected): ("example.jpg", FileType.JPG), ("fake-text.txt", FileType.TXT), ("fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), + ("unsupported/factbook.xml", FileType.XML), ("example-10k.html", FileType.HTML), ("fake-html.html", FileType.HTML), - ("fake-excel.xlsx", FileType.XLSX), + ("unsupported/fake-excel.xlsx", FileType.XLSX), ("fake-power-point.pptx", FileType.PPTX), ], ) @@ -66,12 +66,12 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte ("example.jpg", FileType.JPG), ("fake-text.txt", FileType.TXT), ("fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), + ("unsupported/factbook.xml", FileType.XML), # NOTE(robinson) - For the document, some operating systems return # */xml and some return */html. Either could be acceptable depending on the OS ("example-10k.html", [FileType.HTML, FileType.XML]), ("fake-html.html", FileType.HTML), - ("fake-excel.xlsx", FileType.XLSX), + ("unsupported/fake-excel.xlsx", FileType.XLSX), ("fake-power-point.pptx", FileType.PPTX), ], ) @@ -168,7 +168,7 @@ def test_detect_xls_file_from_mime_type(monkeypatch): def test_detect_xlsx_filetype_application_octet_stream(monkeypatch): monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream") - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx") + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx") with open(filename, "rb") as f: filetype = detect_filetype(file=f) assert filetype == FileType.XLSX @@ -176,7 +176,7 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch): def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch): monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream") - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx") + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx") filetype = detect_filetype(filename=filename) assert filetype == FileType.XLSX @@ -222,7 +222,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch): def test_detect_xlsx_filetype_word_mime_type(monkeypatch): monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0]) - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx") + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx") with open(filename, "rb") as f: filetype = detect_filetype(file=f) assert filetype == FileType.XLSX diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 5b0865cd9..60e8ff7b6 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -195,3 +195,9 @@ def test_partition_email_raises_with_invalid_content_type(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") with pytest.raises(ValueError): partition_email(filename=filename, content_source="application/json") + + +def test_partition_email_processes_fake_email_with_header(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml") + elements = partition_email(filename=filename) + assert len(elements) > 0 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 15cc6c086..e2f525b7b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.4-dev2" # pragma: no cover +__version__ = "0.5.4-dev3" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index f6ff8902f..b4dd17a1c 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -37,6 +37,11 @@ PPT_MIME_TYPES = [ "application/vnd.ms-powerpoint", ] +TXT_MIME_TYPES = [ + "text/plain", + "message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822 +] + MD_MIME_TYPES = [ "text/markdown", "text/x-markdown", @@ -175,7 +180,7 @@ def detect_filetype( # NOTE - I am not sure whether libmagic ever returns these mimetypes. return FileType.MD - elif mime_type == "text/plain": + elif mime_type in TXT_MIME_TYPES: if extension and extension == ".eml": return FileType.EML if extension and extension == ".md":