fix: parsing for files with message/rfc822 MIME type; dir for unsupported files (#358)

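Adds the ability to process files with a `message/rfc822` MIME type; previously, such files (for example, example-docs/fake-email-header.eml) failed to parse. Also moves the example documents for unsupported file types into example-docs/unsupported/ and updates the tests that reference them.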
This commit is contained in:
Matt Robinson 2023-03-10 18:10:39 -05:00 committed by GitHub
parent 3d21b4098e
commit 30b5a4da65
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 29 additions and 16 deletions

View File

@ -1,4 +1,4 @@
## 0.5.4-dev2
## 0.5.4-dev3
### Enhancements
@ -16,6 +16,8 @@
### Fixes
* Fixes processing for text files with `message/rfc822` MIME type.
## 0.5.3
### Enhancements

View File

@ -45,8 +45,8 @@ def test_from_string(sample_document):
def test_read_with_stylesheet():
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xsl")
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
xml_document = XMLDocument.from_file(filename=filename, stylesheet=stylesheet)
doc_tree = xml_document.document_tree
@ -57,8 +57,8 @@ def test_read_with_stylesheet():
def test_read_with_stylesheet_warns_with_html_parser(caplog):
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xsl")
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
XMLDocument.from_file(filename=filename, stylesheet=stylesheet, parser=etree.HTMLParser())
assert "WARNING" in caplog.text

View File

@ -25,10 +25,10 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
("example.jpg", FileType.JPG),
("fake-text.txt", FileType.TXT),
("fake-email.eml", FileType.EML),
("factbook.xml", FileType.XML),
("unsupported/factbook.xml", FileType.XML),
("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML),
("fake-excel.xlsx", FileType.XLSX),
("unsupported/fake-excel.xlsx", FileType.XLSX),
("fake-power-point.pptx", FileType.PPTX),
],
)
@ -45,10 +45,10 @@ def test_detect_filetype_from_filename(file, expected):
("example.jpg", FileType.JPG),
("fake-text.txt", FileType.TXT),
("fake-email.eml", FileType.EML),
("factbook.xml", FileType.XML),
("unsupported/factbook.xml", FileType.XML),
("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML),
("fake-excel.xlsx", FileType.XLSX),
("unsupported/fake-excel.xlsx", FileType.XLSX),
("fake-power-point.pptx", FileType.PPTX),
],
)
@ -66,12 +66,12 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
("example.jpg", FileType.JPG),
("fake-text.txt", FileType.TXT),
("fake-email.eml", FileType.EML),
("factbook.xml", FileType.XML),
("unsupported/factbook.xml", FileType.XML),
# NOTE(robinson) - For this document, some operating systems return
# */xml and some return */html. Either is acceptable, depending on the OS.
("example-10k.html", [FileType.HTML, FileType.XML]),
("fake-html.html", FileType.HTML),
("fake-excel.xlsx", FileType.XLSX),
("unsupported/fake-excel.xlsx", FileType.XLSX),
("fake-power-point.pptx", FileType.PPTX),
],
)
@ -168,7 +168,7 @@ def test_detect_xls_file_from_mime_type(monkeypatch):
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
assert filetype == FileType.XLSX
@ -176,7 +176,7 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
filetype = detect_filetype(filename=filename)
assert filetype == FileType.XLSX
@ -222,7 +222,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch):
def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
assert filetype == FileType.XLSX

View File

@ -195,3 +195,9 @@ def test_partition_email_raises_with_invalid_content_type():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with pytest.raises(ValueError):
partition_email(filename=filename, content_source="application/json")
def test_partition_email_processes_fake_email_with_header():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
elements = partition_email(filename=filename)
assert len(elements) > 0

View File

@ -1 +1 @@
__version__ = "0.5.4-dev2" # pragma: no cover
__version__ = "0.5.4-dev3" # pragma: no cover

View File

@ -37,6 +37,11 @@ PPT_MIME_TYPES = [
"application/vnd.ms-powerpoint",
]
TXT_MIME_TYPES = [
"text/plain",
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
]
MD_MIME_TYPES = [
"text/markdown",
"text/x-markdown",
@ -175,7 +180,7 @@ def detect_filetype(
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
return FileType.MD
elif mime_type == "text/plain":
elif mime_type in TXT_MIME_TYPES:
if extension and extension == ".eml":
return FileType.EML
if extension and extension == ".md":