mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix: parsing for files with message/rfc822 MIME type; dir for unsupported files (#358)
Adds the ability to process files with a message/rfc822 MIME type, which previously caused failures for example-docs/fake-email-header.eml.
This commit is contained in:
parent
3d21b4098e
commit
30b5a4da65
@ -1,4 +1,4 @@
|
||||
## 0.5.4-dev2
|
||||
## 0.5.4-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -16,6 +16,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fixes processing for text files with the `message/rfc822` MIME type.
|
||||
|
||||
## 0.5.3
|
||||
|
||||
### Enhancements
|
||||
|
@ -45,8 +45,8 @@ def test_from_string(sample_document):
|
||||
|
||||
|
||||
def test_read_with_stylesheet():
|
||||
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
|
||||
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xsl")
|
||||
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
|
||||
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
|
||||
|
||||
xml_document = XMLDocument.from_file(filename=filename, stylesheet=stylesheet)
|
||||
doc_tree = xml_document.document_tree
|
||||
@ -57,8 +57,8 @@ def test_read_with_stylesheet():
|
||||
|
||||
|
||||
def test_read_with_stylesheet_warns_with_html_parser(caplog):
|
||||
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
|
||||
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xsl")
|
||||
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
|
||||
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
|
||||
|
||||
XMLDocument.from_file(filename=filename, stylesheet=stylesheet, parser=etree.HTMLParser())
|
||||
assert "WARNING" in caplog.text
|
||||
|
@ -25,10 +25,10 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
|
||||
("example.jpg", FileType.JPG),
|
||||
("fake-text.txt", FileType.TXT),
|
||||
("fake-email.eml", FileType.EML),
|
||||
("factbook.xml", FileType.XML),
|
||||
("unsupported/factbook.xml", FileType.XML),
|
||||
("example-10k.html", FileType.HTML),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("fake-excel.xlsx", FileType.XLSX),
|
||||
("unsupported/fake-excel.xlsx", FileType.XLSX),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
],
|
||||
)
|
||||
@ -45,10 +45,10 @@ def test_detect_filetype_from_filename(file, expected):
|
||||
("example.jpg", FileType.JPG),
|
||||
("fake-text.txt", FileType.TXT),
|
||||
("fake-email.eml", FileType.EML),
|
||||
("factbook.xml", FileType.XML),
|
||||
("unsupported/factbook.xml", FileType.XML),
|
||||
("example-10k.html", FileType.HTML),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("fake-excel.xlsx", FileType.XLSX),
|
||||
("unsupported/fake-excel.xlsx", FileType.XLSX),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
],
|
||||
)
|
||||
@ -66,12 +66,12 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
||||
("example.jpg", FileType.JPG),
|
||||
("fake-text.txt", FileType.TXT),
|
||||
("fake-email.eml", FileType.EML),
|
||||
("factbook.xml", FileType.XML),
|
||||
("unsupported/factbook.xml", FileType.XML),
|
||||
# NOTE(robinson) - For the document, some operating systems return
|
||||
# */xml and some return */html. Either could be acceptable depending on the OS
|
||||
("example-10k.html", [FileType.HTML, FileType.XML]),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("fake-excel.xlsx", FileType.XLSX),
|
||||
("unsupported/fake-excel.xlsx", FileType.XLSX),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
],
|
||||
)
|
||||
@ -168,7 +168,7 @@ def test_detect_xls_file_from_mime_type(monkeypatch):
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
assert filetype == FileType.XLSX
|
||||
@ -176,7 +176,7 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
|
||||
filetype = detect_filetype(filename=filename)
|
||||
assert filetype == FileType.XLSX
|
||||
|
||||
@ -222,7 +222,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch):
|
||||
|
||||
def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
assert filetype == FileType.XLSX
|
||||
|
@ -195,3 +195,9 @@ def test_partition_email_raises_with_invalid_content_type():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
with pytest.raises(ValueError):
|
||||
partition_email(filename=filename, content_source="application/json")
|
||||
|
||||
|
||||
def test_partition_email_processes_fake_email_with_header():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
|
||||
elements = partition_email(filename=filename)
|
||||
assert len(elements) > 0
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.5.4-dev2" # pragma: no cover
|
||||
__version__ = "0.5.4-dev3" # pragma: no cover
|
||||
|
@ -37,6 +37,11 @@ PPT_MIME_TYPES = [
|
||||
"application/vnd.ms-powerpoint",
|
||||
]
|
||||
|
||||
TXT_MIME_TYPES = [
|
||||
"text/plain",
|
||||
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
|
||||
]
|
||||
|
||||
MD_MIME_TYPES = [
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
@ -175,7 +180,7 @@ def detect_filetype(
|
||||
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
|
||||
return FileType.MD
|
||||
|
||||
elif mime_type == "text/plain":
|
||||
elif mime_type in TXT_MIME_TYPES:
|
||||
if extension and extension == ".eml":
|
||||
return FileType.EML
|
||||
if extension and extension == ".md":
|
||||
|
Loading…
x
Reference in New Issue
Block a user