Fix/handle-spooled-temp-file-eml (#800)

This PR makes the unstructured-api smoke tests pass.
This commit is contained in:
Christine Straub 2023-06-22 19:21:28 -07:00 committed by GitHub
parent 901ef16835
commit 5f5da65e0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 36 additions and 17 deletions

View File

@ -1,4 +1,4 @@
## 0.7.8-dev1
## 0.7.8
### Enhancements
@ -8,6 +8,7 @@
### Fixes
* Updates `parse_email` for `partition_email` so that `unstructured-api` passes the smoke tests
* `partition_email` now works if there is no message content
* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively
* Adds recursive functionality to all fsspec connectors

View File

@ -1 +1 @@
__version__ = "0.7.8-dev1" # pragma: no cover
__version__ = "0.7.8" # pragma: no cover

View File

@ -2,6 +2,8 @@ from typing import IO, Optional, Tuple, Union
import chardet
from unstructured.partition.common import convert_to_bytes
ENCODE_REC_THRESHOLD = 0.5
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
@ -46,14 +48,7 @@ def detect_file_encoding(
with open(filename, "rb") as f:
byte_data = f.read()
elif file:
if isinstance(file, bytes):
byte_data = file
else:
if not hasattr(file, "mode") or "b" in file.mode:
byte_data = file.read()
else:
with open(file.name, "rb") as f:
byte_data = f.read()
byte_data = convert_to_bytes(file)
else:
raise FileNotFoundError("No filename nor file were specified")

View File

@ -1,9 +1,9 @@
from __future__ import annotations
import subprocess
from io import BytesIO
from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
from docx import table as docxtable
from tabulate import tabulate
@ -184,6 +184,25 @@ def spooled_to_bytes_io_if_needed(
return file_obj
def convert_to_bytes(
    file: Optional[Union[bytes, SpooledTemporaryFile, IO]] = None,
) -> bytes:
    """Return the complete contents of `file` as bytes.

    Supported inputs:

    * ``bytes`` -- returned unchanged.
    * ``BytesIO`` -- the whole buffer, regardless of the current position.
    * ``SpooledTemporaryFile`` -- rewound to the start and read in full.
    * ``TextIOWrapper`` / ``BufferedReader`` -- the backing file is re-read in
      binary mode via its ``name``; when the stream has no usable path it is
      read directly from the start instead.

    Text read from a text-mode stream is encoded with the stream's own
    ``encoding`` (UTF-8 when it reports none) so the result is always bytes.

    Raises:
        ValueError: if `file` is ``None`` or any other unsupported type.
    """
    if isinstance(file, bytes):
        return file

    if isinstance(file, BytesIO):
        return file.getvalue()

    if isinstance(file, (TextIOWrapper, BufferedReader)):
        name = getattr(file, "name", None)
        # An int name is a raw file descriptor: re-opening it would close the
        # caller's descriptor when the `with` block exits, so only real paths
        # are re-opened.
        if name is not None and not isinstance(name, int):
            try:
                with open(name, "rb") as f:
                    return f.read()
            except (OSError, TypeError, ValueError):
                # Not an openable path (e.g. "<stdin>" or a removed file);
                # fall back to the already-open stream below.
                pass
        if file.seekable():
            file.seek(0)
    elif isinstance(file, SpooledTemporaryFile):
        file.seek(0)
    else:
        raise ValueError("Invalid file-like object type")

    content = file.read()
    if isinstance(content, str):
        # Text-mode streams yield str; honour the declared `bytes` return type.
        content = content.encode(getattr(file, "encoding", None) or "utf-8")
    return content
def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True):
"""
Convert a table object from a Word document to an HTML table string using the tabulate library.

View File

@ -4,6 +4,7 @@ import re
import sys
from email.message import Message
from functools import partial
from tempfile import SpooledTemporaryFile
from typing import IO, Dict, List, Optional, Tuple, Union
from unstructured.file_utils.encoding import (
@ -11,7 +12,10 @@ from unstructured.file_utils.encoding import (
format_encoding_str,
read_txt_file,
)
from unstructured.partition.common import exactly_one
from unstructured.partition.common import (
convert_to_bytes,
exactly_one,
)
if sys.version_info < (3, 8):
from typing_extensions import Final
@ -189,14 +193,14 @@ def find_embedded_image(
def parse_email(
filename: Optional[str] = None,
file: Optional[IO] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
) -> Tuple[Optional[str], Message]:
if filename is not None:
with open(filename, "rb") as f:
msg = email.message_from_binary_file(f)
elif file is not None:
with open(file.name, "rb") as f:
msg = email.message_from_binary_file(f)
f_bytes = convert_to_bytes(file)
msg = email.message_from_bytes(f_bytes)
else:
raise ValueError("Either 'filename' or 'file' must be provided.")
@ -216,7 +220,7 @@ def parse_email(
@add_metadata_with_filetype(FileType.EML)
def partition_email(
filename: Optional[str] = None,
file: Optional[IO] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
text: Optional[str] = None,
content_source: str = "text/html",
encoding: Optional[str] = None,