mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
Fix/handle-spooled-temp-file-eml (#800)
This PR makes the unstructured-api smoke tests pass.
This commit is contained in:
parent
901ef16835
commit
5f5da65e0b
@ -1,4 +1,4 @@
|
||||
## 0.7.8-dev1
|
||||
## 0.7.8
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Updates `parse_email` for `partition_email` so that `unstructured-api` passes the smoke tests
|
||||
* `partition_email` now works if there is no message content
|
||||
* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively
|
||||
* Adds recursive functionality to all fsspec connectors
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.7.8-dev1" # pragma: no cover
|
||||
__version__ = "0.7.8" # pragma: no cover
|
||||
|
||||
@ -2,6 +2,8 @@ from typing import IO, Optional, Tuple, Union
|
||||
|
||||
import chardet
|
||||
|
||||
from unstructured.partition.common import convert_to_bytes
|
||||
|
||||
ENCODE_REC_THRESHOLD = 0.5
|
||||
|
||||
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
|
||||
@ -46,14 +48,7 @@ def detect_file_encoding(
|
||||
with open(filename, "rb") as f:
|
||||
byte_data = f.read()
|
||||
elif file:
|
||||
if isinstance(file, bytes):
|
||||
byte_data = file
|
||||
else:
|
||||
if not hasattr(file, "mode") or "b" in file.mode:
|
||||
byte_data = file.read()
|
||||
else:
|
||||
with open(file.name, "rb") as f:
|
||||
byte_data = f.read()
|
||||
byte_data = convert_to_bytes(file)
|
||||
else:
|
||||
raise FileNotFoundError("No filename nor file were specified")
|
||||
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from io import BytesIO
|
||||
from io import BufferedReader, BytesIO, TextIOWrapper
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
|
||||
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from docx import table as docxtable
|
||||
from tabulate import tabulate
|
||||
@ -184,6 +184,25 @@ def spooled_to_bytes_io_if_needed(
|
||||
return file_obj
|
||||
|
||||
|
||||
def convert_to_bytes(
    file: Optional[Union[bytes, SpooledTemporaryFile, IO]] = None,
) -> bytes:
    """Return the entire content of ``file`` as bytes.

    Args:
        file: Raw ``bytes`` (returned unchanged) or a readable file-like object:
            a ``SpooledTemporaryFile``, a ``BytesIO``, a handle returned by
            ``open()``, or any other object exposing a ``read()`` method.

    Returns:
        The full content as ``bytes``. Text returned by a text-mode stream is
        encoded with the stream's own ``encoding`` (UTF-8 if it declares none).

    Raises:
        ValueError: If ``file`` is ``None`` or has no callable ``read()``.

    Note:
        Streams that are read directly are rewound to the start first and are
        left positioned at the end afterwards.
    """
    if isinstance(file, bytes):
        return file

    if isinstance(file, BytesIO):
        # getvalue() yields the whole buffer regardless of the current position.
        return file.getvalue()

    if isinstance(file, (TextIOWrapper, BufferedReader)):
        name = getattr(file, "name", None)
        # Re-open by path only when the handle really is path-backed. Wrappers
        # around in-memory streams have no ``name`` at all, and for handles made
        # by ``os.fdopen`` the name is an integer descriptor that ``open`` would
        # close on exit, breaking the caller's handle.
        if isinstance(name, (str, bytes)):
            with open(name, "rb") as f:
                return f.read()
        if isinstance(file, TextIOWrapper):
            # Read the underlying binary buffer so no decoding or newline
            # translation is applied. The wrapper's own position is not
            # kept in sync by this.
            file = file.buffer

    read = getattr(file, "read", None)
    if not callable(read):
        raise ValueError("Invalid file-like object type")

    if isinstance(file, SpooledTemporaryFile):
        # Always rewind: older Python versions do not expose seekable() here.
        file.seek(0)
    else:
        seekable = getattr(file, "seekable", None)
        if callable(seekable) and seekable():
            file.seek(0)

    data = read()
    if isinstance(data, str):
        # Text-mode stream: honour the declared return type.
        data = data.encode(getattr(file, "encoding", None) or "utf-8")
    return data
|
||||
|
||||
|
||||
def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True):
|
||||
"""
|
||||
Convert a table object from a Word document to an HTML table string using the tabulate library.
|
||||
|
||||
@ -4,6 +4,7 @@ import re
|
||||
import sys
|
||||
from email.message import Message
|
||||
from functools import partial
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from unstructured.file_utils.encoding import (
|
||||
@ -11,7 +12,10 @@ from unstructured.file_utils.encoding import (
|
||||
format_encoding_str,
|
||||
read_txt_file,
|
||||
)
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.common import (
|
||||
convert_to_bytes,
|
||||
exactly_one,
|
||||
)
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
@ -189,14 +193,14 @@ def find_embedded_image(
|
||||
|
||||
def parse_email(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
) -> Tuple[Optional[str], Message]:
|
||||
if filename is not None:
|
||||
with open(filename, "rb") as f:
|
||||
msg = email.message_from_binary_file(f)
|
||||
elif file is not None:
|
||||
with open(file.name, "rb") as f:
|
||||
msg = email.message_from_binary_file(f)
|
||||
f_bytes = convert_to_bytes(file)
|
||||
msg = email.message_from_bytes(f_bytes)
|
||||
else:
|
||||
raise ValueError("Either 'filename' or 'file' must be provided.")
|
||||
|
||||
@ -216,7 +220,7 @@ def parse_email(
|
||||
@add_metadata_with_filetype(FileType.EML)
|
||||
def partition_email(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
text: Optional[str] = None,
|
||||
content_source: str = "text/html",
|
||||
encoding: Optional[str] = None,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user