mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 11:03:38 +00:00
fix: raises on file-like object with .name not a valid path (#2614)
**Summary** Fixes: #2308

**Additional context** Through a somewhat deep call-chain, partitioning a file-like object (e.g. `io.BytesIO`) whose `.name` attribute is set to a path that does not point to an actual file on the local filesystem raised `FileNotFoundError` when the last-modified date was being computed for the document. This scenario is a legitimate partitioning call, where `file.name` is used downstream to describe the source of, for example, a bytes payload downloaded from the network.

**Fix**
- Explicitly check for the existence of a file at the given path before accessing it to get its modified date. Return `None` (already a legitimate return value) when no such file exists.
- Generally clean up the implementations.
- Add unit tests that exercise all cases.

---------

Co-authored-by: John <43506685+Coniferish@users.noreply.github.com>
This commit is contained in:
parent
e35306cfc7
commit
b27ad9b6aa
@ -1,4 +1,4 @@
|
||||
## 0.12.6-dev7
|
||||
## 0.12.6-dev8
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Partitioning raises on file-like object with `.name` not a local file path.** Previously, when partitioning a file using the `file=` argument, where `file` is a file-like object (e.g. `io.BytesIO`) having a `.name` attribute whose value is not a valid path to a file present on the local filesystem, `FileNotFoundError` was raised. This prevented use of the `file.name` attribute for downstream purposes such as describing the source of a document retrieved from a network location via HTTP. Now no exception is raised and no last-modified date is computed for such a file.
|
||||
* **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string.
|
||||
* **Include warnings** about the potential risk of installing a version of `pandoc` which does not support RTF files + instructions that will help resolve that issue.
|
||||
* **Incorporate the `install-pandoc` Makefile recipe** into relevant stages of CI workflow, ensuring it is a version that supports RTF input files.
|
||||
|
||||
@ -1,3 +1,7 @@
|
||||
import datetime as dt
|
||||
import io
|
||||
import os
|
||||
import pathlib
|
||||
from dataclasses import dataclass
|
||||
from unittest import mock
|
||||
|
||||
@ -540,3 +544,64 @@ def test_ocr_data_to_elements(
|
||||
points=layout_el.bbox.coordinates,
|
||||
system=coordinate_system,
|
||||
)
|
||||
|
||||
|
||||
class Describe_get_last_modified_date:
    """Unit-test suite for `common.get_last_modified_date()`."""

    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
        # -- stamp a real file with a known access/modified time so the result is predictable --
        mtime = dt.datetime(2024, 3, 5, 17, 43, 40).timestamp()
        target = tmp_path / "some_file.txt"
        target.write_text("abcdefg")
        os.utime(target, (mtime, mtime))

        result = common.get_last_modified_date(str(target))

        assert result == "2024-03-05T17:43:40"

    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
        # -- the path is inside an existing directory but nothing is ever written to it --
        missing = str(tmp_path / "some_file_that_does_not_exist.txt")

        result = common.get_last_modified_date(missing)

        assert result is None
|
||||
|
||||
|
||||
class Describe_get_last_modified_date_from_file:
    """Unit-test suite for `common.get_last_modified_date_from_file()`."""

    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
        self, tmp_path: pathlib.Path
    ):
        # -- stamp a real file with a known access/modified time, then hand over its handle --
        mtime = dt.datetime(2024, 3, 5, 20, 48, 26).timestamp()
        target = tmp_path / "some_file_2.txt"
        target.write_text("abcdefg")
        os.utime(target, (mtime, mtime))

        with open(target, "rb") as handle:
            result = common.get_last_modified_date_from_file(handle)

        assert result == "2024-03-05T20:48:26"

    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
        result = common.get_last_modified_date_from_file(b"abcdefg")

        assert result is None

    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
        stream = io.BytesIO(b"abcdefg")
        # -- guard the premise of this test: a fresh BytesIO carries no `.name` --
        assert hasattr(stream, "name") is False

        result = common.get_last_modified_date_from_file(stream)

        assert result is None

    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
        self, tmp_path: pathlib.Path
    ):
        # -- `.name` is set purely as a label; no file is created at that location --
        stream = io.BytesIO(b"abcdefg")
        stream.name = str(tmp_path / "a_file_that_isn't_here.txt")

        result = common.get_last_modified_date_from_file(stream)

        assert result is None
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import sys
|
||||
import tempfile
|
||||
from typing import cast
|
||||
@ -163,6 +164,23 @@ def test_partition_xlsx_from_file():
|
||||
assert elements[1].metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_xlsx_from_file_like_object_with_name():
    """Partitioning works for an in-memory stream whose `.name` is not a local file path."""
    # -- load the workbook into memory so the stream is detached from the filesystem --
    with open("example-docs/stanley-cups.xlsx", "rb") as f:
        payload = f.read()
    file = io.BytesIO(payload)
    # -- a bare filename used only as a label; the stream's content comes from `payload` --
    file.name = "stanley-cups-downloaded-from-network.xlsx"

    elements = partition_xlsx(file=file, include_header=False)

    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    assert len(elements) == 4
    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE

    second = elements[1]
    assert clean_extra_whitespace(second.text) == EXPECTED_TEXT_XLSX
    assert second.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert second.metadata.page_number == 1
    assert second.metadata.filetype == EXPECTED_FILETYPE
    assert second.metadata.page_name == EXCEPTED_PAGE_NAME
|
||||
|
||||
|
||||
def test_partition_xlsx_from_file_with_metadata_filename():
|
||||
with open("example-docs/stanley-cups.xlsx", "rb") as f:
|
||||
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.6-dev7" # pragma: no cover
|
||||
__version__ = "0.12.6-dev8" # pragma: no cover
|
||||
|
||||
@ -75,21 +75,27 @@ def get_last_modified_date(filename: str) -> Optional[str]:
|
||||
Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
|
||||
"2024-03-05T17:02:53".
|
||||
"""
|
||||
if not os.path.isfile(filename):
|
||||
return None
|
||||
|
||||
modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
|
||||
return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
|
||||
|
||||
|
||||
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
    """Modified timestamp of `file` if it corresponds to a file on the local filesystem.

    `file` may be a file-like object or a `bytes` payload. Returns `None` when `file` has no
    `.name` attribute (e.g. `bytes` or a fresh `io.BytesIO`) or when its `.name` is not the path
    of an existing file. Otherwise the result is delegated to `get_last_modified_date()`.
    """
    # -- a file-like object will have a name attribute if created by `open()` or if a name is
    # -- assigned to it for metadata purposes. Use "" as default because the empty string is never
    # -- a path to an actual file.
    filename = str(getattr(file, "name", ""))

    # -- there's no guarantee the path corresponds to an actual file on the filesystem. In
    # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
    # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
    if not os.path.isfile(filename):
        return None

    return get_last_modified_date(filename)
|
||||
|
||||
|
||||
def normalize_layout_element(
|
||||
@ -145,7 +151,7 @@ def normalize_layout_element(
|
||||
)
|
||||
|
||||
elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
|
||||
assert isinstance(element_type, str)
|
||||
assert isinstance(element_type, str) # Added to resolve type-error
|
||||
_element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
|
||||
_element_class = _element_class(
|
||||
text=text,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user