fix: raises on file-like object with .name not a valid path (#2614)

**Summary**
Fixes: #2308

**Additional context**
Through a somewhat deep call-chain, partitioning a file-like object
(e.g. io.BytesIO) having its `.name` attribute set to a path not
pointing to an actual file on the local filesystem would raise
`FileNotFoundError` when the last-modified date was being computed for
the document.

This scenario is a legitimate partitioning call, where `file.name` is
used downstream to describe the source of, for example, a bytes payload
downloaded from the network.

**Fix**
- Explicitly check that a file exists at the given path before accessing
it to get its modified date. Return `None` (already a legitimate return
value) when no such file exists.
- Generally clean up the implementations.
- Add unit tests that exercise all cases.

---------

Co-authored-by: John <43506685+Coniferish@users.noreply.github.com>
This commit is contained in:
Steve Canny 2024-03-07 11:02:04 -08:00 committed by GitHub
parent e35306cfc7
commit b27ad9b6aa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 99 additions and 9 deletions

View File

@ -1,4 +1,4 @@
## 0.12.6-dev7
## 0.12.6-dev8
### Enhancements
@ -10,6 +10,7 @@
### Fixes
* **Partitioning raises on file-like object with `.name` not a local file path.** When partitioning a file using the `file=` argument, and `file` is a file-like object (e.g. io.BytesIO) having a `.name` attribute, and the value of `file.name` is not a valid path to a file present on the local filesystem, `FileNotFoundError` was raised. This prevented use of the `file.name` attribute for downstream purposes to, for example, describe the source of a document retrieved from a network location via HTTP. The last-modified-date lookup now checks that a file exists at that path and returns `None` when it does not.
* **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string.
* **Include warnings** about the potential risk of installing a version of `pandoc` which does not support RTF files + instructions that will help resolve that issue.
* **Incorporate the `install-pandoc` Makefile recipe** into relevant stages of CI workflow, ensuring it is a version that supports RTF input files.

View File

@ -1,3 +1,7 @@
import datetime as dt
import io
import os
import pathlib
from dataclasses import dataclass
from unittest import mock
@ -540,3 +544,64 @@ def test_ocr_data_to_elements(
points=layout_el.bbox.coordinates,
system=coordinate_system,
)
class Describe_get_last_modified_date:
    """Unit-test suite for `common.get_last_modified_date()`."""

    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
        # -- create a real file and pin its access/modified times to a known instant --
        path = tmp_path / "some_file.txt"
        path.write_text("abcdefg")
        mtime = dt.datetime(2024, 3, 5, 17, 43, 40).timestamp()
        os.utime(path, (mtime, mtime))

        result = common.get_last_modified_date(str(path))

        assert result == "2024-03-05T17:43:40"

    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
        # -- the path is inside an existing directory but the file itself is never created --
        missing_path = tmp_path / "some_file_that_does_not_exist.txt"

        result = common.get_last_modified_date(str(missing_path))

        assert result is None
class Describe_get_last_modified_date_from_file:
    """Unit-test suite for `common.get_last_modified_date_from_file()`."""

    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
        self, tmp_path: pathlib.Path
    ):
        # -- create a real file and pin its access/modified times to a known instant --
        path = tmp_path / "some_file_2.txt"
        path.write_text("abcdefg")
        mtime = dt.datetime(2024, 3, 5, 20, 48, 26).timestamp()
        os.utime(path, (mtime, mtime))

        # -- a file opened with `open()` carries its path in `.name` --
        with open(path, "rb") as opened_file:
            result = common.get_last_modified_date_from_file(opened_file)

        assert result == "2024-03-05T20:48:26"

    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
        result = common.get_last_modified_date_from_file(b"abcdefg")

        assert result is None

    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
        stream = io.BytesIO(b"abcdefg")
        # -- guard the test's premise: a fresh BytesIO has no `.name` --
        assert hasattr(stream, "name") is False

        result = common.get_last_modified_date_from_file(stream)

        assert result is None

    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
        self, tmp_path: pathlib.Path
    ):
        # -- `.name` is set for descriptive purposes only; no file exists at that path --
        stream = io.BytesIO(b"abcdefg")
        stream.name = str(tmp_path / "a_file_that_isn't_here.txt")

        result = common.get_last_modified_date_from_file(stream)

        assert result is None

View File

@ -4,6 +4,7 @@
from __future__ import annotations
import io
import sys
import tempfile
from typing import cast
@ -163,6 +164,23 @@ def test_partition_xlsx_from_file():
assert elements[1].metadata.filename is None
def test_partition_xlsx_from_file_like_object_with_name():
    """An in-memory file whose `.name` is not a local file path partitions normally."""
    with open("example-docs/stanley-cups.xlsx", "rb") as xlsx_file:
        payload = xlsx_file.read()
    # -- the name describes the payload's origin; it is not a path to a file on disk --
    file = io.BytesIO(payload)
    file.name = "stanley-cups-downloaded-from-network.xlsx"

    elements = partition_xlsx(file=file, include_header=False)

    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    assert len(elements) == 4
    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
    second = elements[1]
    assert clean_extra_whitespace(second.text) == EXPECTED_TEXT_XLSX
    assert second.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert second.metadata.page_number == 1
    assert second.metadata.filetype == EXPECTED_FILETYPE
    assert second.metadata.page_name == EXCEPTED_PAGE_NAME
def test_partition_xlsx_from_file_with_metadata_filename():
with open("example-docs/stanley-cups.xlsx", "rb") as f:
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)

View File

@ -1 +1 @@
__version__ = "0.12.6-dev7" # pragma: no cover
__version__ = "0.12.6-dev8" # pragma: no cover

View File

@ -75,21 +75,27 @@ def get_last_modified_date(filename: str) -> Optional[str]:
Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
"2024-03-05T17:02:53".
"""
if not os.path.isfile(filename):
return None
modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
"""Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
filename = None
if hasattr(file, "name"):
filename = file.name
# -- a file-like object will have a name attribute if created by `open()` or if a name is
# -- assigned to it for metadata purposes. Use "" as default because the empty string is never
# -- a path to an actual file.
filename = str(getattr(file, "name", ""))
if not filename:
# -- there's no guarantee the path corresponds to an actual file on the filesystem. In
# -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
# -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
if not os.path.isfile(filename):
return None
modify_date = get_last_modified_date(filename)
return modify_date
return get_last_modified_date(filename)
def normalize_layout_element(
@ -145,7 +151,7 @@ def normalize_layout_element(
)
elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
assert isinstance(element_type, str)
assert isinstance(element_type, str) # Added to resolve type-error
_element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
_element_class = _element_class(
text=text,