diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ca08db0b..e006965d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.6-dev7 +## 0.12.6-dev8 ### Enhancements @@ -10,6 +10,7 @@ ### Fixes +* **Fix partitioning raising on a file-like object whose `.name` is not a local file path.** Previously, when partitioning a file using the `file=` argument, and `file` was a file-like object (e.g. `io.BytesIO`) having a `.name` attribute, and the value of `file.name` was not a valid path to a file present on the local filesystem, `FileNotFoundError` was raised. This prevented use of the `file.name` attribute for downstream purposes such as describing the source of a document retrieved from a network location via HTTP. Now no error is raised and no last-modified date is derived from such a name. * **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. * **Include warnings** about the potential risk of installing a version of `pandoc` which does not support RTF files + instructions that will help resolve that issue. * **Incorporate the `install-pandoc` Makefile recipe** into relevant stages of CI workflow, ensuring it is a version that supports RTF input files. 
diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index f57da4875..57f672584 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -1,3 +1,7 @@ +import datetime as dt +import io +import os +import pathlib from dataclasses import dataclass from unittest import mock @@ -540,3 +544,64 @@ def test_ocr_data_to_elements( points=layout_el.bbox.coordinates, system=coordinate_system, ) + + +class Describe_get_last_modified_date: + + def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path): + modified_timestamp = dt.datetime( + year=2024, month=3, day=5, hour=17, minute=43, second=40 + ).timestamp() + file_path = tmp_path / "some_file.txt" + file_path.write_text("abcdefg") + os.utime(file_path, (modified_timestamp, modified_timestamp)) + + last_modified_date = common.get_last_modified_date(str(file_path)) + + assert last_modified_date == "2024-03-05T17:43:40" + + def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path): + file_path = tmp_path / "some_file_that_does_not_exist.txt" + + last_modified_date = common.get_last_modified_date(str(file_path)) + + assert last_modified_date is None + + +class Describe_get_last_modified_date_from_file: + + def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file( + self, tmp_path: pathlib.Path + ): + modified_timestamp = dt.datetime( + year=2024, month=3, day=5, hour=20, minute=48, second=26 + ).timestamp() + file_path = tmp_path / "some_file_2.txt" + file_path.write_text("abcdefg") + os.utime(file_path, (modified_timestamp, modified_timestamp)) + + with open(file_path, "rb") as f: + last_modified_date = common.get_last_modified_date_from_file(f) + + assert last_modified_date == "2024-03-05T20:48:26" + + def but_it_returns_None_when_the_argument_is_a_bytes_object(self): + assert common.get_last_modified_date_from_file(b"abcdefg") is None + 
+ def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self): + file = io.BytesIO(b"abcdefg") + assert hasattr(file, "name") is False + + last_modified_date = common.get_last_modified_date_from_file(file) + + assert last_modified_date is None + + def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file( + self, tmp_path: pathlib.Path + ): + file = io.BytesIO(b"abcdefg") + file.name = str(tmp_path / "a_file_that_isn't_here.txt") + + last_modified_date = common.get_last_modified_date_from_file(file) + + assert last_modified_date is None diff --git a/test_unstructured/partition/xlsx/test_xlsx.py b/test_unstructured/partition/xlsx/test_xlsx.py index 1a4c8fff9..65d9f79d0 100644 --- a/test_unstructured/partition/xlsx/test_xlsx.py +++ b/test_unstructured/partition/xlsx/test_xlsx.py @@ -4,6 +4,7 @@ from __future__ import annotations +import io import sys import tempfile from typing import cast @@ -163,6 +164,23 @@ def test_partition_xlsx_from_file(): assert elements[1].metadata.filename is None +def test_partition_xlsx_from_file_like_object_with_name(): + with open("example-docs/stanley-cups.xlsx", "rb") as f: + file = io.BytesIO(f.read()) + file.name = "stanley-cups-downloaded-from-network.xlsx" + + elements = partition_xlsx(file=file, include_header=False) + + assert sum(isinstance(element, Table) for element in elements) == 2 + assert len(elements) == 4 + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX + assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX + assert elements[1].metadata.page_number == 1 + assert elements[1].metadata.filetype == EXPECTED_FILETYPE + assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME + + def test_partition_xlsx_from_file_with_metadata_filename(): with open("example-docs/stanley-cups.xlsx", "rb") as f: elements = partition_xlsx(file=f, metadata_filename="test", include_header=False) diff --git 
a/unstructured/__version__.py b/unstructured/__version__.py index df403815d..cdb9342e7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.6-dev7" # pragma: no cover +__version__ = "0.12.6-dev8" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 225d779f7..35321b30f 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -75,21 +75,27 @@ def get_last_modified_date(filename: str) -> Optional[str]: Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like "2024-03-05T17:02:53". """ + if not os.path.isfile(filename): + return None + modify_date = datetime.fromtimestamp(os.path.getmtime(filename)) return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z") def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]: """Modified timestamp of `file` if it corresponds to a file on the local filesystem.""" - filename = None - if hasattr(file, "name"): - filename = file.name + # -- a file-like object will have a name attribute if created by `open()` or if a name is + # -- assigned to it for metadata purposes. Use "" as default because the empty string is never + # -- a path to an actual file. + filename = str(getattr(file, "name", "")) - if not filename: + # -- there's no guarantee the path corresponds to an actual file on the filesystem. In + # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to + # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP. 
+ if not os.path.isfile(filename): return None - modify_date = get_last_modified_date(filename) - return modify_date + return get_last_modified_date(filename) def normalize_layout_element( @@ -145,7 +151,7 @@ def normalize_layout_element( ) elif element_type in TYPE_TO_TEXT_ELEMENT_MAP: - assert isinstance(element_type, str) + assert isinstance(element_type, str) # Added to resolve type-error _element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type] _element_class = _element_class( text=text,