mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	fix: raises on file-like object with .name not a valid path (#2614)
**Summary** Fixes: #2308

**Additional context** Through a somewhat deep call-chain, partitioning a file-like object (e.g. `io.BytesIO`) having its `.name` attribute set to a path not pointing to an actual file on the local filesystem would raise `FileNotFoundError` when the last-modified date was being computed for the document. This scenario is a legitimate partitioning call, where `file.name` is used downstream to describe the source of, for example, a bytes payload downloaded from the network.

**Fix**
- Explicitly check for the existence of a file at the given path before accessing it to get its modified date. Return `None` (already a legitimate return value) when no such file exists.
- Generally clean up the implementations.
- Add unit tests that exercise all cases.

---------

Co-authored-by: John <43506685+Coniferish@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									e35306cfc7
								
							
						
					
					
						commit
						b27ad9b6aa
					
				@ -1,4 +1,4 @@
 | 
				
			|||||||
## 0.12.6-dev7
 | 
					## 0.12.6-dev8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Enhancements
 | 
					### Enhancements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -10,6 +10,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
### Fixes
 | 
					### Fixes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* **Partitioning raises on file-like object with `.name` not a local file path.** When partitioning a file using the `file=` argument, where `file` was a file-like object (e.g. `io.BytesIO`) having a `.name` attribute whose value was not a valid path to a file present on the local filesystem, `FileNotFoundError` was raised. This prevented use of the `file.name` attribute for downstream purposes to, for example, describe the source of a document retrieved from a network location via HTTP.
 | 
				
			||||||
* **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string.
 | 
					* **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string.
 | 
				
			||||||
* **Include warnings** about the potential risk of installing a version of `pandoc` which does not support RTF files + instructions that will help resolve that issue.
 | 
					* **Include warnings** about the potential risk of installing a version of `pandoc` which does not support RTF files + instructions that will help resolve that issue.
 | 
				
			||||||
* **Incorporate the `install-pandoc` Makefile recipe** into relevant stages of CI workflow, ensuring it is a version that supports RTF input files.
 | 
					* **Incorporate the `install-pandoc` Makefile recipe** into relevant stages of CI workflow, ensuring it is a version that supports RTF input files.
 | 
				
			||||||
 | 
				
			|||||||
@ -1,3 +1,7 @@
 | 
				
			|||||||
 | 
					import datetime as dt
 | 
				
			||||||
 | 
					import io
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import pathlib
 | 
				
			||||||
from dataclasses import dataclass
 | 
					from dataclasses import dataclass
 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -540,3 +544,64 @@ def test_ocr_data_to_elements(
 | 
				
			|||||||
            points=layout_el.bbox.coordinates,
 | 
					            points=layout_el.bbox.coordinates,
 | 
				
			||||||
            system=coordinate_system,
 | 
					            system=coordinate_system,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Describe_get_last_modified_date:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
 | 
				
			||||||
 | 
					        modified_timestamp = dt.datetime(
 | 
				
			||||||
 | 
					            year=2024, month=3, day=5, hour=17, minute=43, second=40
 | 
				
			||||||
 | 
					        ).timestamp()
 | 
				
			||||||
 | 
					        file_path = tmp_path / "some_file.txt"
 | 
				
			||||||
 | 
					        file_path.write_text("abcdefg")
 | 
				
			||||||
 | 
					        os.utime(file_path, (modified_timestamp, modified_timestamp))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        last_modified_date = common.get_last_modified_date(str(file_path))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert last_modified_date == "2024-03-05T17:43:40"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
 | 
				
			||||||
 | 
					        file_path = tmp_path / "some_file_that_does_not_exist.txt"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        last_modified_date = common.get_last_modified_date(str(file_path))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert last_modified_date is None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Describe_get_last_modified_date_from_file:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
 | 
				
			||||||
 | 
					        self, tmp_path: pathlib.Path
 | 
				
			||||||
 | 
					    ):
 | 
				
			||||||
 | 
					        modified_timestamp = dt.datetime(
 | 
				
			||||||
 | 
					            year=2024, month=3, day=5, hour=20, minute=48, second=26
 | 
				
			||||||
 | 
					        ).timestamp()
 | 
				
			||||||
 | 
					        file_path = tmp_path / "some_file_2.txt"
 | 
				
			||||||
 | 
					        file_path.write_text("abcdefg")
 | 
				
			||||||
 | 
					        os.utime(file_path, (modified_timestamp, modified_timestamp))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        with open(file_path, "rb") as f:
 | 
				
			||||||
 | 
					            last_modified_date = common.get_last_modified_date_from_file(f)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert last_modified_date == "2024-03-05T20:48:26"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
 | 
				
			||||||
 | 
					        assert common.get_last_modified_date_from_file(b"abcdefg") is None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
 | 
				
			||||||
 | 
					        file = io.BytesIO(b"abcdefg")
 | 
				
			||||||
 | 
					        assert hasattr(file, "name") is False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        last_modified_date = common.get_last_modified_date_from_file(file)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert last_modified_date is None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
 | 
				
			||||||
 | 
					        self, tmp_path: pathlib.Path
 | 
				
			||||||
 | 
					    ):
 | 
				
			||||||
 | 
					        file = io.BytesIO(b"abcdefg")
 | 
				
			||||||
 | 
					        file.name = str(tmp_path / "a_file_that_isn't_here.txt")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        last_modified_date = common.get_last_modified_date_from_file(file)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert last_modified_date is None
 | 
				
			||||||
 | 
				
			|||||||
@ -4,6 +4,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from __future__ import annotations
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import io
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
from typing import cast
 | 
					from typing import cast
 | 
				
			||||||
@ -163,6 +164,23 @@ def test_partition_xlsx_from_file():
 | 
				
			|||||||
    assert elements[1].metadata.filename is None
 | 
					    assert elements[1].metadata.filename is None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_partition_xlsx_from_file_like_object_with_name():
 | 
				
			||||||
 | 
					    with open("example-docs/stanley-cups.xlsx", "rb") as f:
 | 
				
			||||||
 | 
					        file = io.BytesIO(f.read())
 | 
				
			||||||
 | 
					    file.name = "stanley-cups-downloaded-from-network.xlsx"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    elements = partition_xlsx(file=file, include_header=False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert sum(isinstance(element, Table) for element in elements) == 2
 | 
				
			||||||
 | 
					    assert len(elements) == 4
 | 
				
			||||||
 | 
					    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
 | 
				
			||||||
 | 
					    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
 | 
				
			||||||
 | 
					    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
 | 
				
			||||||
 | 
					    assert elements[1].metadata.page_number == 1
 | 
				
			||||||
 | 
					    assert elements[1].metadata.filetype == EXPECTED_FILETYPE
 | 
				
			||||||
 | 
					    assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_partition_xlsx_from_file_with_metadata_filename():
 | 
					def test_partition_xlsx_from_file_with_metadata_filename():
 | 
				
			||||||
    with open("example-docs/stanley-cups.xlsx", "rb") as f:
 | 
					    with open("example-docs/stanley-cups.xlsx", "rb") as f:
 | 
				
			||||||
        elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
 | 
					        elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
 | 
				
			||||||
 | 
				
			|||||||
@ -1 +1 @@
 | 
				
			|||||||
__version__ = "0.12.6-dev7"  # pragma: no cover
 | 
					__version__ = "0.12.6-dev8"  # pragma: no cover
 | 
				
			||||||
 | 
				
			|||||||
@ -75,21 +75,27 @@ def get_last_modified_date(filename: str) -> Optional[str]:
 | 
				
			|||||||
    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
 | 
					    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
 | 
				
			||||||
    "2024-03-05T17:02:53".
 | 
					    "2024-03-05T17:02:53".
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    if not os.path.isfile(filename):
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
 | 
					    modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
 | 
				
			||||||
    return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
 | 
					    return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
 | 
					def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
 | 
				
			||||||
    """Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
 | 
					    """Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
 | 
				
			||||||
    filename = None
 | 
					    # -- a file-like object will have a name attribute if created by `open()` or if a name is
 | 
				
			||||||
    if hasattr(file, "name"):
 | 
					    # -- assigned to it for metadata purposes. Use "" as default because the empty string is never
 | 
				
			||||||
        filename = file.name
 | 
					    # -- a path to an actual file.
 | 
				
			||||||
 | 
					    filename = str(getattr(file, "name", ""))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not filename:
 | 
					    # -- there's no guarantee the path corresponds to an actual file on the filesystem. In
 | 
				
			||||||
 | 
					    # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
 | 
				
			||||||
 | 
					    # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
 | 
				
			||||||
 | 
					    if not os.path.isfile(filename):
 | 
				
			||||||
        return None
 | 
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    modify_date = get_last_modified_date(filename)
 | 
					    return get_last_modified_date(filename)
 | 
				
			||||||
    return modify_date
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def normalize_layout_element(
 | 
					def normalize_layout_element(
 | 
				
			||||||
@ -145,7 +151,7 @@ def normalize_layout_element(
 | 
				
			|||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
 | 
					    elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
 | 
				
			||||||
        assert isinstance(element_type, str)
 | 
					        assert isinstance(element_type, str)  # Added to resolve type-error
 | 
				
			||||||
        _element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
 | 
					        _element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
 | 
				
			||||||
        _element_class = _element_class(
 | 
					        _element_class = _element_class(
 | 
				
			||||||
            text=text,
 | 
					            text=text,
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user