From 03c2bf8f1f124ca9da0675d3ba04b9520bae8ff5 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Fri, 20 Sep 2024 13:35:28 -0700 Subject: [PATCH] rfctr(part): extract partition.common submodules (#3649) **Summary** In preparation for consolidating post-partitioning metadata decorators, extract `partition.common` module into a sub-package (directory) and extract `partition.common.metadata` module to house metadata-specific objects shared by partitioners. **Additional Context** - This new module will be the home of the new consolidated metadata decorator. - The consolidated decorator is a step toward removing post-processing decorators from _delegating_ partitioners. A delegating partitioner is one that converts its file to a different format and "delegates" actual partitioning to the partitioner for that target format. 10 of the 20 partitioners are delegating partitioners. - Removing decorators from delegating partitioners will allow us to avoid "double-decorating", i.e. running those decorators twice, once on the principal partitioner and again on the proxy partitioner. - This will allow us to send `**kwargs` to either partitioner, removing the knowledge of which arguments to send for each file-type from auto-partition. - And this will allow pluggable auto-partitioners which all have a `partition_x(filename, *, file, **kwargs) -> list[Element]` interface. 
--- CHANGELOG.md | 12 ++ .../create-and-check-chroma.sh | 9 - .../partition/common/__init__.py | 0 .../partition/{ => common}/test_common.py | 173 +-------------- .../partition/common/test_metadata.py | 201 ++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/file_utils/encoding.py | 2 +- unstructured/file_utils/file_conversion.py | 2 +- unstructured/file_utils/filetype.py | 8 +- unstructured/partition/api.py | 2 +- unstructured/partition/auto.py | 2 +- unstructured/partition/common/__init__.py | 0 unstructured/partition/{ => common}/common.py | 122 +---------- unstructured/partition/common/metadata.py | 128 +++++++++++ unstructured/partition/csv.py | 5 +- unstructured/partition/doc.py | 7 +- unstructured/partition/docx.py | 2 +- unstructured/partition/email.py | 5 +- unstructured/partition/epub.py | 3 +- unstructured/partition/html/partition.py | 5 +- unstructured/partition/image.py | 2 +- unstructured/partition/json.py | 4 +- unstructured/partition/md.py | 4 +- unstructured/partition/msg.py | 2 +- unstructured/partition/odt.py | 3 +- unstructured/partition/org.py | 3 +- unstructured/partition/pdf.py | 2 +- .../partition/pdf_image/pdf_image_utils.py | 8 +- unstructured/partition/ppt.py | 5 +- unstructured/partition/pptx.py | 4 +- unstructured/partition/rst.py | 3 +- unstructured/partition/rtf.py | 3 +- unstructured/partition/text.py | 4 +- unstructured/partition/tsv.py | 6 +- unstructured/partition/xlsx.py | 5 +- unstructured/partition/xml.py | 6 +- unstructured/staging/base.py | 2 +- 37 files changed, 406 insertions(+), 350 deletions(-) create mode 100644 test_unstructured/partition/common/__init__.py rename test_unstructured/partition/{ => common}/test_common.py (69%) create mode 100644 test_unstructured/partition/common/test_metadata.py create mode 100644 unstructured/partition/common/__init__.py rename unstructured/partition/{ => common}/common.py (84%) create mode 100644 unstructured/partition/common/metadata.py diff --git a/CHANGELOG.md 
b/CHANGELOG.md index cfdd51e67..acb61499d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ +## 0.15.14-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.15.13 +### BREAKING CHANGES + +* **Remove dead experimental code.** Unused code in `file_utils.experimantal` and `file_utils.metadata` was removed. These functions were never published in the documentation, but if a client dug these out and used them this removal could break client code. + ### Enhancements * **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents. diff --git a/scripts/chroma-test-helpers/create-and-check-chroma.sh b/scripts/chroma-test-helpers/create-and-check-chroma.sh index 726ee9cae..f89129162 100755 --- a/scripts/chroma-test-helpers/create-and-check-chroma.sh +++ b/scripts/chroma-test-helpers/create-and-check-chroma.sh @@ -3,13 +3,4 @@ set -e # $1 is the path for chroma to write the contents to. 
The symbol "&" runs process in background -echo "Current venv is:" -echo "$VIRTUAL_ENV" -echo "Current path is:" -echo "$PATH" -ls -l "$VIRTUAL_ENV/bin/chroma" -echo "================" -cat "$VIRTUAL_ENV/bin/chroma" -echo "================" -# chroma run --path "$1" & python "$VIRTUAL_ENV/bin/chroma" run --path "$1" & diff --git a/test_unstructured/partition/common/__init__.py b/test_unstructured/partition/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/common/test_common.py similarity index 69% rename from test_unstructured/partition/test_common.py rename to test_unstructured/partition/common/test_common.py index ebe12c0be..8981ca19e 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/common/test_common.py @@ -1,6 +1,3 @@ -import datetime as dt -import io -import os import pathlib from dataclasses import dataclass from multiprocessing import Pool @@ -20,7 +17,6 @@ from unstructured.documents.elements import ( TYPE_TO_TEXT_ELEMENT_MAP, CheckBox, CoordinatesMetadata, - ElementMetadata, ElementType, FigureCaption, Header, @@ -32,7 +28,7 @@ from unstructured.documents.elements import ( from unstructured.documents.elements import ( Image as ImageElement, ) -from unstructured.partition import common +from unstructured.partition.common import common from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT @@ -347,7 +343,7 @@ class MockRunOutput: def test_convert_office_doc_captures_errors(monkeypatch, caplog): - from unstructured.partition.common import subprocess + from unstructured.partition.common.common import subprocess def mock_run(*args, **kwargs): return MockRunOutput(1, "an error occurred".encode(), "error details".encode()) @@ -429,75 +425,6 @@ def test_get_page_image_metadata_and_coordinate_system(): assert isinstance(metadata, dict) -def test_set_element_hierarchy(): - 
elements_to_set = [ - Title(text="Title"), # 0 - NarrativeText(text="NarrativeText"), # 1 - FigureCaption(text="FigureCaption"), # 2 - ListItem(text="ListItem"), # 3 - ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 4 - ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 5 - ListItem(text="ListItem"), # 6 - CheckBox(element_id="some-id-1", checked=True), # 7 - Title(text="Title 2"), # 8 - ListItem(text="ListItem"), # 9 - ListItem(text="ListItem"), # 10 - Text(text="Text"), # 11 - ] - elements = common.set_element_hierarchy(elements_to_set) - - assert ( - elements[1].metadata.parent_id == elements[0].id - ), "NarrativeText should be child of Title" - assert ( - elements[2].metadata.parent_id == elements[0].id - ), "FigureCaption should be child of Title" - assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title" - assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title" - assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title" - assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title" - # NOTE(Hubert): moving the category field to Element, caused this to fail. - # Checkboxes will soon be deprecated, then we can remove the test. 
- # assert ( - # elements[7].metadata.parent_id is None - # ), "CheckBox should be None, as it's not a Text based element" - assert elements[8].metadata.parent_id is None, "Title 2 should be child of None" - assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2" - assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2" - assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2" - - -def test_set_element_hierarchy_custom_rule_set(): - elements_to_set = [ - Header(text="Header"), # 0 - Title(text="Title"), # 1 - NarrativeText(text="NarrativeText"), # 2 - Text(text="Text"), # 3 - Title(text="Title 2"), # 4 - FigureCaption(text="FigureCaption"), # 5 - ] - - custom_rule_set = { - "Header": ["Title", "Text"], - "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"], - } - - elements = common.set_element_hierarchy( - elements=elements_to_set, - ruleset=custom_rule_set, - ) - - assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header" - assert ( - elements[2].metadata.parent_id == elements[1].id - ), "NarrativeText should be child of Title" - assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title" - assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header" - assert ( - elements[5].metadata.parent_id == elements[4].id - ), "FigureCaption should be child of Title 2" - - @dataclass class MockImage: width = 640 @@ -607,99 +534,3 @@ def test_ocr_data_to_elements( points=layout_el.bbox.coordinates, system=coordinate_system, ) - - -class Describe_get_last_modified: - """Isolated unit-tests for `unstructured.partition.common.get_last_modified().""" - - def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided( - self, file_and_last_modified: tuple[str, str] - ): - file_path, last_modified = file_and_last_modified - last_modified_date = 
common.get_last_modified(str(file_path), None, False) - assert last_modified_date == last_modified - - def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided( - self, file_and_last_modified: tuple[str, str] - ): - file_path, last_modified = file_and_last_modified - with open(file_path, "rb") as f: - last_modified_date = common.get_last_modified(None, f, True) - assert last_modified_date == last_modified - - def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]): - file_path, _ = file_and_last_modified - with open(file_path, "rb") as f: - last_modified_date = common.get_last_modified(None, f, False) - assert last_modified_date is None - - # -- fixtures -------------------------------------------------------------------------------- - - @pytest.fixture() - def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]: - modified_timestamp = dt.datetime( - year=2024, month=6, day=14, hour=15, minute=39, second=25 - ).timestamp() - file_path = tmp_path / "some_file.txt" - file_path.write_text("abcdefg") - os.utime(file_path, (modified_timestamp, modified_timestamp)) - return str(file_path), "2024-06-14T15:39:25" - - -class Describe_get_last_modified_date: - def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path): - modified_timestamp = dt.datetime( - year=2024, month=3, day=5, hour=17, minute=43, second=40 - ).timestamp() - file_path = tmp_path / "some_file.txt" - file_path.write_text("abcdefg") - os.utime(file_path, (modified_timestamp, modified_timestamp)) - - last_modified_date = common.get_last_modified_date(str(file_path)) - - assert last_modified_date == "2024-03-05T17:43:40" - - def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path): - file_path = tmp_path / "some_file_that_does_not_exist.txt" - - last_modified_date = common.get_last_modified_date(str(file_path)) - - assert last_modified_date is None - - -class 
Describe_get_last_modified_date_from_file: - def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file( - self, tmp_path: pathlib.Path - ): - modified_timestamp = dt.datetime( - year=2024, month=3, day=5, hour=20, minute=48, second=26 - ).timestamp() - file_path = tmp_path / "some_file_2.txt" - file_path.write_text("abcdefg") - os.utime(file_path, (modified_timestamp, modified_timestamp)) - - with open(file_path, "rb") as f: - last_modified_date = common.get_last_modified_date_from_file(f) - - assert last_modified_date == "2024-03-05T20:48:26" - - def but_it_returns_None_when_the_argument_is_a_bytes_object(self): - assert common.get_last_modified_date_from_file(b"abcdefg") is None - - def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self): - file = io.BytesIO(b"abcdefg") - assert hasattr(file, "name") is False - - last_modified_date = common.get_last_modified_date_from_file(file) - - assert last_modified_date is None - - def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file( - self, tmp_path: pathlib.Path - ): - file = io.BytesIO(b"abcdefg") - file.name = str(tmp_path / "a_file_that_isn't_here.txt") - - last_modified_date = common.get_last_modified_date_from_file(file) - - assert last_modified_date is None diff --git a/test_unstructured/partition/common/test_metadata.py b/test_unstructured/partition/common/test_metadata.py new file mode 100644 index 000000000..024ac9148 --- /dev/null +++ b/test_unstructured/partition/common/test_metadata.py @@ -0,0 +1,201 @@ +"""Test-suite for `unstructured.partition.common.metadata` module.""" + +from __future__ import annotations + +import datetime as dt +import io +import os +import pathlib + +import pytest + +from unstructured.documents.elements import ( + CheckBox, + ElementMetadata, + FigureCaption, + Header, + ListItem, + NarrativeText, + Text, + Title, +) +from unstructured.partition.common.metadata import ( + get_last_modified, + 
get_last_modified_date, + get_last_modified_date_from_file, + set_element_hierarchy, +) + +# ================================================================================================ +# LAST-MODIFIED +# ================================================================================================ + + +class Describe_get_last_modified: + """Isolated unit-tests for `unstructured.partition.common.metadata.get_last_modified().""" + + def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided( + self, file_and_last_modified: tuple[str, str] + ): + file_path, last_modified = file_and_last_modified + last_modified_date = get_last_modified(str(file_path), None, False) + assert last_modified_date == last_modified + + def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided( + self, file_and_last_modified: tuple[str, str] + ): + file_path, last_modified = file_and_last_modified + with open(file_path, "rb") as f: + last_modified_date = get_last_modified(None, f, True) + assert last_modified_date == last_modified + + def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]): + file_path, _ = file_and_last_modified + with open(file_path, "rb") as f: + last_modified_date = get_last_modified(None, f, False) + assert last_modified_date is None + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]: + modified_timestamp = dt.datetime( + year=2024, month=6, day=14, hour=15, minute=39, second=25 + ).timestamp() + file_path = tmp_path / "some_file.txt" + file_path.write_text("abcdefg") + os.utime(file_path, (modified_timestamp, modified_timestamp)) + return str(file_path), "2024-06-14T15:39:25" + + +class Describe_get_last_modified_date: + def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path): + modified_timestamp = dt.datetime( 
+ year=2024, month=3, day=5, hour=17, minute=43, second=40 + ).timestamp() + file_path = tmp_path / "some_file.txt" + file_path.write_text("abcdefg") + os.utime(file_path, (modified_timestamp, modified_timestamp)) + + last_modified_date = get_last_modified_date(str(file_path)) + + assert last_modified_date == "2024-03-05T17:43:40" + + def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path): + file_path = tmp_path / "some_file_that_does_not_exist.txt" + + last_modified_date = get_last_modified_date(str(file_path)) + + assert last_modified_date is None + + +class Describe_get_last_modified_date_from_file: + def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file( + self, tmp_path: pathlib.Path + ): + modified_timestamp = dt.datetime( + year=2024, month=3, day=5, hour=20, minute=48, second=26 + ).timestamp() + file_path = tmp_path / "some_file_2.txt" + file_path.write_text("abcdefg") + os.utime(file_path, (modified_timestamp, modified_timestamp)) + + with open(file_path, "rb") as f: + last_modified_date = get_last_modified_date_from_file(f) + + assert last_modified_date == "2024-03-05T20:48:26" + + def but_it_returns_None_when_the_argument_is_a_bytes_object(self): + assert get_last_modified_date_from_file(b"abcdefg") is None + + def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self): + file = io.BytesIO(b"abcdefg") + assert hasattr(file, "name") is False + + last_modified_date = get_last_modified_date_from_file(file) + + assert last_modified_date is None + + def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file( + self, tmp_path: pathlib.Path + ): + file = io.BytesIO(b"abcdefg") + file.name = str(tmp_path / "a_file_that_isn't_here.txt") + + last_modified_date = get_last_modified_date_from_file(file) + + assert last_modified_date is None + + +# ================================================================================================ +# ELEMENT 
HIERARCHY +# ================================================================================================ + + +def test_set_element_hierarchy(): + elements_to_set = [ + Title(text="Title"), # 0 + NarrativeText(text="NarrativeText"), # 1 + FigureCaption(text="FigureCaption"), # 2 + ListItem(text="ListItem"), # 3 + ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 4 + ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 5 + ListItem(text="ListItem"), # 6 + CheckBox(element_id="some-id-1", checked=True), # 7 + Title(text="Title 2"), # 8 + ListItem(text="ListItem"), # 9 + ListItem(text="ListItem"), # 10 + Text(text="Text"), # 11 + ] + elements = set_element_hierarchy(elements_to_set) + + assert ( + elements[1].metadata.parent_id == elements[0].id + ), "NarrativeText should be child of Title" + assert ( + elements[2].metadata.parent_id == elements[0].id + ), "FigureCaption should be child of Title" + assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title" + assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title" + assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title" + assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title" + # NOTE(Hubert): moving the category field to Element, caused this to fail. + # Checkboxes will soon be deprecated, then we can remove the test. 
+ # assert ( + # elements[7].metadata.parent_id is None + # ), "CheckBox should be None, as it's not a Text based element" + assert elements[8].metadata.parent_id is None, "Title 2 should be child of None" + assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2" + assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2" + assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2" + + +def test_set_element_hierarchy_custom_rule_set(): + elements_to_set = [ + Header(text="Header"), # 0 + Title(text="Title"), # 1 + NarrativeText(text="NarrativeText"), # 2 + Text(text="Text"), # 3 + Title(text="Title 2"), # 4 + FigureCaption(text="FigureCaption"), # 5 + ] + + custom_rule_set = { + "Header": ["Title", "Text"], + "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"], + } + + elements = set_element_hierarchy( + elements=elements_to_set, + ruleset=custom_rule_set, + ) + + assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header" + assert ( + elements[2].metadata.parent_id == elements[1].id + ), "NarrativeText should be child of Title" + assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title" + assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header" + assert ( + elements[5].metadata.parent_id == elements[4].id + ), "FigureCaption should be child of Title 2" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 341cad2d2..b62ccd10f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.13" # pragma: no cover +__version__ = "0.15.14-dev0" # pragma: no cover diff --git a/unstructured/file_utils/encoding.py b/unstructured/file_utils/encoding.py index 01d57ff75..de5249f7b 100644 --- a/unstructured/file_utils/encoding.py +++ b/unstructured/file_utils/encoding.py @@ -2,7 +2,7 @@ from 
typing import IO, Optional, Tuple, Union import chardet -from unstructured.partition.common import convert_to_bytes +from unstructured.partition.common.common import convert_to_bytes ENCODE_REC_THRESHOLD = 0.8 diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py index a6fc5f014..a4ef5fabd 100644 --- a/unstructured/file_utils/file_conversion.py +++ b/unstructured/file_utils/file_conversion.py @@ -4,7 +4,7 @@ import os import tempfile from typing import IO -from unstructured.partition.common import exactly_one +from unstructured.partition.common.common import exactly_one from unstructured.utils import requires_dependencies diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index e459e72f8..0203d8070 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -47,12 +47,12 @@ from unstructured.file_utils.encoding import detect_file_encoding, format_encodi from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN -from unstructured.partition.common import ( +from unstructured.partition.common.common import ( add_element_metadata, exactly_one, remove_element_metadata, - set_element_hierarchy, ) +from unstructured.partition.common.metadata import set_element_hierarchy from unstructured.utils import get_call_args_applying_defaults, lazyproperty LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) @@ -500,8 +500,8 @@ class _OleFileDifferentiator: @staticmethod def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None: with ctx.open() as f: - ole = OleFileIO(f) - root_storage = Storage.from_ole(ole) + ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType] + root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType] for stream in root_storage.streams: if stream.name == "WordDocument": diff --git 
a/unstructured/partition/api.py b/unstructured/partition/api.py index c5094d77f..b8a1cded2 100644 --- a/unstructured/partition/api.py +++ b/unstructured/partition/api.py @@ -9,7 +9,7 @@ from unstructured_client.models import shared from unstructured.documents.elements import Element from unstructured.logger import logger -from unstructured.partition.common import exactly_one +from unstructured.partition.common.common import exactly_one from unstructured.staging.base import elements_from_dicts, elements_from_json diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 35cbc37ab..44b46a134 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -13,7 +13,7 @@ from unstructured.documents.elements import DataSourceMetadata, Element from unstructured.file_utils.filetype import detect_filetype, is_json_processable from unstructured.file_utils.model import FileType from unstructured.logger import logger -from unstructured.partition.common import exactly_one +from unstructured.partition.common.common import exactly_one from unstructured.partition.lang import check_language_args from unstructured.partition.utils.constants import PartitionStrategy from unstructured.utils import dependency_exists diff --git a/unstructured/partition/common/__init__.py b/unstructured/partition/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/unstructured/partition/common.py b/unstructured/partition/common/common.py similarity index 84% rename from unstructured/partition/common.py rename to unstructured/partition/common/common.py index dcf49c66d..ac83b1be7 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common/common.py @@ -1,9 +1,7 @@ from __future__ import annotations import numbers -import os import subprocess -from datetime import datetime from io import BufferedReader, BytesIO, TextIOWrapper from tempfile import SpooledTemporaryFile from time import sleep @@ -41,76 +39,6 @@ if 
TYPE_CHECKING: from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement -HIERARCHY_RULE_SET = { - "Title": [ - "Text", - "UncategorizedText", - "NarrativeText", - "ListItem", - "BulletedText", - "Table", - "FigureCaption", - "CheckBox", - "Table", - ], - "Header": [ - "Title", - "Text", - "UncategorizedText", - "NarrativeText", - "ListItem", - "BulletedText", - "Table", - "FigureCaption", - "CheckBox", - "Table", - ], -} - - -def get_last_modified( - filename: str | None, file: IO[bytes] | None, date_from_file_object: bool -) -> str | None: - """Determine best available last-modified date from file or filename.""" - if filename is not None: - return get_last_modified_date(filename) - - if file is not None: - return get_last_modified_date_from_file(file) if date_from_file_object else None - - return None - - -def get_last_modified_date(filename: str) -> Optional[str]: - """Modification time of file at path `filename`, if it exists. - - Returns `None` when `filename` is not a path to a file on the local filesystem. - - Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like - "2024-03-05T17:02:53". - """ - if not os.path.isfile(filename): - return None - - modify_date = datetime.fromtimestamp(os.path.getmtime(filename)) - return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z") - - -def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]: - """Modified timestamp of `file` if it corresponds to a file on the local filesystem.""" - # -- a file-like object will have a name attribute if created by `open()` or if a name is - # -- assigned to it for metadata purposes. Use "" as default because the empty string is never - # -- a path to an actual file. - filename = str(getattr(file, "name", "")) - - # -- there's no guarantee the path corresponds to an actual file on the filesystem. 
In - # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to - # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP. - if not os.path.isfile(filename): - return None - - return get_last_modified_date(filename) - def normalize_layout_element( layout_element: LayoutElement | Element | dict[str, Any], @@ -230,54 +158,6 @@ def layout_list_to_list_items( return list_items -def set_element_hierarchy( - elements: list[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET -) -> list[Element]: - """Sets the parent_id for each element in the list of elements - based on the element's category, depth and a ruleset - - """ - stack: list[Element] = [] - for element in elements: - if element.metadata.parent_id is not None: - continue - parent_id = None - element_category = getattr(element, "category", None) - element_category_depth = getattr(element.metadata, "category_depth", 0) or 0 - - if not element_category: - continue - - while stack: - top_element: Element = stack[-1] - top_element_category = getattr(top_element, "category") - top_element_category_depth = ( - getattr( - top_element.metadata, - "category_depth", - 0, - ) - or 0 - ) - - if ( - top_element_category == element_category - and top_element_category_depth < element_category_depth - ) or ( - top_element_category != element_category - and element_category in ruleset.get(top_element_category, []) - ): - parent_id = top_element.id - break - - stack.pop() - - element.metadata.parent_id = parent_id - stack.append(element) - - return elements - - def add_element_metadata( element: Element, filename: Optional[str] = None, @@ -580,7 +460,7 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: # FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in # unstructured.documents.html, which imports this module so we can't import the class for type # hints. 
Moreover, those two types of documents have different lists of attributes -# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by +# UPDATE(scanny): HTMLDocument no longer exists, so this function can be optimized for use by # DocumentLayout only. def document_to_element_list( document: DocumentLayout, diff --git a/unstructured/partition/common/metadata.py b/unstructured/partition/common/metadata.py new file mode 100644 index 000000000..ef479cdf2 --- /dev/null +++ b/unstructured/partition/common/metadata.py @@ -0,0 +1,128 @@ +"""Helpers used across multiple partitioners to compute metadata.""" + +from __future__ import annotations + +import datetime as dt +import os +from typing import IO, Optional, Sequence + +from unstructured.documents.elements import Element + + +def get_last_modified( + filename: str | None, file: IO[bytes] | None, date_from_file_object: bool +) -> str | None: + """Determine best available last-modified date from file or filename.""" + if filename is not None: + return get_last_modified_date(filename) + + if file is not None: + return get_last_modified_date_from_file(file) if date_from_file_object else None + + return None + + +def get_last_modified_date(filename: str) -> Optional[str]: + """Modification time of file at path `filename`, if it exists. + + Returns `None` when `filename` is not a path to a file on the local filesystem. + + Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like + "2024-03-05T17:02:53". 
+ """ + if not os.path.isfile(filename): + return None + + modify_date = dt.datetime.fromtimestamp(os.path.getmtime(filename)) + return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z") + + +def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]: + """Modified timestamp of `file` if it corresponds to a file on the local filesystem.""" + # -- a file-like object will have a name attribute if created by `open()` or if a name is + # -- assigned to it for metadata purposes. Use "" as default because the empty string is never + # -- a path to an actual file. + filename = str(getattr(file, "name", "")) + + # -- there's no guarantee the path corresponds to an actual file on the filesystem. In + # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to + # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP. + if not os.path.isfile(filename): + return None + + return get_last_modified_date(filename) + + +HIERARCHY_RULE_SET = { + "Title": [ + "Text", + "UncategorizedText", + "NarrativeText", + "ListItem", + "BulletedText", + "Table", + "FigureCaption", + "CheckBox", + "Table", + ], + "Header": [ + "Title", + "Text", + "UncategorizedText", + "NarrativeText", + "ListItem", + "BulletedText", + "Table", + "FigureCaption", + "CheckBox", + "Table", + ], +} + + +def set_element_hierarchy( + elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET +) -> list[Element]: + """Sets the parent_id for each element in the list of elements + based on the element's category, depth and a ruleset + + """ + stack: list[Element] = [] + for element in elements: + if element.metadata.parent_id is not None: + continue + parent_id = None + element_category = getattr(element, "category", None) + element_category_depth = getattr(element.metadata, "category_depth", 0) or 0 + + if not element_category: + continue + + while stack: + top_element: Element = stack[-1] + top_element_category = 
getattr(top_element, "category") + top_element_category_depth = ( + getattr( + top_element.metadata, + "category_depth", + 0, + ) + or 0 + ) + + if ( + top_element_category == element_category + and top_element_category_depth < element_category_depth + ) or ( + top_element_category != element_category + and element_category in ruleset.get(top_element_category, []) + ): + parent_id = top_element.id + break + + stack.pop() + + element.metadata.parent_id = parent_id + stack.append(element) + + return list(elements) diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 9d250505a..15c792f81 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -16,7 +16,10 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file +from unstructured.partition.common.metadata import ( + get_last_modified_date, + get_last_modified_date_from_file, +) from unstructured.partition.lang import apply_lang_metadata from unstructured.utils import is_temp_file_path, lazyproperty diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index 23f5afb48..3cbece6d8 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -8,11 +8,8 @@ from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( - convert_office_doc, - exactly_one, - get_last_modified, -) +from unstructured.partition.common.common import convert_office_doc, exactly_one +from unstructured.partition.common.metadata import get_last_modified from unstructured.partition.docx import partition_docx diff 
--git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 6e0fa1b24..67297de98 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -46,7 +46,7 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 76a8729fc..83e0a13cd 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -46,9 +46,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE -from unstructured.partition.common import ( - convert_to_bytes, - exactly_one, +from unstructured.partition.common.common import convert_to_bytes, exactly_one +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index f8ecfb2c1..7e8b2695b 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import exactly_one, get_last_modified +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import get_last_modified from unstructured.partition.html import partition_html 
DETECTION_ORIGIN: str = "epub" diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index 92da768c9..843fefca2 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -14,7 +14,10 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file +from unstructured.partition.common.metadata import ( + get_last_modified_date, + get_last_modified_date_from_file, +) from unstructured.partition.html.parser import Flow, html_parser from unstructured.partition.lang import apply_lang_metadata from unstructured.utils import is_temp_file_path, lazyproperty diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index a9a9ea963..c970c51eb 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -5,7 +5,7 @@ from typing import IO, Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata -from unstructured.partition.common import exactly_one +from unstructured.partition.common.common import exactly_one from unstructured.partition.lang import check_language_args from unstructured.partition.pdf import partition_pdf_or_image from unstructured.partition.utils.constants import PartitionStrategy diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index c4f2592a8..caaad8580 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -19,8 +19,8 @@ from unstructured.file_utils.filetype import ( add_metadata_with_filetype, is_json_processable, ) -from 
unstructured.partition.common import ( - exactly_one, +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py index 92fe362f6..259d2674d 100644 --- a/unstructured/partition/md.py +++ b/unstructured/partition/md.py @@ -9,8 +9,8 @@ from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( - exactly_one, +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 4c9daa89c..f0ee95524 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -14,7 +14,7 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.logger import logger -from unstructured.partition.common import ( +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index 99e6ec9ec..0f96a01ba 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -8,7 +8,8 @@ from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import 
exactly_one, get_last_modified +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import get_last_modified from unstructured.partition.docx import partition_docx from unstructured.utils import requires_dependencies diff --git a/unstructured/partition/org.py b/unstructured/partition/org.py index 797c73d24..765fe6bd5 100644 --- a/unstructured/partition/org.py +++ b/unstructured/partition/org.py @@ -7,7 +7,8 @@ from unstructured.documents.elements import Element from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import exactly_one, get_last_modified +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import get_last_modified from unstructured.partition.html import partition_html DETECTION_ORIGIN: str = "org" diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 3dad1f996..fd84ed24e 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -42,7 +42,7 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN -from unstructured.partition.common import ( +from unstructured.partition.common.common import ( document_to_element_list, exactly_one, ocr_data_to_elements, diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index edc4122c8..76b8aa764 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -17,16 +17,16 @@ from PIL import Image from unstructured.documents.elements import ElementType from 
unstructured.logger import logger -from unstructured.partition.common import ( - convert_to_bytes, - exactly_one, +from unstructured.partition.common.common import convert_to_bytes, exactly_one +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) from unstructured.partition.utils.config import env_config if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion + from unstructured_inference.inference.elements import TextRegion + from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement from unstructured.documents.elements import Element diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index 972c1d5fc..d60d5fc3b 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -8,9 +8,8 @@ from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( - convert_office_doc, - exactly_one, +from unstructured.partition.common.common import convert_office_doc, exactly_one +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 06e617a13..fc53347fd 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -36,8 +36,8 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( - convert_ms_office_table_to_text, +from unstructured.partition.common.common import 
convert_ms_office_table_to_text +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py index 30105ef4a..b5d00912d 100644 --- a/unstructured/partition/rst.py +++ b/unstructured/partition/rst.py @@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import exactly_one, get_last_modified +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import get_last_modified from unstructured.partition.html import partition_html DETECTION_ORIGIN: str = "rst" diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py index ac5cf00e4..6833a2001 100644 --- a/unstructured/partition/rtf.py +++ b/unstructured/partition/rtf.py @@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import exactly_one, get_last_modified +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import get_last_modified from unstructured.partition.html import partition_html DETECTION_ORIGIN: str = "rtf" diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index cc648384a..8e3ac88a4 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -29,8 +29,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model 
import FileType from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE from unstructured.nlp.tokenize import sent_tokenize -from unstructured.partition.common import ( - exactly_one, +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, ) diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 04368b96a..364478072 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -14,11 +14,13 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( +from unstructured.partition.common.common import ( exactly_one, + spooled_to_bytes_io_if_needed, +) +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, - spooled_to_bytes_io_if_needed, ) from unstructured.partition.lang import apply_lang_metadata diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 27b977321..9c7f7854d 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -26,7 +26,10 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file +from unstructured.partition.common.metadata import ( + get_last_modified_date, + get_last_modified_date_from_file, +) from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( is_bulleted_text, diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index 91e79c575..50c9d4e30 100644 --- a/unstructured/partition/xml.py +++ 
b/unstructured/partition/xml.py @@ -16,11 +16,13 @@ from unstructured.documents.elements import ( from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( +from unstructured.partition.common.common import ( exactly_one, + spooled_to_bytes_io_if_needed, +) +from unstructured.partition.common.metadata import ( get_last_modified_date, get_last_modified_date_from_file, - spooled_to_bytes_io_if_needed, ) from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text import element_from_text diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 429195f68..592f95948 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -16,7 +16,7 @@ from unstructured.documents.elements import ( Element, ElementMetadata, ) -from unstructured.partition.common import exactly_one +from unstructured.partition.common.common import exactly_one from unstructured.utils import Point, dependency_exists, requires_dependencies if dependency_exists("pandas"):