mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-03 06:19:13 +00:00
202 lines
7.8 KiB
Python
202 lines
7.8 KiB
Python
![]() |
"""Test-suite for `unstructured.partition.common.metadata` module."""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import datetime as dt
|
||
|
import io
|
||
|
import os
|
||
|
import pathlib
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from unstructured.documents.elements import (
|
||
|
CheckBox,
|
||
|
ElementMetadata,
|
||
|
FigureCaption,
|
||
|
Header,
|
||
|
ListItem,
|
||
|
NarrativeText,
|
||
|
Text,
|
||
|
Title,
|
||
|
)
|
||
|
from unstructured.partition.common.metadata import (
|
||
|
get_last_modified,
|
||
|
get_last_modified_date,
|
||
|
get_last_modified_date_from_file,
|
||
|
set_element_hierarchy,
|
||
|
)
|
||
|
|
||
|
# ================================================================================================
|
||
|
# LAST-MODIFIED
|
||
|
# ================================================================================================
|
||
|
|
||
|
|
||
|
class Describe_get_last_modified:
    """Isolated unit-tests for `unstructured.partition.common.metadata.get_last_modified()`."""

    def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
        self, file_and_last_modified: tuple[str, str]
    ):
        path, expected = file_and_last_modified

        # -- path given, no file-like object, date-from-file-object flag off --
        assert get_last_modified(str(path), None, False) == expected

    def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
        self, file_and_last_modified: tuple[str, str]
    ):
        path, expected = file_and_last_modified

        # -- no path, only an open file, and the flag explicitly enabled --
        with open(path, "rb") as file:
            actual = get_last_modified(None, file, True)

        assert actual == expected

    def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
        path, _ = file_and_last_modified

        # -- same open file as above, but the flag is off so no date is expected --
        with open(path, "rb") as file:
            actual = get_last_modified(None, file, False)

        assert actual is None

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
        """Path of a temp file paired with the ISO-8601 string of its forced modified time."""
        target = tmp_path / "some_file.txt"
        target.write_text("abcdefg")
        # -- naive datetime, so `.timestamp()` interprets it in the local timezone --
        mtime = dt.datetime(2024, 6, 14, 15, 39, 25).timestamp()
        # -- pin both access and modification times to the known value --
        os.utime(target, (mtime, mtime))
        return str(target), "2024-06-14T15:39:25"
|
||
|
|
||
|
|
||
|
class Describe_get_last_modified_date:
    """Unit-tests for `get_last_modified_date()`, which takes a filesystem path string."""

    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
        target = tmp_path / "some_file.txt"
        target.write_text("abcdefg")
        # -- naive datetime, so `.timestamp()` interprets it in the local timezone --
        mtime = dt.datetime(2024, 3, 5, 17, 43, 40).timestamp()
        # -- force a known modification time so the expected string is deterministic --
        os.utime(target, (mtime, mtime))

        assert get_last_modified_date(str(target)) == "2024-03-05T17:43:40"

    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
        # -- nothing is ever written to this path --
        missing = tmp_path / "some_file_that_does_not_exist.txt"

        assert get_last_modified_date(str(missing)) is None
|
||
|
|
||
|
|
||
|
class Describe_get_last_modified_date_from_file:
    """Unit-tests for `get_last_modified_date_from_file()`, which takes a file-like object."""

    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
        self, tmp_path: pathlib.Path
    ):
        target = tmp_path / "some_file_2.txt"
        target.write_text("abcdefg")
        # -- naive datetime, so `.timestamp()` interprets it in the local timezone --
        mtime = dt.datetime(2024, 3, 5, 20, 48, 26).timestamp()
        # -- force a known modification time so the expected string is deterministic --
        os.utime(target, (mtime, mtime))

        with open(target, "rb") as file:
            actual = get_last_modified_date_from_file(file)

        assert actual == "2024-03-05T20:48:26"

    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
        # -- raw bytes rather than a file-like object --
        assert get_last_modified_date_from_file(b"abcdefg") is None

    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
        stream = io.BytesIO(b"abcdefg")
        # -- guard the premise of this test: a fresh BytesIO carries no `.name` --
        assert not hasattr(stream, "name")

        assert get_last_modified_date_from_file(stream) is None

    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
        self, tmp_path: pathlib.Path
    ):
        stream = io.BytesIO(b"abcdefg")
        # -- give the in-memory stream a `.name` that points at a file that is never created --
        stream.name = str(tmp_path / "a_file_that_isn't_here.txt")

        assert get_last_modified_date_from_file(stream) is None
|
||
|
|
||
|
|
||
|
# ================================================================================================
|
||
|
# ELEMENT HIERARCHY
|
||
|
# ================================================================================================
|
||
|
|
||
|
|
||
|
def test_set_element_hierarchy():
    """`set_element_hierarchy()` assigns the expected `parent_id` to each element.

    Called without a custom ruleset. The trailing index comments on the element list identify
    the elements referenced by the assertions below.
    """
    elements_to_set = [
        Title(text="Title"),  # 0
        NarrativeText(text="NarrativeText"),  # 1
        FigureCaption(text="FigureCaption"),  # 2
        ListItem(text="ListItem"),  # 3
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
        ListItem(text="ListItem"),  # 6
        CheckBox(element_id="some-id-1", checked=True),  # 7
        Title(text="Title 2"),  # 8
        ListItem(text="ListItem"),  # 9
        ListItem(text="ListItem"),  # 10
        Text(text="Text"),  # 11
    ]
    elements = set_element_hierarchy(elements_to_set)

    assert (
        elements[1].metadata.parent_id == elements[0].id
    ), "NarrativeText should be child of Title"
    assert (
        elements[2].metadata.parent_id == elements[0].id
    ), "FigureCaption should be child of Title"
    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # -- elements 4 and 5 carry category_depth=1 and are expected under the ListItem at index 3,
    # -- so the failure message names that ListItem rather than the Title
    assert (
        elements[4].metadata.parent_id == elements[3].id
    ), "nested ListItem should be child of the preceding ListItem"
    assert (
        elements[5].metadata.parent_id == elements[3].id
    ), "nested ListItem should be child of the preceding ListItem"
    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # NOTE(Hubert): moving the category field to Element, caused this to fail.
    # Checkboxes will soon be deprecated, then we can remove the test.
    # assert (
    #     elements[7].metadata.parent_id is None
    # ), "CheckBox should be None, as it's not a Text based element"
    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
|
||
|
|
||
|
|
||
|
def test_set_element_hierarchy_custom_rule_set():
    """`set_element_hierarchy()` assigns the expected `parent_id`s when given a custom `ruleset`."""
    ruleset = {
        "Header": ["Title", "Text"],
        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
    }
    document = [
        Header(text="Header"),  # 0
        Title(text="Title"),  # 1
        NarrativeText(text="NarrativeText"),  # 2
        Text(text="Text"),  # 3
        Title(text="Title 2"),  # 4
        FigureCaption(text="FigureCaption"),  # 5
    ]

    elements = set_element_hierarchy(elements=document, ruleset=ruleset)

    # -- (child index, expected parent index, failure message), checked in document order --
    expected_parents = (
        (1, 0, "Title should be child of Header"),
        (2, 1, "NarrativeText should be child of Title"),
        (3, 1, "Text should be child of Title"),
        (4, 0, "Title 2 should be child of Header"),
        (5, 4, "FigureCaption should be child of Title 2"),
    )
    for child_idx, parent_idx, message in expected_parents:
        assert elements[child_idx].metadata.parent_id == elements[parent_idx].id, message
|