Steve Canny 3bab9d93e6
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655)
**Summary**
In preparation for pluggable auto-partitioners simplify metadata as
discussed.

**Additional Context**
- Pluggable auto-partitioners requires partitioners to have a consistent
call signature. An arbitrary partitioner provided at runtime needs to
have a call signature that is known and consistent. Basically
`partition_x(filename, *, file, **kwargs)`.
- The current `auto.partition()` is highly coupled to each distinct
file-type partitioner, deciding which arguments to forward to each.
- This is driven by the existence of "delegating" partitioners, those
that convert their file-type and then call a second partitioner to do
the actual partitioning. Both the delegating and proxy partitioners are
decorated with metadata-post-processing decorators and those decorators
are not idempotent. We call the situation where those decorators would
run twice "double-decorating". For example, EPUB converts to HTML and
calls `partition_html()` and both `partition_epub()` and
`partition_html()` are decorated.
- The way double-decorating has been avoided in the past is to avoid
sending the arguments the metadata decorators are sensitive to to the
proxy partitioner. This is very obscure, complex to reason about,
error-prone, and just overall not a viable strategy. The better solution
is to not decorate delegating partitioners and let the proxy partitioner
handle all the metadata.
- This first step in preparation for that is part of simplifying the
metadata processing by removing unused or unwanted legacy parameters.
- `date_from_file_object` is a misnomer because a file-object never
contains last-modified data.
- It can never produce useful results in the API where last-modified
information must be provided by `metadata_last_modified`.
- It is an undocumented parameter so not in use.
- Using it can produce incorrect metadata.
2024-09-23 22:23:10 +00:00

169 lines
6.4 KiB
Python

"""Test-suite for `unstructured.partition.ppt` module."""
from __future__ import annotations
import pytest
from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Title
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
EXPECTED_PPT_OUTPUT = [
Title(text="Adding a Bullet Slide"),
ListItem(text="Find the bullet slide layout"),
ListItem(text="Use _TextFrame.text for first bullet"),
ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
NarrativeText(text="Here is a lot of text!"),
NarrativeText(text="Here is some text in a text box!"),
]
def test_partition_ppt_from_filename():
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
assert elements == EXPECTED_PPT_OUTPUT
for element in elements:
assert element.metadata.filename == "fake-power-point.ppt"
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in elements} == {"pptx"}
def test_partition_ppt_from_filename_with_metadata_filename():
elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_ppt_raises_with_missing_file():
with pytest.raises(ValueError):
partition_ppt(example_doc_path("doesnt-exist.ppt"))
def test_partition_ppt_from_file():
with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
elements = partition_ppt(file=f)
assert elements == EXPECTED_PPT_OUTPUT
for element in elements:
assert element.metadata.filename is None
def test_partition_ppt_from_file_with_metadata_filename():
with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
elements = partition_ppt(file=f, metadata_filename="test")
assert elements == EXPECTED_PPT_OUTPUT
for element in elements:
assert element.metadata.filename == "test"
def test_partition_ppt_raises_with_both_specified():
filename = example_doc_path("fake-power-point.ppt")
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_ppt(filename=filename, file=f)
def test_partition_ppt_raises_when_neither_file_path_or_file_is_provided():
with pytest.raises(ValueError):
partition_ppt()
def test_partition_ppt_from_filename_exclude_metadata():
filename = example_doc_path("fake-power-point.ppt")
elements = partition_ppt(filename=filename, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_ppt_from_file_exclude_metadata():
filename = example_doc_path("fake-power-point.ppt")
with open(filename, "rb") as f:
elements = partition_ppt(file=f, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_ppt_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture):
filesystem_last_modified = "2024-05-01T15:37:28"
mocker.patch(
"unstructured.partition.ppt.get_last_modified_date", return_value=filesystem_last_modified
)
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
def test_partition_ppt_from_file_gets_last_modified_None():
with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
elements = partition_ppt(file=f)
assert all(e.metadata.last_modified is None for e in elements)
def test_partition_ppt_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
filesystem_last_modified = "2024-05-01T15:37:28"
metadata_last_modified = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.ppt.get_last_modified_date", return_value=filesystem_last_modified
)
elements = partition_ppt(
example_doc_path("fake-power-point.ppt"), metadata_last_modified=metadata_last_modified
)
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
def test_partition_ppt_from_file_prefers_metadata_last_modified():
metadata_last_modified = "2020-07-05T09:24:28"
with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
elements = partition_ppt(file=f, metadata_last_modified=metadata_last_modified)
assert elements[0].metadata.last_modified == metadata_last_modified
# ------------------------------------------------------------------------------------------------
def test_partition_ppt_with_json():
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
assert_round_trips_through_JSON(elements)
def test_add_chunking_strategy_by_title_on_partition_ppt():
file_path = example_doc_path("fake-power-point.ppt")
elements = partition_ppt(file_path)
chunk_elements = partition_ppt(file_path, chunking_strategy="by_title")
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
def test_partition_ppt_params():
"""Integration test of params: languages, include_page_break, and include_slide_notes."""
elements = partition_ppt(
example_doc_path("language-docs/eng_spa_mult.ppt"),
include_page_breaks=True,
include_slide_notes=True,
)
assert elements[0].metadata.languages == ["eng"]
assert any(isinstance(element, PageBreak) for element in elements)
# The example doc contains a slide note with the text "This is a slide note."
assert any(element.text == "This is a slide note." for element in elements)
def test_partition_ppt_respects_detect_language_per_element():
elements = partition_ppt(
example_doc_path("language-docs/eng_spa_mult.ppt"), detect_language_per_element=True
)
langs = [element.metadata.languages for element in elements]
# languages other than English and Spanish are detected by this partitioner,
# so this test is slightly different from the other partition tests
langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}
assert "eng" in langs
assert "spa" in langs