Steve Canny b4ee019170
rfctr: flatten test_unstructured/partition (#3073)
**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitioner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
2024-05-23 00:51:08 +00:00

839 lines
31 KiB
Python

# pyright: reportPrivateUsage=false
"""Test suite for `unstructured.partition.pptx` module."""
from __future__ import annotations
import hashlib
import io
import pathlib
import tempfile
from typing import Any, Iterator
import pptx
import pytest
from pptx.shapes.picture import Picture
from pptx.util import Inches
from pytest_mock import MockFixture
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
Element,
ElementMetadata,
Image,
ListItem,
NarrativeText,
PageBreak,
Text,
Title,
)
from unstructured.partition.pptx import (
PptxPartitionerOptions,
partition_pptx,
register_picture_partitioner,
)
# -- elements `partition_pptx()` is expected to produce for `fake-power-point.pptx` --
EXPECTED_PPTX_OUTPUT = [
    element_cls(text=text)
    for element_cls, text in (
        (Title, "Adding a Bullet Slide"),
        (ListItem, "Find the bullet slide layout"),
        (ListItem, "Use _TextFrame.text for first bullet"),
        (ListItem, "Use _TextFrame.add_paragraph() for subsequent bullets"),
        (NarrativeText, "Here is a lot of text!"),
        (NarrativeText, "Here is some text in a text box!"),
    )
]
# == document file behaviors =====================================================================
def test_partition_pptx_from_filename():
    """Partitioning by path yields the expected elements, each carrying the file's base name."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    elements = partition_pptx(pptx_path)

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename == "fake-power-point.pptx" for e in elements)
def test_partition_pptx_from_filename_with_metadata_filename():
    """A `metadata_filename` argument overrides the filename recorded in element metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    elements = partition_pptx(pptx_path, metadata_filename="test")

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)
def test_partition_pptx_with_spooled_file():
    """The `partition_pptx()` function can handle a `SpooledTemporaryFile`.

    Including one that does not have its read-pointer set to the start.
    """
    with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
        pptx_bytes = test_file.read()

    # -- context manager guarantees the spooled file is closed, even if partitioning raises --
    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(pptx_bytes)
        # -- deliberately no `.seek(0)` here; the read-pointer is left where the write ended --
        elements = partition_pptx(file=spooled_temp_file)

    assert elements == EXPECTED_PPTX_OUTPUT
    for element in elements:
        assert element.metadata.filename is None
def test_partition_pptx_from_file():
    """Partitioning an open binary file yields the expected elements with no filename metadata."""
    with open(example_doc_path("fake-power-point.pptx"), "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream)

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
def test_partition_pptx_from_file_with_metadata_filename():
    """`metadata_filename` supplies the filename metadata when partitioning a file object."""
    with open(example_doc_path("fake-power-point.pptx"), "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream, metadata_filename="test")

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)
def test_partition_pptx_raises_with_neither():
    """Calling `partition_pptx()` with no arguments at all raises `ValueError`."""
    # -- no `filename` and no `file` argument is given --
    with pytest.raises(ValueError):
        partition_pptx()
def test_partition_pptx_recurses_into_group_shapes():
    """Partitioning `group-shapes-nested.pptx` yields the texts "A" through "J" in order."""
    elements = partition_pptx(example_doc_path("group-shapes-nested.pptx"))

    texts = [element.text for element in elements]

    # -- `list("ABCDEFGHIJ")` is ["A", "B", ..., "J"] --
    assert texts == list("ABCDEFGHIJ")
# == page-break behaviors ========================================================================
def test_partition_pptx_adds_page_breaks(tmp_path: pathlib.Path):
    """With default options, a `PageBreak` element appears between the two slides."""
    pptx_path = str(tmp_path / "test-page-breaks.pptx")
    prs = pptx.Presentation()
    layout = prs.slide_layouts[6]
    # -- each slide gets one text-box whose left, top, width and height are all 2 inches --
    two_inches = Inches(2)
    for slide_text in ("This is the first slide.", "This is the second slide."):
        slide = prs.slides.add_slide(layout)
        text_box = slide.shapes.add_textbox(two_inches, two_inches, two_inches, two_inches)
        text_box.text_frame.text = slide_text
    prs.save(pptx_path)

    elements = partition_pptx(filename=pptx_path)

    assert elements == [
        NarrativeText(text="This is the first slide."),
        PageBreak(text=""),
        NarrativeText(text="This is the second slide."),
    ]
    assert all(e.metadata.filename == "test-page-breaks.pptx" for e in elements)
def test_partition_pptx_page_breaks_toggle_off(tmp_path: pathlib.Path):
    """With `include_page_breaks=False`, no `PageBreak` element separates the two slides."""
    pptx_path = str(tmp_path / "test-page-breaks.pptx")
    prs = pptx.Presentation()
    layout = prs.slide_layouts[6]
    # -- each slide gets one text-box whose left, top, width and height are all 2 inches --
    two_inches = Inches(2)
    for slide_text in ("This is the first slide.", "This is the second slide."):
        slide = prs.slides.add_slide(layout)
        text_box = slide.shapes.add_textbox(two_inches, two_inches, two_inches, two_inches)
        text_box.text_frame.text = slide_text
    prs.save(pptx_path)

    elements = partition_pptx(filename=pptx_path, include_page_breaks=False)

    assert elements == [
        NarrativeText(text="This is the first slide."),
        NarrativeText(text="This is the second slide."),
    ]
    assert all(e.metadata.filename == "test-page-breaks.pptx" for e in elements)
def test_partition_pptx_many_pages():
    """Elements of the many-pages deck carry page numbers 1 and 2 and the source filename."""
    elements = partition_pptx(example_doc_path("fake-power-point-many-pages.pptx"))

    # -- falsy page-numbers are skipped; the page_number of PageBreak is None --
    page_numbers = {e.metadata.page_number for e in elements if e.metadata.page_number}

    assert page_numbers == {1, 2}
    assert all(e.metadata.filename == "fake-power-point-many-pages.pptx" for e in elements)
# == miscellaneous behaviors =====================================================================
def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
    """Text-boxes added out of order come back in the order asserted at the bottom of this test.

    The box placed at negative offsets and the box with empty text are expected to be absent.
    """
    pptx_path = str(tmp_path / "test-ordering.pptx")
    prs = pptx.Presentation()
    slide = prs.slides.add_slide(prs.slide_layouts[6])

    # -- (left, top, width, height, text) for each text-box, in insertion order --
    text_box_specs = [
        (Inches(2), Inches(2), Inches(2), Inches(2), "This is lower and should come second"),
        (
            Inches(-10),
            Inches(-10),
            Inches(1),
            Inches(1),
            "This is off the page and shouldn't appear",
        ),
        (Inches(2), Inches(2), Inches(2), Inches(2), ""),
        (Inches(1), Inches(1), Inches(1), Inches(1), "This is higher and should come first"),
        (Inches(0.5), Inches(1), Inches(1), Inches(1), "-------------TOP-------------"),
    ]
    for left, top, width, height, text in text_box_specs:
        text_box = slide.shapes.add_textbox(left, top, width, height)
        text_box.text_frame.text = text
    prs.save(pptx_path)

    elements = partition_pptx(filename=pptx_path)

    assert elements == [
        Text("-------------TOP-------------"),
        NarrativeText("This is higher and should come first"),
        NarrativeText("This is lower and should come second"),
    ]
    assert all(e.metadata.filename == "test-ordering.pptx" for e in elements)
def test_partition_pptx_grabs_tables():
    """The element at index 1 carries the table text and its HTML rendering in `text_as_html`."""
    expected_html = (
        "<table>\n"
        "<thead>\n"
        "<tr><th>Column 1 </th><th>Column 2 </th><th>Column 3 </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>Red </td><td>Green </td><td>Blue </td></tr>\n"
        "<tr><td>Purple </td><td>Orange </td><td>Yellow </td></tr>\n"
        "<tr><td>Tangerine </td><td>Pink </td><td>Aqua </td></tr>\n"
        "</tbody>\n"
        "</table>"
    )

    elements = partition_pptx(example_doc_path("fake-power-point-table.pptx"))

    table = elements[1]
    assert table.text.startswith("Column 1")
    assert table.text.strip().endswith("Aqua")
    assert table.metadata.text_as_html == expected_html
    assert table.metadata.filename == "fake-power-point-table.pptx"
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_pptx_infer_table_structure(infer_table_structure: bool):
    """`text_as_html` is populated on the element at index 1 only when `infer_table_structure`."""
    pptx_path = example_doc_path("fake-power-point-table.pptx")

    elements = partition_pptx(pptx_path, infer_table_structure=infer_table_structure)

    # -- a missing attribute and a `None` value both count as "no HTML" --
    table_metadata = elements[1].metadata
    has_text_as_html = getattr(table_metadata, "text_as_html", None) is not None
    assert has_text_as_html == infer_table_structure
def test_partition_pptx_malformed():
    """The "malformed" example deck still partitions, and its first two texts are as expected."""
    elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))

    first, second = elements[0], elements[1]
    assert first.text == "Problem Date Placeholder"
    assert second.text == "Test Slide"
    assert all(e.metadata.filename == "fake-power-point-malformed.pptx" for e in elements)
# == image sub-partitioning behaviors ============================================================
def test_partition_pptx_generates_no_Image_elements_by_default():
    """Partitioning `picture.pptx` produces no elements at all."""
    elements = partition_pptx(example_doc_path("picture.pptx"))

    assert elements == []
def test_partition_pptx_uses_registered_picture_partitioner():
    """A picture partitioner passed to `register_picture_partitioner()` is used for pictures."""

    class FakePicturePartitioner:
        """Stand-in that emits one `Image` describing the picture blob and the strategy."""

        @classmethod
        def iter_elements(cls, picture: Picture, opts: PptxPartitionerOptions) -> Iterator[Element]:
            blob_sha1 = hashlib.sha1(picture.image.blob).hexdigest()
            yield Image(f"Image with hash {blob_sha1}, strategy: {opts.strategy}")

    # NOTE(review): this registration is never undone in this test; presumably it persists for
    # later tests in the same process -- confirm, and add cleanup if so.
    register_picture_partitioner(FakePicturePartitioner)

    elements = partition_pptx(example_doc_path("picture.pptx"))

    assert len(elements) == 1
    (image,) = elements
    assert type(image) is Image
    assert image.text == "Image with hash b0a1e6cf904691e6fa42bd9e72acc2b05280dc86, strategy: fast"
# == metadata behaviors ==========================================================================
def test_partition_pptx_metadata_date(mocker: MockFixture):
    """Partitioning by path takes `last_modified` from `get_last_modified_date()`."""
    filesystem_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date", return_value=filesystem_date
    )

    elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))

    assert elements[0].metadata.last_modified == filesystem_date
def test_partition_pptx_with_custom_metadata_date(mocker: MockFixture):
    """A caller-supplied `metadata_last_modified` wins over the date from the filesystem."""
    caller_date = "2024-04-03T20:16:03"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date", return_value="2022-11-22T11:22:33"
    )
    pptx_path = example_doc_path("fake-power-point-malformed.pptx")

    elements = partition_pptx(pptx_path, metadata_last_modified=caller_date)

    assert elements[0].metadata.last_modified == caller_date
def test_partition_pptx_from_file_metadata_date(mocker: MockFixture):
    """Without `date_from_file_object`, a file-object source gives `last_modified` of None."""
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date_from_file",
        return_value="2029-07-05T09:24:28",
    )

    with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream)

    # -- the patched date is not used --
    assert elements[0].metadata.last_modified is None
def test_partition_pptx_from_file_explicit_get_metadata_date(mocker: MockFixture):
    """With `date_from_file_object=True`, `last_modified` is read from the file object."""
    file_object_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date_from_file",
        return_value=file_object_date,
    )

    with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream, date_from_file_object=True)

    assert elements[0].metadata.last_modified == file_object_date
def test_partition_pptx_from_file_with_custom_metadata_date(mocker: MockFixture):
    """A caller-supplied `metadata_last_modified` is used when partitioning a file object."""
    caller_date = "2024-04-03T20:16:03"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date_from_file",
        return_value="2022-11-22T11:22:33",
    )

    with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream, metadata_last_modified=caller_date)

    assert elements[0].metadata.last_modified == caller_date
def test_partition_pptx_from_file_without_metadata_date():
    """Test `partition_pptx()` with a file from which no last-modified date can be obtained."""
    with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
        pptx_bytes = f.read()

    # -- context manager guarantees the spooled file is closed, even if partitioning raises --
    with tempfile.SpooledTemporaryFile() as sf:
        sf.write(pptx_bytes)
        sf.seek(0)
        elements = partition_pptx(file=sf, date_from_file_object=True)

    assert elements[0].metadata.last_modified is None
def test_partition_pptx_element_metadata_has_languages():
    """The first element of `fake-power-point.pptx` reports `["eng"]` as its languages."""
    first_element, *_ = partition_pptx(example_doc_path("fake-power-point.pptx"))

    assert first_element.metadata.languages == ["eng"]
def test_partition_pptx_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, both "eng" and "spa" show up across elements."""
    elements = partition_pptx(
        example_doc_path("language-docs/eng_spa_mult.pptx"), detect_language_per_element=True
    )

    # languages other than English and Spanish are detected by this partitioner,
    # so this test is slightly different from the other partition tests
    # -- only the first language of each element that has any languages is considered --
    langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}

    assert "eng" in langs
    assert "spa" in langs
def test_partition_pptx_raises_TypeError_for_invalid_languages():
    """Passing a bare `str` for `languages` raises `TypeError`."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    with pytest.raises(TypeError):
        partition_pptx(pptx_path, languages="eng")  # type: ignore
# == downstream behaviors ========================================================================
def test_partition_pptx_with_json():
    """Elements produced by `partition_pptx()` pass the JSON round-trip assertion helper."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    assert_round_trips_through_JSON(partition_pptx(pptx_path))
def test_add_chunking_strategy_by_title_on_partition_pptx():
    """`chunking_strategy="by_title"` matches calling `chunk_by_title()` on unchunked output."""
    pptx_path = example_doc_path("science-exploration-1p.pptx")

    unchunked = partition_pptx(filename=pptx_path)
    chunked = partition_pptx(pptx_path, chunking_strategy="by_title")
    expected_chunks = chunk_by_title(unchunked)

    # -- chunking must actually change the output, and must agree with the direct call --
    assert chunked != unchunked
    assert chunked == expected_chunks
def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path):
    """Paragraphs of a slide's title shape are categorized as `Title` elements."""
    title_text = "This is a title, it's a bit long so we can make sure it's not narrative text"
    pptx_path = str(tmp_path / "test-title-shape.pptx")

    # -- build a presentation whose only slide has a two-paragraph title shape --
    presentation = pptx.Presentation()
    slide = presentation.slides.add_slide(presentation.slide_layouts[0])
    title_shape = slide.shapes.title
    assert title_shape is not None
    title_shape.text = title_text
    subtitle_paragraph = title_shape.text_frame.add_paragraph()
    subtitle_paragraph.text = "this is a subtitle"
    presentation.save(pptx_path)

    elements = partition_pptx(pptx_path)
    title, subtitle = elements[0], elements[1]

    # -- first element is the title paragraph, at depth 0 --
    assert isinstance(title, Title)
    assert title.text == title_text
    assert title.metadata.category_depth == 0
    # -- second element is the subtitle paragraph, also a `Title`, at depth 1 --
    assert isinstance(subtitle, Title)
    assert subtitle.text == "this is a subtitle"
    assert subtitle.metadata.category_depth == 1
def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
    """This tests if the level attribute of a paragraph is correctly set as the category depth"""
    filename = str(tmp_path / "test-category-depth.pptx")

    # -- build a slide from layout 1 with a title and a three-paragraph body placeholder --
    prs = pptx.Presentation()
    slide_layout = prs.slide_layouts[1]
    slide = prs.slides.add_slide(slide_layout)
    shapes = slide.shapes

    title_shape = shapes.title
    assert title_shape is not None
    title_shape.text = (
        "This is a title, it's a bit long so we can make sure it's not narrative text"
    )

    body_shape = shapes.placeholders[1]
    tf = body_shape.text_frame
    tf.text = "this is the root level bullet"

    p = tf.add_paragraph()
    p.text = "this is the level 1 bullet"
    p.level = 1

    p = tf.add_paragraph()
    p.text = "this is the level 2 bullet"
    p.level = 2

    prs.save(filename)

    # -- partition the presentation; every element is checked below --
    elements = partition_pptx(filename)

    # NOTE(newelh) - python_pptx does not create full bullet xml, so unstructured will
    #                not detect the paragraphs as bullets. This is fine for now, as
    #                the level attribute is still set correctly, and what we're testing here
    # -- each case is (expected category_depth, expected element type, expected text) --
    test_cases = [
        (0, Title, "This is a title, it's a bit long so we can make sure it's not narrative text"),
        (0, NarrativeText, "this is the root level bullet"),
        (1, NarrativeText, "this is the level 1 bullet"),
        (2, NarrativeText, "this is the level 2 bullet"),
    ]

    for element, test_case in zip(elements, test_cases):
        assert element.text == test_case[2], f"expected {test_case[2]}, got {element.text}"
        assert isinstance(
            element,
            test_case[1],
        ), f"expected {test_case[1]}, got {type(element).__name__} for {element.text}"
        assert (
            element.metadata.category_depth == test_case[0]
        ), f"expected {test_case[0]}, got {element.metadata.category_depth} for {element.text}"
def test_partition_pptx_hierarchy_sample_document():
    """Depth, parent-id and id of the sample presentation's elements match recorded values."""
    # -- each row is (expected category_depth, expected parent_id, expected element id) --
    expected_hierarchy = [
        (0, None, "b2859226ba1f9243fb3f1b2ace889f43"),
        (1, "b2859226ba1f9243fb3f1b2ace889f43", "d13f8827e94541c8b818b0df8f942526"),
        (None, None, "1ffd3151819e594553e6b540e19e6c36"),
        (0, None, "e535f799d1f0e79d6777efa873a16ce1"),
        (0, "e535f799d1f0e79d6777efa873a16ce1", "f02bbfb417ad60daa2ba35080e96262f"),
        (0, "e535f799d1f0e79d6777efa873a16ce1", "414dfce72ea53cd4649176af0d62a4c1"),
        (1, "414dfce72ea53cd4649176af0d62a4c1", "3d45a95c79473a07db4edca5534a7c49"),
        (1, "414dfce72ea53cd4649176af0d62a4c1", "a33333f527851f700ca175acd04b8a2c"),
        (2, "a33333f527851f700ca175acd04b8a2c", "6f1b87689e4da2b0fb865bc5f92d5702"),
        (0, "e535f799d1f0e79d6777efa873a16ce1", "3f58e0be3b8e8b15cba7adc4eae68586"),
        (None, None, "1ffd3151819e594553e6b540e19e6c36"),
        (0, None, "8319096532fe2e55f66c491ea8313150"),
        (0, "8319096532fe2e55f66c491ea8313150", "17a7e78277ab131a627cb4538bab7390"),
        (0, "8319096532fe2e55f66c491ea8313150", "41a9e1d0390f4edd77181142ceae51bc"),
        (1, "41a9e1d0390f4edd77181142ceae51bc", "cbbc78ef38a035fd66f7b030dcf12f66"),
        (1, "41a9e1d0390f4edd77181142ceae51bc", "2a551e3cbe67561debe0da262a294f24"),
        (2, "2a551e3cbe67561debe0da262a294f24", "7a121a056eedb11ac8804d6fd17afc0c"),
        (0, "8319096532fe2e55f66c491ea8313150", "a24a3caf9853702cb73daae23020b7b4"),
        (0, "8319096532fe2e55f66c491ea8313150", "18367f334b5c8c4602ea413ab68ac35b"),
        (0, "8319096532fe2e55f66c491ea8313150", "7f647b1f0f20c3db40c36ab57d9a5550"),
        (1, "7f647b1f0f20c3db40c36ab57d9a5550", "591c24b41b53aba873188a0881d10961"),
        (1, "7f647b1f0f20c3db40c36ab57d9a5550", "6ec455f5f19782facf184886876c9a66"),
        (2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
        (0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
        (None, None, "1ffd3151819e594553e6b540e19e6c36"),
        (None, None, "2ed3bd10daace79ac129cbf8faf22bfc"),
        (0, None, "fd08cacbaddafee5cbacc02528536ee5"),
    ]

    elements = partition_pptx(example_doc_path("sample-presentation.pptx"))

    # -- `zip()` stops at the shorter sequence, so only paired-up elements are checked --
    for element, (depth, parent_id, element_id) in zip(elements, expected_hierarchy):
        assert element.metadata.category_depth == depth
        assert element.metadata.parent_id == parent_id
        assert element.id == element_id
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
# These test components used by `partition_pptx()` in isolation such that all edge cases can be
# exercised.
# ================================================================================================
class DescribePptxPartitionerOptions:
    """Unit-test suite for `unstructured.partition.pptx.PptxPartitionerOptions` objects."""

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.include_page_breaks is arg_value

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_partition_content_found_in_slide_notes(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_slide_notes"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.include_slide_notes is arg_value

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_include_text_as_html_in_Table_metadata(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["infer_table_structure"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.infer_table_structure is arg_value

    # -- .increment_page_number() ----------------

    def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        page_break_iter = opts.increment_page_number()

        # -- exactly one PageBreak is produced, then the iterator is exhausted --
        assert isinstance(next(page_break_iter, None), PageBreak)
        assert opts.page_number == 2
        with pytest.raises(StopIteration):
            next(page_break_iter)

    def but_it_does_not_generate_a_PageBreak_element_for_the_first_slide(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)

        page_break_iter = opts.increment_page_number()

        with pytest.raises(StopIteration):
            next(page_break_iter)
        assert opts.page_number == 1

    def and_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
        self, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = False
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        page_break_iter = opts.increment_page_number()

        with pytest.raises(StopIteration):
            next(page_break_iter)
        assert opts.page_number == 2

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.last_modified == "2024-03-05T17:02:53"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        opts_args["file_path"] = "a/b/spreadsheet.pptx"
        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.pptx")
        assert last_modified == "2024-04-02T20:32:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = True
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_called_once_with(file)
        assert last_modified == "2024-04-02T20:42:07"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = False
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_not_called()
        assert last_modified is None

    # -- .metadata_file_path ---------------------

    def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "x/y/z.pptx"
        opts_args["metadata_file_path"] = "a/b/c.pptx"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == "a/b/c.pptx"

    @pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
    def and_it_falls_back_to_the_document_file_path_otherwise(
        self, file_path: str | None, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = file_path
        opts_args["metadata_file_path"] = None
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == file_path

    # -- .page_number ----------------------------

    def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
        """In PPTX, page-number is the slide number."""
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.page_number == 0
        list(opts.increment_page_number())
        assert opts.page_number == 1
        list(opts.increment_page_number())
        assert opts.page_number == 2

    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args, starting_page_number=3)
        # -- move to the "first" slide --
        list(opts.increment_page_number())

        table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
        text_metadata = opts.text_metadata()

        assert isinstance(table_metadata, ElementMetadata)
        assert isinstance(text_metadata, ElementMetadata)
        assert text_metadata.page_number == 3
        assert table_metadata.page_number == 3

    # -- .pptx_file ------------------------------

    def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "l/m/n.pptx"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.pptx_file == "l/m/n.pptx"

    def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
        self, opts_args: dict[str, Any]
    ):
        # -- context manager ensures the spooled file is closed when the test ends --
        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(b"abcdefg")
            opts_args["file"] = spooled_temp_file
            opts = PptxPartitionerOptions(**opts_args)

            pptx_file = opts.pptx_file

            assert pptx_file is not spooled_temp_file
            assert isinstance(pptx_file, io.BytesIO)
            assert pptx_file.getvalue() == b"abcdefg"

    def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
        self, opts_args: dict[str, Any]
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts = PptxPartitionerOptions(**opts_args)

        pptx_file = opts.pptx_file

        assert pptx_file is file
        assert isinstance(pptx_file, io.BytesIO)
        assert pptx_file.getvalue() == b"abcdefg"

    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)

        # -- the bare attribute access is what is expected to raise --
        with pytest.raises(ValueError, match="No PPTX document specified, either `filename` or "):
            opts.pptx_file

    # -- .strategy -------------------------------

    @pytest.mark.parametrize("arg_value", ["fast", "hi_res"])
    def it_knows_which_partitioning_strategy_to_use(
        self, arg_value: str, opts_args: dict[str, Any]
    ):
        opts_args["strategy"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.strategy == arg_value

    # -- .table_metadata -------------------------

    def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
        opts_args["metadata_file_path"] = "d/e/f.pptx"
        opts_args["metadata_last_modified"] = "2024-04-02T19:51:55"
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        metadata = opts.table_metadata(text_as_html="<table><tr/></table>")

        assert isinstance(metadata, ElementMetadata)
        assert metadata.filename == "f.pptx"
        assert metadata.last_modified == "2024-04-02T19:51:55"
        assert metadata.page_number == 1
        assert metadata.text_as_html == "<table><tr/></table>"

    # -- .text_metadata --------------------------

    def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
        opts_args["metadata_file_path"] = "d/e/f.pptx"
        opts_args["metadata_last_modified"] = "2024-04-02T19:56:40"
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        metadata = opts.text_metadata(category_depth=2)

        assert isinstance(metadata, ElementMetadata)
        assert metadata.filename == "f.pptx"
        assert metadata.last_modified == "2024-04-02T19:56:40"
        assert metadata.page_number == 1
        assert metadata.category_depth == 2

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest):
        """Mock replacing `get_last_modified_date` as referenced by the pptx partitioner module."""
        return function_mock(request, "unstructured.partition.pptx.get_last_modified_date")

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest):
        """Mock replacing `get_last_modified_date_from_file` in the pptx partitioner module."""
        return function_mock(
            request, "unstructured.partition.pptx.get_last_modified_date_from_file"
        )

    @pytest.fixture()
    def opts_args(self) -> dict[str, Any]:
        """All default arguments for `PptxPartitionerOptions`.

        Individual argument values can be changed to suit each test. Makes construction of opts more
        compact for testing purposes.
        """
        return {
            "date_from_file_object": False,
            "file": None,
            "file_path": None,
            "include_page_breaks": True,
            "include_slide_notes": False,
            "infer_table_structure": True,
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "strategy": "fast",
        }