unstructured/test_unstructured/partition/test_pptx.py

# pyright: reportPrivateUsage=false

"""Test suite for `unstructured.partition.pptx` module."""

from __future__ import annotations

import hashlib
import io
import pathlib
import tempfile
from typing import Any, Iterator

import pptx
import pytest
from pptx.shapes.picture import Picture
from pptx.util import Inches
from pytest_mock import MockFixture

from test_unstructured.unit_utils import (
    FixtureRequest,
    Mock,
    assert_round_trips_through_JSON,
    example_doc_path,
    function_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
    Element,
    ElementMetadata,
    Image,
    ListItem,
    NarrativeText,
    PageBreak,
    Text,
    Title,
)
from unstructured.partition.pptx import (
    PptxPartitionerOptions,
    partition_pptx,
    register_picture_partitioner,
)

# -- elements the tests below expect from partitioning `fake-power-point.pptx`, in order --
EXPECTED_PPTX_OUTPUT = [
    Title(text="Adding a Bullet Slide"),
    ListItem(text="Find the bullet slide layout"),
    ListItem(text="Use _TextFrame.text for first bullet"),
    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText(text="Here is a lot of text!"),
    NarrativeText(text="Here is some text in a text box!"),
]


# == document file behaviors =====================================================================


def test_partition_pptx_from_filename():
    """Partitioning by path yields the expected elements, each tagged with the file's name."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    elements = partition_pptx(pptx_path)

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename == "fake-power-point.pptx" for e in elements)


def test_partition_pptx_from_filename_with_metadata_filename():
    """A `metadata_filename` argument overrides the filename recorded in element metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    elements = partition_pptx(pptx_path, metadata_filename="test")

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)


def test_partition_pptx_with_spooled_file():
    """The `partition_pptx()` function can handle a `SpooledTemporaryFile`.

    Including one that does not have its read-pointer set to the start.
    """
    # -- context manager guarantees the spooled file is closed even when an assertion fails --
    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
        with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
            spooled_temp_file.write(test_file.read())
        # -- read-pointer is deliberately left at end-of-file, no `.seek(0)` here --
        elements = partition_pptx(file=spooled_temp_file)

    assert elements == EXPECTED_PPTX_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_partition_pptx_from_file():
    """Partitioning an open binary file yields the expected elements with no filename metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    with open(pptx_path, "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream)

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename is None for e in elements)


def test_partition_pptx_from_file_with_metadata_filename():
    """A `metadata_filename` argument supplies the filename when partitioning a file object."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    with open(pptx_path, "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream, metadata_filename="test")

    assert elements == EXPECTED_PPTX_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)


def test_partition_pptx_raises_with_neither():
    """Calling `partition_pptx()` with neither a filename nor a file raises `ValueError`."""
    expected_exception = ValueError

    with pytest.raises(expected_exception):
        partition_pptx()


def test_partition_pptx_recurses_into_group_shapes():
    """Text from the nested group shapes in the example document appears in the output."""
    expected_texts = list("ABCDEFGHIJ")

    elements = partition_pptx(example_doc_path("group-shapes-nested.pptx"))

    actual_texts = [element.text for element in elements]
    assert actual_texts == expected_texts


# == page-break behaviors ========================================================================


def test_partition_pptx_adds_page_breaks(tmp_path: pathlib.Path):
    """By default a `PageBreak` element is emitted between the elements of two slides."""
    pptx_path = str(tmp_path / "test-page-breaks.pptx")

    # -- build a two-slide deck, one text box per slide --
    prs = pptx.Presentation()
    layout = prs.slide_layouts[6]
    for slide_text in ("This is the first slide.", "This is the second slide."):
        box_dim = Inches(2)
        new_slide = prs.slides.add_slide(layout)
        text_box = new_slide.shapes.add_textbox(box_dim, box_dim, box_dim, box_dim)
        text_box.text_frame.text = slide_text
    prs.save(pptx_path)

    elements = partition_pptx(filename=pptx_path)

    expected_elements = [
        NarrativeText(text="This is the first slide."),
        PageBreak(text=""),
        NarrativeText(text="This is the second slide."),
    ]
    assert elements == expected_elements
    assert all(e.metadata.filename == "test-page-breaks.pptx" for e in elements)


def test_partition_pptx_page_breaks_toggle_off(tmp_path: pathlib.Path):
    """No `PageBreak` element is emitted when `include_page_breaks` is False."""
    pptx_path = str(tmp_path / "test-page-breaks.pptx")

    # -- build a two-slide deck, one text box per slide --
    prs = pptx.Presentation()
    layout = prs.slide_layouts[6]
    for slide_text in ("This is the first slide.", "This is the second slide."):
        box_dim = Inches(2)
        new_slide = prs.slides.add_slide(layout)
        text_box = new_slide.shapes.add_textbox(box_dim, box_dim, box_dim, box_dim)
        text_box.text_frame.text = slide_text
    prs.save(pptx_path)

    elements = partition_pptx(filename=pptx_path, include_page_breaks=False)

    expected_elements = [
        NarrativeText(text="This is the first slide."),
        NarrativeText(text="This is the second slide."),
    ]
    assert elements == expected_elements
    assert all(e.metadata.filename == "test-page-breaks.pptx" for e in elements)


def test_partition_pptx_many_pages():
    """Elements of the multi-slide example document carry page numbers 1 and 2."""
    elements = partition_pptx(example_doc_path("fake-power-point-many-pages.pptx"))

    # -- the page_number of a PageBreak is None, so falsy page numbers are skipped --
    page_numbers = {e.metadata.page_number for e in elements if e.metadata.page_number}
    assert page_numbers == {1, 2}
    assert all(e.metadata.filename == "fake-power-point-many-pages.pptx" for e in elements)


# == miscellaneous behaviors =====================================================================


def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
    """Text boxes are emitted in position order, not the order they were added to the slide.

    The empty text box and the one positioned off the page produce no element.
    """
    pptx_path = str(tmp_path / "test-ordering.pptx")
    prs = pptx.Presentation()
    slide = prs.slides.add_slide(prs.slide_layouts[6])

    # -- (left, top, width, height) in inches and the box text, in insertion order --
    text_box_specs = [
        ((2, 2, 2, 2), "This is lower and should come second"),
        ((-10, -10, 1, 1), "This is off the page and shouldn't appear"),
        ((2, 2, 2, 2), ""),
        ((1, 1, 1, 1), "This is higher and should come first"),
        ((0.5, 1, 1, 1), "-------------TOP-------------"),
    ]
    for (left, top, width, height), box_text in text_box_specs:
        text_box = slide.shapes.add_textbox(
            Inches(left), Inches(top), Inches(width), Inches(height)
        )
        text_box.text_frame.text = box_text

    prs.save(pptx_path)

    elements = partition_pptx(filename=pptx_path)

    expected_elements = [
        Text("-------------TOP-------------"),
        NarrativeText("This is higher and should come first"),
        NarrativeText("This is lower and should come second"),
    ]
    assert elements == expected_elements
    assert all(e.metadata.filename == "test-ordering.pptx" for e in elements)


def test_partition_pptx_grabs_tables():
    """The second element of the table example document carries the table text and HTML."""
    expected_html = (
        "<table>\n"
        "<thead>\n"
        "<tr><th>Column 1  </th><th>Column 2  </th><th>Column 3  </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>Red       </td><td>Green     </td><td>Blue      </td></tr>\n"
        "<tr><td>Purple    </td><td>Orange    </td><td>Yellow    </td></tr>\n"
        "<tr><td>Tangerine </td><td>Pink      </td><td>Aqua      </td></tr>\n"
        "</tbody>\n"
        "</table>"
    )

    elements = partition_pptx(example_doc_path("fake-power-point-table.pptx"))

    table = elements[1]
    assert table.text.startswith("Column 1")
    assert table.text.strip().endswith("Aqua")
    assert table.metadata.text_as_html == expected_html
    assert table.metadata.filename == "fake-power-point-table.pptx"


@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_pptx_infer_table_structure(infer_table_structure: bool):
    """`text_as_html` is populated on the table element only when `infer_table_structure`."""
    pptx_path = example_doc_path("fake-power-point-table.pptx")

    elements = partition_pptx(pptx_path, infer_table_structure=infer_table_structure)

    table_metadata = elements[1].metadata
    # -- a missing attribute and a None value both count as "no HTML" --
    has_text_as_html = getattr(table_metadata, "text_as_html", None) is not None
    assert has_text_as_html == infer_table_structure


def test_partition_pptx_malformed():
    """The malformed example document still partitions into its two leading text elements."""
    elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))

    first, second = elements[0], elements[1]
    assert first.text == "Problem Date Placeholder"
    assert second.text == "Test Slide"
    assert all(e.metadata.filename == "fake-power-point-malformed.pptx" for e in elements)


# == image sub-partitioning behaviors ============================================================


def test_partition_pptx_generates_no_Image_elements_by_default():
    """The picture example document yields no elements under default settings."""
    elements = partition_pptx(example_doc_path("picture.pptx"))

    assert elements == []


def test_partition_pptx_uses_registered_picture_partitioner():
    """A registered picture partitioner is used to produce elements for pictures."""

    class FakePicturePartitioner:
        """Stand-in partitioner emitting one `Image` that describes the picture it was given."""

        @classmethod
        def iter_elements(cls, picture: Picture, opts: PptxPartitionerOptions) -> Iterator[Element]:
            digest = hashlib.sha1(picture.image.blob).hexdigest()
            yield Image(f"Image with hash {digest}, strategy: {opts.strategy}")

    # NOTE(review): this registration is not undone here; presumably it outlives the test,
    # confirm it cannot leak into tests that expect no picture partitioner.
    register_picture_partitioner(FakePicturePartitioner)

    elements = partition_pptx(example_doc_path("picture.pptx"))

    assert len(elements) == 1
    (image,) = elements
    assert type(image) is Image
    assert image.text == "Image with hash b0a1e6cf904691e6fa42bd9e72acc2b05280dc86, strategy: fast"


# == metadata behaviors ==========================================================================


def test_partition_pptx_metadata_date(mocker: MockFixture):
    """`last_modified` comes from `get_last_modified_date()` when partitioning by path."""
    filesystem_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date", return_value=filesystem_date
    )

    elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))

    first_element = elements[0]
    assert first_element.metadata.last_modified == "2029-07-05T09:24:28"


def test_partition_pptx_with_custom_metadata_date(mocker: MockFixture):
    """A caller-supplied `metadata_last_modified` wins over the date reported for the path."""
    filesystem_date = "2022-11-22T11:22:33"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date", return_value=filesystem_date
    )
    pptx_path = example_doc_path("fake-power-point-malformed.pptx")

    elements = partition_pptx(pptx_path, metadata_last_modified="2024-04-03T20:16:03")

    first_element = elements[0]
    assert first_element.metadata.last_modified == "2024-04-03T20:16:03"


def test_partition_pptx_from_file_metadata_date(mocker: MockFixture):
    """`last_modified` is None for a file object when `date_from_file_object` is not given."""
    file_object_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date_from_file",
        return_value=file_object_date,
    )
    pptx_path = example_doc_path("fake-power-point-malformed.pptx")

    with open(pptx_path, "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream)

    first_element = elements[0]
    assert first_element.metadata.last_modified is None


def test_partition_pptx_from_file_explicit_get_metadata_date(mocker: MockFixture):
    """`last_modified` is taken from the file object when `date_from_file_object` is True."""
    file_object_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date_from_file",
        return_value=file_object_date,
    )
    pptx_path = example_doc_path("fake-power-point-malformed.pptx")

    with open(pptx_path, "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream, date_from_file_object=True)

    first_element = elements[0]
    assert first_element.metadata.last_modified == "2029-07-05T09:24:28"


def test_partition_pptx_from_file_with_custom_metadata_date(mocker: MockFixture):
    """A caller-supplied `metadata_last_modified` is used when partitioning a file object."""
    file_object_date = "2022-11-22T11:22:33"
    mocker.patch(
        "unstructured.partition.pptx.get_last_modified_date_from_file",
        return_value=file_object_date,
    )
    pptx_path = example_doc_path("fake-power-point-malformed.pptx")

    with open(pptx_path, "rb") as pptx_stream:
        elements = partition_pptx(file=pptx_stream, metadata_last_modified="2024-04-03T20:16:03")

    first_element = elements[0]
    assert first_element.metadata.last_modified == "2024-04-03T20:16:03"


def test_partition_pptx_from_file_without_metadata_date():
    """Test partition_pptx() with a file from which no last-modified date can be obtained."""
    # -- context manager guarantees the spooled file is closed even when partitioning raises --
    with tempfile.SpooledTemporaryFile() as sf:
        with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
            sf.write(f.read())
        sf.seek(0)
        elements = partition_pptx(file=sf, date_from_file_object=True)

    assert elements[0].metadata.last_modified is None


def test_partition_pptx_element_metadata_has_languages():
    """The first element of the example document is tagged with English as its language."""
    elements = partition_pptx(example_doc_path("fake-power-point.pptx"))

    first_element = elements[0]
    assert first_element.metadata.languages == ["eng"]


def test_partition_pptx_respects_detect_language_per_element():
    """Both English and Spanish are detected when languages are detected per element."""
    elements = partition_pptx(
        example_doc_path("language-docs/eng_spa_mult.pptx"), detect_language_per_element=True
    )

    # languages other than English and Spanish are detected by this partitioner,
    # so this test is slightly different from the other partition tests
    # -- only the first language of each element that has any is considered --
    langs = {element.metadata.languages[0] for element in elements if element.metadata.languages}
    assert "eng" in langs
    assert "spa" in langs


def test_partition_pptx_raises_TypeError_for_invalid_languages():
    """Passing `languages` as a bare string rather than a list raises `TypeError`."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    with pytest.raises(TypeError):
        partition_pptx(pptx_path, languages="eng")  # type: ignore


# == downstream behaviors ========================================================================


def test_partition_pptx_with_json():
    """Elements produced by `partition_pptx()` round-trip through JSON."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    assert_round_trips_through_JSON(partition_pptx(pptx_path))


def test_add_chunking_strategy_by_title_on_partition_pptx():
    """`chunking_strategy="by_title"` gives the same result as chunking the elements afterward."""
    pptx_path = example_doc_path("science-exploration-1p.pptx")

    unchunked = partition_pptx(filename=pptx_path)
    chunked_by_partitioner = partition_pptx(pptx_path, chunking_strategy="by_title")
    chunked_separately = chunk_by_title(unchunked)

    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_separately


def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path):
    """This tests if the title attribute of a shape is correctly categorized as a title"""
    pptx_path = str(tmp_path / "test-title-shape.pptx")
    title_text = "This is a title, it's a bit long so we can make sure it's not narrative text"

    # -- build a one-slide deck whose title shape holds two paragraphs --
    deck = pptx.Presentation()
    title_slide = deck.slides.add_slide(deck.slide_layouts[0])
    title_shape = title_slide.shapes.title
    assert title_shape is not None
    title_shape.text = title_text
    subtitle_paragraph = title_shape.text_frame.add_paragraph()
    subtitle_paragraph.text = "this is a subtitle"
    deck.save(pptx_path)

    # -- partition the deck and take the first two elements --
    elements = partition_pptx(pptx_path)
    title, subtitle = elements[0], elements[1]

    # -- the first paragraph is a title with the expected text, at depth 0 --
    assert isinstance(title, Title)
    assert title.text == title_text
    assert title.metadata.category_depth == 0

    # -- the second paragraph (the subtitle) is also a title, at depth 1 --
    assert isinstance(subtitle, Title)
    assert subtitle.text == "this is a subtitle"
    assert subtitle.metadata.category_depth == 1


def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
    """This tests if the level attribute of a paragraph is correctly set as the category depth"""
    filename = str(tmp_path / "test-category-depth.pptx")

    prs = pptx.Presentation()
    # -- this layout is not blank; the title and body placeholders it provides are used below --
    title_and_body_layout = prs.slide_layouts[1]

    slide = prs.slides.add_slide(title_and_body_layout)
    shapes = slide.shapes
    title_shape = shapes.title
    assert title_shape is not None
    title_shape.text = (
        "This is a title, it's a bit long so we can make sure it's not narrative text"
    )
    body_shape = shapes.placeholders[1]

    tf = body_shape.text_frame
    tf.text = "this is the root level bullet"

    p = tf.add_paragraph()
    p.text = "this is the level 1 bullet"
    p.level = 1

    p = tf.add_paragraph()
    p.text = "this is the level 2 bullet"
    p.level = 2

    prs.save(filename)

    # partition the PowerPoint presentation
    elements = partition_pptx(filename)

    # NOTE(newelh) - python_pptx does not create full bullet xml, so unstructured will
    #                not detect the paragraphs as bullets. This is fine for now, as
    #                the level attribute is still set correctly, and what we're testing here
    # -- each case is (expected category depth, expected element type, expected text) --
    test_cases = [
        (0, Title, "This is a title, it's a bit long so we can make sure it's not narrative text"),
        (0, NarrativeText, "this is the root level bullet"),
        (1, NarrativeText, "this is the level 1 bullet"),
        (2, NarrativeText, "this is the level 2 bullet"),
    ]

    # -- `zip()` stops at the shorter input; without this guard a short (or empty) element
    # -- list would let the loop below pass without checking every case
    assert len(elements) >= len(
        test_cases
    ), f"expected at least {len(test_cases)} elements, got {len(elements)}"

    for element, (expected_depth, expected_type, expected_text) in zip(elements, test_cases):
        assert element.text == expected_text, f"expected {expected_text}, got {element.text}"
        assert isinstance(
            element,
            expected_type,
        ), f"expected {expected_type}, got {type(element).__name__} for {element.text}"
        assert (
            element.metadata.category_depth == expected_depth
        ), f"expected {expected_depth}, got {element.metadata.category_depth} for {element.text}"


def test_partition_pptx_hierarchy_sample_document():
    """This tests if the hierarchy of the sample document is correctly detected"""
    elements = partition_pptx(example_doc_path("sample-presentation.pptx"))

    # -- each case is (expected category depth, expected parent id, expected element id) --
    test_cases = [
        (0, None, "b2859226ba1f9243fb3f1b2ace889f43"),
        (1, "b2859226ba1f9243fb3f1b2ace889f43", "d13f8827e94541c8b818b0df8f942526"),
        (None, None, "1ffd3151819e594553e6b540e19e6c36"),
        (0, None, "e535f799d1f0e79d6777efa873a16ce1"),
        (0, "e535f799d1f0e79d6777efa873a16ce1", "f02bbfb417ad60daa2ba35080e96262f"),
        (0, "e535f799d1f0e79d6777efa873a16ce1", "414dfce72ea53cd4649176af0d62a4c1"),
        (1, "414dfce72ea53cd4649176af0d62a4c1", "3d45a95c79473a07db4edca5534a7c49"),
        (1, "414dfce72ea53cd4649176af0d62a4c1", "a33333f527851f700ca175acd04b8a2c"),
        (2, "a33333f527851f700ca175acd04b8a2c", "6f1b87689e4da2b0fb865bc5f92d5702"),
        (0, "e535f799d1f0e79d6777efa873a16ce1", "3f58e0be3b8e8b15cba7adc4eae68586"),
        (None, None, "1ffd3151819e594553e6b540e19e6c36"),
        (0, None, "8319096532fe2e55f66c491ea8313150"),
        (0, "8319096532fe2e55f66c491ea8313150", "17a7e78277ab131a627cb4538bab7390"),
        (0, "8319096532fe2e55f66c491ea8313150", "41a9e1d0390f4edd77181142ceae51bc"),
        (1, "41a9e1d0390f4edd77181142ceae51bc", "cbbc78ef38a035fd66f7b030dcf12f66"),
        (1, "41a9e1d0390f4edd77181142ceae51bc", "2a551e3cbe67561debe0da262a294f24"),
        (2, "2a551e3cbe67561debe0da262a294f24", "7a121a056eedb11ac8804d6fd17afc0c"),
        (0, "8319096532fe2e55f66c491ea8313150", "a24a3caf9853702cb73daae23020b7b4"),
        (0, "8319096532fe2e55f66c491ea8313150", "18367f334b5c8c4602ea413ab68ac35b"),
        (0, "8319096532fe2e55f66c491ea8313150", "7f647b1f0f20c3db40c36ab57d9a5550"),
        (1, "7f647b1f0f20c3db40c36ab57d9a5550", "591c24b41b53aba873188a0881d10961"),
        (1, "7f647b1f0f20c3db40c36ab57d9a5550", "6ec455f5f19782facf184886876c9a66"),
        (2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
        (0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
        (None, None, "1ffd3151819e594553e6b540e19e6c36"),
        (None, None, "2ed3bd10daace79ac129cbf8faf22bfc"),
        (0, None, "fd08cacbaddafee5cbacc02528536ee5"),
    ]

    # -- `zip()` stops at the shorter input; without this guard a short (or empty) element
    # -- list would let the loop below pass without checking every case
    assert len(elements) >= len(
        test_cases
    ), f"expected at least {len(test_cases)} elements, got {len(elements)}"

    # Zip the test cases with the elements
    for element, test_case in zip(elements, test_cases):
        expected_depth, expected_parent_id, expected_id = test_case
        assert element.metadata.category_depth == expected_depth
        assert element.metadata.parent_id == expected_parent_id
        assert element.id == expected_id


# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
# These test components used by `partition_pptx()` in isolation such that all edge cases can be
# exercised.
# ================================================================================================


class DescribePptxPartitionerOptions:
    """Unit-test suite for `unstructured.partition.pptx.PptxPartitionerOptions` objects."""

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.include_page_breaks is arg_value

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_partition_content_found_in_slide_notes(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_slide_notes"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.include_slide_notes is arg_value

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_include_text_as_html_in_Table_metadata(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["infer_table_structure"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.infer_table_structure is arg_value

    # -- .increment_page_number() ----------------

    def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        page_break_iter = opts.increment_page_number()

        # -- exactly one PageBreak is generated, then the iterator is exhausted --
        assert isinstance(next(page_break_iter, None), PageBreak)
        assert opts.page_number == 2
        with pytest.raises(StopIteration):
            next(page_break_iter)

    def but_it_does_not_generate_a_PageBreak_element_for_the_first_slide(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)

        page_break_iter = opts.increment_page_number()

        with pytest.raises(StopIteration):
            next(page_break_iter)
        assert opts.page_number == 1

    def and_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
        self, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = False
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        page_break_iter = opts.increment_page_number()

        # -- the page number still advances even though no PageBreak is generated --
        with pytest.raises(StopIteration):
            next(page_break_iter)
        assert opts.page_number == 2

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.last_modified == "2024-03-05T17:02:53"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        opts_args["file_path"] = "a/b/spreadsheet.pptx"
        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.pptx")
        assert last_modified == "2024-04-02T20:32:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = True
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_called_once_with(file)
        assert last_modified == "2024-04-02T20:42:07"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = False
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_not_called()
        assert last_modified is None

    # -- .metadata_file_path ---------------------

    def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "x/y/z.pptx"
        opts_args["metadata_file_path"] = "a/b/c.pptx"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == "a/b/c.pptx"

    @pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
    def and_it_falls_back_to_the_document_file_path_otherwise(
        self, file_path: str | None, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = file_path
        opts_args["metadata_file_path"] = None
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == file_path

    # -- .page_number ----------------------------

    def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
        """In PPTX, page-number is the slide number."""
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.page_number == 0
        list(opts.increment_page_number())
        assert opts.page_number == 1
        list(opts.increment_page_number())
        assert opts.page_number == 2

    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args, starting_page_number=3)
        # -- move to the "first" slide --
        list(opts.increment_page_number())

        table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
        text_metadata = opts.text_metadata()

        assert isinstance(table_metadata, ElementMetadata)
        assert isinstance(text_metadata, ElementMetadata)
        assert text_metadata.page_number == 3
        assert table_metadata.page_number == 3

    # -- .pptx_file ------------------------------

    def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "l/m/n.pptx"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.pptx_file == "l/m/n.pptx"

    def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
        self, opts_args: dict[str, Any]
    ):
        # -- context manager guarantees the spooled file is closed even when an assertion fails --
        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(b"abcdefg")
            opts_args["file"] = spooled_temp_file
            opts = PptxPartitionerOptions(**opts_args)

            pptx_file = opts.pptx_file

            assert pptx_file is not spooled_temp_file
            assert isinstance(pptx_file, io.BytesIO)
            assert pptx_file.getvalue() == b"abcdefg"

    def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
        self, opts_args: dict[str, Any]
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts = PptxPartitionerOptions(**opts_args)

        pptx_file = opts.pptx_file

        assert pptx_file is file
        assert isinstance(pptx_file, io.BytesIO)
        assert pptx_file.getvalue() == b"abcdefg"

    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)

        # -- the bare attribute access is what triggers the error --
        with pytest.raises(ValueError, match="No PPTX document specified, either `filename` or "):
            opts.pptx_file

    # -- .strategy -------------------------------

    @pytest.mark.parametrize("arg_value", ["fast", "hi_res"])
    def it_knows_which_partitioning_strategy_to_use(
        self, arg_value: str, opts_args: dict[str, Any]
    ):
        opts_args["strategy"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.strategy == arg_value

    # -- .table_metadata -------------------------

    def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
        opts_args["metadata_file_path"] = "d/e/f.pptx"
        opts_args["metadata_last_modified"] = "2024-04-02T19:51:55"
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        metadata = opts.table_metadata(text_as_html="<table><tr/></table>")

        assert isinstance(metadata, ElementMetadata)
        assert metadata.filename == "f.pptx"
        assert metadata.last_modified == "2024-04-02T19:51:55"
        assert metadata.page_number == 1
        assert metadata.text_as_html == "<table><tr/></table>"

    # -- .text_metadata -------------------------

    def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
        opts_args["metadata_file_path"] = "d/e/f.pptx"
        opts_args["metadata_last_modified"] = "2024-04-02T19:56:40"
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        metadata = opts.text_metadata(category_depth=2)

        assert isinstance(metadata, ElementMetadata)
        assert metadata.filename == "f.pptx"
        assert metadata.last_modified == "2024-04-02T19:56:40"
        assert metadata.page_number == 1
        assert metadata.category_depth == 2

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest):
        """Mock replacing `get_last_modified_date` in the `unstructured.partition.pptx` module."""
        return function_mock(request, "unstructured.partition.pptx.get_last_modified_date")

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest):
        """Mock replacing `get_last_modified_date_from_file` in `unstructured.partition.pptx`."""
        return function_mock(
            request, "unstructured.partition.pptx.get_last_modified_date_from_file"
        )

    @pytest.fixture()
    def opts_args(self) -> dict[str, Any]:
        """All default arguments for `PptxPartitionerOptions`.

        Individual argument values can be changed to suit each test. Makes construction of opts more
        compact for testing purposes.
        """
        return {
            "date_from_file_object": False,
            "file": None,
            "file_path": None,
            "include_page_breaks": True,
            "include_slide_notes": False,
            "infer_table_structure": True,
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "strategy": "fast",
        }