# pyright: reportPrivateUsage=false """Test suite for `unstructured.partition.pptx` module.""" from __future__ import annotations import hashlib import io import pathlib import tempfile from typing import Any, Iterator import pptx import pytest from pptx.shapes.picture import Picture from pptx.util import Inches from pytest_mock import MockFixture from test_unstructured.unit_utils import ( FixtureRequest, Mock, assert_round_trips_through_JSON, example_doc_path, function_mock, ) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( Element, ElementMetadata, Image, ListItem, NarrativeText, PageBreak, Text, Title, ) from unstructured.partition.pptx import ( PptxPartitionerOptions, partition_pptx, register_picture_partitioner, ) EXPECTED_PPTX_OUTPUT = [ Title(text="Adding a Bullet Slide"), ListItem(text="Find the bullet slide layout"), ListItem(text="Use _TextFrame.text for first bullet"), ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"), NarrativeText(text="Here is a lot of text!"), NarrativeText(text="Here is some text in a text box!"), ] # == document file behaviors ===================================================================== def test_partition_pptx_from_filename(): elements = partition_pptx(example_doc_path("fake-power-point.pptx")) assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename == "fake-power-point.pptx" def test_partition_pptx_from_filename_with_metadata_filename(): elements = partition_pptx(example_doc_path("fake-power-point.pptx"), metadata_filename="test") assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename == "test" def test_partition_pptx_with_spooled_file(): """The `partition_pptx() function can handle a `SpooledTemporaryFile. Including one that does not have its read-pointer set to the start. 
""" with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: spooled_temp_file = tempfile.SpooledTemporaryFile() spooled_temp_file.write(test_file.read()) elements = partition_pptx(file=spooled_temp_file) assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename is None def test_partition_pptx_from_file(): with open(example_doc_path("fake-power-point.pptx"), "rb") as f: elements = partition_pptx(file=f) assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename is None def test_partition_pptx_from_file_with_metadata_filename(): with open(example_doc_path("fake-power-point.pptx"), "rb") as f: elements = partition_pptx(file=f, metadata_filename="test") assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename == "test" def test_partition_pptx_raises_with_neither(): with pytest.raises(ValueError): partition_pptx() def test_partition_pptx_recurses_into_group_shapes(): elements = partition_pptx(example_doc_path("group-shapes-nested.pptx")) assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] # == page-break behaviors ======================================================================== def test_partition_pptx_adds_page_breaks(tmp_path: pathlib.Path): filename = str(tmp_path / "test-page-breaks.pptx") presentation = pptx.Presentation() blank_slide_layout = presentation.slide_layouts[6] slide = presentation.slides.add_slide(blank_slide_layout) left = top = width = height = Inches(2) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is the first slide." slide = presentation.slides.add_slide(blank_slide_layout) left = top = width = height = Inches(2) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is the second slide." 
presentation.save(filename) elements = partition_pptx(filename=filename) assert elements == [ NarrativeText(text="This is the first slide."), PageBreak(text=""), NarrativeText(text="This is the second slide."), ] for element in elements: assert element.metadata.filename == "test-page-breaks.pptx" def test_partition_pptx_page_breaks_toggle_off(tmp_path: pathlib.Path): filename = str(tmp_path / "test-page-breaks.pptx") presentation = pptx.Presentation() blank_slide_layout = presentation.slide_layouts[6] slide = presentation.slides.add_slide(blank_slide_layout) left = top = width = height = Inches(2) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is the first slide." slide = presentation.slides.add_slide(blank_slide_layout) left = top = width = height = Inches(2) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is the second slide." presentation.save(filename) elements = partition_pptx(filename=filename, include_page_breaks=False) assert elements == [ NarrativeText(text="This is the first slide."), NarrativeText(text="This is the second slide."), ] for element in elements: assert element.metadata.filename == "test-page-breaks.pptx" def test_partition_pptx_many_pages(): elements = partition_pptx(example_doc_path("fake-power-point-many-pages.pptx")) # The page_number of PageBreak is None assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2} for element in elements: assert element.metadata.filename == "fake-power-point-many-pages.pptx" # == miscellaneous behaviors ===================================================================== def test_partition_pptx_orders_elements(tmp_path: pathlib.Path): filename = str(tmp_path / "test-ordering.pptx") presentation = pptx.Presentation() blank_slide_layout = presentation.slide_layouts[6] slide = presentation.slides.add_slide(blank_slide_layout) left = top = width = height = Inches(2) txBox = 
slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is lower and should come second" left = top = width = height = Inches(1) left = top = Inches(-10) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is off the page and shouldn't appear" left = top = width = height = Inches(2) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "" left = top = width = height = Inches(1) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "This is higher and should come first" top = width = height = Inches(1) left = Inches(0.5) txBox = slide.shapes.add_textbox(left, top, width, height) tf = txBox.text_frame tf.text = "-------------TOP-------------" presentation.save(filename) elements = partition_pptx(filename=filename) assert elements == [ Text("-------------TOP-------------"), NarrativeText("This is higher and should come first"), NarrativeText("This is lower and should come second"), ] for element in elements: assert element.metadata.filename == "test-ordering.pptx" def test_partition_pptx_grabs_tables(): elements = partition_pptx(example_doc_path("fake-power-point-table.pptx")) assert elements[1].text.startswith("Column 1") assert elements[1].text.strip().endswith("Aqua") assert elements[1].metadata.text_as_html == ( "\n" "\n" "\n" "\n" "\n" "\n" "\n" "\n" "\n" "
Column 1 Column 2 Column 3
Red Green Blue
Purple Orange Yellow
Tangerine Pink Aqua
" ) assert elements[1].metadata.filename == "fake-power-point-table.pptx" @pytest.mark.parametrize("infer_table_structure", [True, False]) def test_partition_pptx_infer_table_structure(infer_table_structure: bool): elements = partition_pptx( example_doc_path("fake-power-point-table.pptx"), infer_table_structure=infer_table_structure ) table_element_has_text_as_html_field = ( hasattr(elements[1].metadata, "text_as_html") and elements[1].metadata.text_as_html is not None ) assert table_element_has_text_as_html_field == infer_table_structure def test_partition_pptx_malformed(): elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx")) assert elements[0].text == "Problem Date Placeholder" assert elements[1].text == "Test Slide" for element in elements: assert element.metadata.filename == "fake-power-point-malformed.pptx" # == image sub-partitioning behaviors ============================================================ def test_partition_pptx_generates_no_Image_elements_by_default(): assert partition_pptx(example_doc_path("picture.pptx")) == [] def test_partition_pptx_uses_registered_picture_partitioner(): class FakePicturePartitioner: @classmethod def iter_elements(cls, picture: Picture, opts: PptxPartitionerOptions) -> Iterator[Element]: image_hash = hashlib.sha1(picture.image.blob).hexdigest() yield Image(f"Image with hash {image_hash}, strategy: {opts.strategy}") register_picture_partitioner(FakePicturePartitioner) elements = partition_pptx(example_doc_path("picture.pptx")) assert len(elements) == 1 image = elements[0] assert type(image) is Image assert image.text == "Image with hash b0a1e6cf904691e6fa42bd9e72acc2b05280dc86, strategy: fast" # == metadata behaviors ========================================================================== def test_partition_pptx_metadata_date(mocker: MockFixture): mocker.patch( "unstructured.partition.pptx.get_last_modified_date", return_value="2029-07-05T09:24:28" ) elements = 
partition_pptx(example_doc_path("fake-power-point-malformed.pptx")) assert elements[0].metadata.last_modified == "2029-07-05T09:24:28" def test_partition_pptx_with_custom_metadata_date(mocker: MockFixture): mocker.patch( "unstructured.partition.pptx.get_last_modified_date", return_value="2022-11-22T11:22:33" ) elements = partition_pptx( example_doc_path("fake-power-point-malformed.pptx"), metadata_last_modified="2024-04-03T20:16:03", ) assert elements[0].metadata.last_modified == "2024-04-03T20:16:03" def test_partition_pptx_from_file_metadata_date(mocker: MockFixture): mocker.patch( "unstructured.partition.pptx.get_last_modified_date_from_file", return_value="2029-07-05T09:24:28", ) with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f: elements = partition_pptx(file=f) assert elements[0].metadata.last_modified is None def test_partition_pptx_from_file_explicit_get_metadata_date(mocker: MockFixture): mocker.patch( "unstructured.partition.pptx.get_last_modified_date_from_file", return_value="2029-07-05T09:24:28", ) with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f: elements = partition_pptx(file=f, date_from_file_object=True) assert elements[0].metadata.last_modified == "2029-07-05T09:24:28" def test_partition_pptx_from_file_with_custom_metadata_date(mocker: MockFixture): mocker.patch( "unstructured.partition.pptx.get_last_modified_date_from_file", return_value="2022-11-22T11:22:33", ) with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f: elements = partition_pptx(file=f, metadata_last_modified="2024-04-03T20:16:03") assert elements[0].metadata.last_modified == "2024-04-03T20:16:03" def test_partition_pptx_from_file_without_metadata_date(): """Test partition_pptx() with file that are not possible to get last modified date""" with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f: sf = tempfile.SpooledTemporaryFile() sf.write(f.read()) sf.seek(0) elements = partition_pptx(file=sf, 
date_from_file_object=True) assert elements[0].metadata.last_modified is None def test_partition_pptx_element_metadata_has_languages(): elements = partition_pptx(example_doc_path("fake-power-point.pptx")) assert elements[0].metadata.languages == ["eng"] def test_partition_pptx_respects_detect_language_per_element(): elements = partition_pptx( example_doc_path("language-docs/eng_spa_mult.pptx"), detect_language_per_element=True ) langs = [element.metadata.languages for element in elements] # languages other than English and Spanish are detected by this partitioner, # so this test is slightly different from the other partition tests langs = {element.metadata.languages[0] for element in elements if element.metadata.languages} assert "eng" in langs assert "spa" in langs def test_partition_pptx_raises_TypeError_for_invalid_languages(): with pytest.raises(TypeError): partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore # == downstream behaviors ======================================================================== def test_partition_pptx_with_json(): elements = partition_pptx(example_doc_path("fake-power-point.pptx")) assert_round_trips_through_JSON(elements) def test_add_chunking_strategy_by_title_on_partition_pptx(): filename = example_doc_path("science-exploration-1p.pptx") elements = partition_pptx(filename=filename) chunk_elements = partition_pptx(filename, chunking_strategy="by_title") chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path): """This tests if the title attribute of a shape is correctly categorized as a title""" filename = str(tmp_path / "test-title-shape.pptx") # create a fake PowerPoint presentation with a slide containing a title shape prs = pptx.Presentation() slide = prs.slides.add_slide(prs.slide_layouts[0]) title_shape = slide.shapes.title assert title_shape is not None title_shape.text = ( 
"This is a title, it's a bit long so we can make sure it's not narrative text" ) title_shape.text_frame.add_paragraph().text = "this is a subtitle" prs.save(filename) # partition the PowerPoint presentation and get the first element elements = partition_pptx(filename) title = elements[0] subtitle = elements[1] # assert that the first line is a title and has the correct text and depth assert isinstance(title, Title) assert ( title.text == "This is a title, it's a bit long so we can make sure it's not narrative text" ) assert title.metadata.category_depth == 0 # assert that the first line is the subtitle and has the correct text and depth assert isinstance(subtitle, Title) assert subtitle.text == "this is a subtitle" assert subtitle.metadata.category_depth == 1 def test_partition_pptx_level_detection(tmp_path: pathlib.Path): """This tests if the level attribute of a paragraph is correctly set as the category depth""" filename = str(tmp_path / "test-category-depth.pptx") prs = pptx.Presentation() blank_slide_layout = prs.slide_layouts[1] slide = prs.slides.add_slide(blank_slide_layout) shapes = slide.shapes title_shape = shapes.title assert title_shape is not None title_shape.text = ( "This is a title, it's a bit long so we can make sure it's not narrative text" ) body_shape = shapes.placeholders[1] tf = body_shape.text_frame tf.text = "this is the root level bullet" p = tf.add_paragraph() p.text = "this is the level 1 bullet" p.level = 1 p = tf.add_paragraph() p.text = "this is the level 2 bullet" p.level = 2 prs.slides[0].shapes prs.save(filename) # partition the PowerPoint presentation and get the first element elements = partition_pptx(filename) # NOTE(newelh) - python_pptx does not create full bullet xml, so unstructured will # not detect the paragraphs as bullets. 
This is fine for now, as # the level attribute is still set correctly, and what we're testing here test_cases = [ (0, Title, "This is a title, it's a bit long so we can make sure it's not narrative text"), (0, NarrativeText, "this is the root level bullet"), (1, NarrativeText, "this is the level 1 bullet"), (2, NarrativeText, "this is the level 2 bullet"), ] for element, test_case in zip(elements, test_cases): assert element.text == test_case[2], f"expected {test_case[2]}, got {element.text}" assert isinstance( element, test_case[1], ), f"expected {test_case[1]}, got {type(element).__name__} for {element.text}" assert ( element.metadata.category_depth == test_case[0] ), f"expected {test_case[0]}, got {element.metadata.category_depth} for {element.text}" def test_partition_pptx_hierarchy_sample_document(): """This tests if the hierarchy of the sample document is correctly detected""" elements = partition_pptx(example_doc_path("sample-presentation.pptx")) test_cases = [ (0, None, "b2859226ba1f9243fb3f1b2ace889f43"), (1, "b2859226ba1f9243fb3f1b2ace889f43", "d13f8827e94541c8b818b0df8f942526"), (None, None, "1ffd3151819e594553e6b540e19e6c36"), (0, None, "e535f799d1f0e79d6777efa873a16ce1"), (0, "e535f799d1f0e79d6777efa873a16ce1", "f02bbfb417ad60daa2ba35080e96262f"), (0, "e535f799d1f0e79d6777efa873a16ce1", "414dfce72ea53cd4649176af0d62a4c1"), (1, "414dfce72ea53cd4649176af0d62a4c1", "3d45a95c79473a07db4edca5534a7c49"), (1, "414dfce72ea53cd4649176af0d62a4c1", "a33333f527851f700ca175acd04b8a2c"), (2, "a33333f527851f700ca175acd04b8a2c", "6f1b87689e4da2b0fb865bc5f92d5702"), (0, "e535f799d1f0e79d6777efa873a16ce1", "3f58e0be3b8e8b15cba7adc4eae68586"), (None, None, "1ffd3151819e594553e6b540e19e6c36"), (0, None, "8319096532fe2e55f66c491ea8313150"), (0, "8319096532fe2e55f66c491ea8313150", "17a7e78277ab131a627cb4538bab7390"), (0, "8319096532fe2e55f66c491ea8313150", "41a9e1d0390f4edd77181142ceae51bc"), (1, "41a9e1d0390f4edd77181142ceae51bc", "cbbc78ef38a035fd66f7b030dcf12f66"), (1, 
"41a9e1d0390f4edd77181142ceae51bc", "2a551e3cbe67561debe0da262a294f24"), (2, "2a551e3cbe67561debe0da262a294f24", "7a121a056eedb11ac8804d6fd17afc0c"), (0, "8319096532fe2e55f66c491ea8313150", "a24a3caf9853702cb73daae23020b7b4"), (0, "8319096532fe2e55f66c491ea8313150", "18367f334b5c8c4602ea413ab68ac35b"), (0, "8319096532fe2e55f66c491ea8313150", "7f647b1f0f20c3db40c36ab57d9a5550"), (1, "7f647b1f0f20c3db40c36ab57d9a5550", "591c24b41b53aba873188a0881d10961"), (1, "7f647b1f0f20c3db40c36ab57d9a5550", "6ec455f5f19782facf184886876c9a66"), (2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"), (0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"), (None, None, "1ffd3151819e594553e6b540e19e6c36"), (None, None, "2ed3bd10daace79ac129cbf8faf22bfc"), (0, None, "fd08cacbaddafee5cbacc02528536ee5"), ] # Zip the test cases with the elements for element, test_case in zip(elements, test_cases): expected_depth, expected_parent_id, expected_id = test_case assert element.metadata.category_depth == expected_depth assert element.metadata.parent_id == expected_parent_id assert element.id == expected_id # ================================================================================================ # ISOLATED UNIT TESTS # ================================================================================================ # These test components used by `partition_pptx()` in isolation such that all edge cases can be # exercised. 
# ================================================================================================


class DescribePptxPartitionerOptions:
    """Unit-test suite for `unstructured.partition.pptx.PptxPartitionerOptions` objects."""

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.include_page_breaks is arg_value

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_partition_content_found_in_slide_notes(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_slide_notes"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.include_slide_notes is arg_value

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_include_text_as_html_in_Table_metadata(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["infer_table_structure"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.infer_table_structure is arg_value

    # -- .increment_page_number() ----------------

    def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        page_break_iter = opts.increment_page_number()

        assert isinstance(next(page_break_iter, None), PageBreak)
        assert opts.page_number == 2
        # -- exactly one PageBreak is generated per increment --
        with pytest.raises(StopIteration):
            next(page_break_iter)

    def but_it_does_not_generate_a_PageBreak_element_for_the_first_slide(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)

        page_break_iter = opts.increment_page_number()

        with pytest.raises(StopIteration):
            next(page_break_iter)
        assert opts.page_number == 1

    def and_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
        self, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = False
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        page_break_iter = opts.increment_page_number()

        with pytest.raises(StopIteration):
            next(page_break_iter)
        # -- the page number still advances even though no PageBreak is emitted --
        assert opts.page_number == 2

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.last_modified == "2024-03-05T17:02:53"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        opts_args["file_path"] = "a/b/spreadsheet.pptx"
        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.pptx")
        assert last_modified == "2024-04-02T20:32:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = True
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_called_once_with(file)
        assert last_modified == "2024-04-02T20:42:07"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = False
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = PptxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_not_called()
        assert last_modified is None

    # -- .metadata_file_path ---------------------

    def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "x/y/z.pptx"
        opts_args["metadata_file_path"] = "a/b/c.pptx"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == "a/b/c.pptx"

    @pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
    def and_it_falls_back_to_the_document_file_path_otherwise(
        self, file_path: str | None, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = file_path
        opts_args["metadata_file_path"] = None
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == file_path

    # -- .page_number ----------------------------

    def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
        """In PPTX, page-number is the slide number."""
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.page_number == 0
        list(opts.increment_page_number())
        assert opts.page_number == 1
        list(opts.increment_page_number())
        assert opts.page_number == 2

    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args, starting_page_number=3)
        # -- move to the "first" slide --
        list(opts.increment_page_number())

        # -- the HTML value is arbitrary here; only the page number is under test --
        table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
        text_metadata = opts.text_metadata()

        assert isinstance(table_metadata, ElementMetadata)
        assert isinstance(text_metadata, ElementMetadata)
        assert text_metadata.page_number == 3
        assert table_metadata.page_number == 3

    # -- .pptx_file ------------------------------

    def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "l/m/n.pptx"
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.pptx_file == "l/m/n.pptx"

    def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
        self, opts_args: dict[str, Any]
    ):
        # -- the context manager closes the spooled file even when an assertion fails --
        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(b"abcdefg")
            opts_args["file"] = spooled_temp_file
            opts = PptxPartitionerOptions(**opts_args)

            pptx_file = opts.pptx_file

            assert pptx_file is not spooled_temp_file
            assert isinstance(pptx_file, io.BytesIO)
            assert pptx_file.getvalue() == b"abcdefg"

    def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
        self, opts_args: dict[str, Any]
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts = PptxPartitionerOptions(**opts_args)

        pptx_file = opts.pptx_file

        assert pptx_file is file
        assert isinstance(pptx_file, io.BytesIO)
        assert pptx_file.getvalue() == b"abcdefg"

    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts = PptxPartitionerOptions(**opts_args)

        with pytest.raises(ValueError, match="No PPTX document specified, either `filename` or "):
            # -- bare attribute access is intentional; reading the property is what raises --
            opts.pptx_file

    # -- .strategy -------------------------------

    @pytest.mark.parametrize("arg_value", ["fast", "hi_res"])
    def it_knows_which_partitioning_strategy_to_use(
        self, arg_value: str, opts_args: dict[str, Any]
    ):
        opts_args["strategy"] = arg_value
        opts = PptxPartitionerOptions(**opts_args)

        assert opts.strategy == arg_value

    # -- .table_metadata -------------------------

    def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
        opts_args["metadata_file_path"] = "d/e/f.pptx"
        opts_args["metadata_last_modified"] = "2024-04-02T19:51:55"
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        metadata = opts.table_metadata(text_as_html="<table><tr/></table>")

        assert isinstance(metadata, ElementMetadata)
        assert metadata.filename == "f.pptx"
        assert metadata.last_modified == "2024-04-02T19:51:55"
        assert metadata.page_number == 1
        assert metadata.text_as_html == "<table><tr/></table>"

    # -- .text_metadata -------------------------

    def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
        opts_args["metadata_file_path"] = "d/e/f.pptx"
        opts_args["metadata_last_modified"] = "2024-04-02T19:56:40"
        opts = PptxPartitionerOptions(**opts_args)
        # -- move to the first slide --
        list(opts.increment_page_number())

        metadata = opts.text_metadata(category_depth=2)

        assert isinstance(metadata, ElementMetadata)
        assert metadata.filename == "f.pptx"
        assert metadata.last_modified == "2024-04-02T19:56:40"
        assert metadata.page_number == 1
        assert metadata.category_depth == 2

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest):
        return function_mock(request, "unstructured.partition.pptx.get_last_modified_date")

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest):
        return function_mock(
            request, "unstructured.partition.pptx.get_last_modified_date_from_file"
        )

    @pytest.fixture()
    def opts_args(self) -> dict[str, Any]:
        """All default arguments for `PptxPartitionerOptions`.

        Individual argument values can be changed to suit each test. Makes construction of opts more
        compact for testing purposes.
        """
        return {
            "date_from_file_object": False,
            "file": None,
            "file_path": None,
            "include_page_breaks": True,
            "include_slide_notes": False,
            "infer_table_structure": True,
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "strategy": "fast",
        }