diff --git a/CHANGELOG.md b/CHANGELOG.md index d46ea896b..ea2f32fb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ * **Add missing starting_page_num param to partition_image** * **Make the filename and file params for partition_image and partition_pdf match the other partitioners** +* **Fix include_slide_notes and include_page_breaks params in partition_ppt** * **Re-apply: skip accuracy calculation feature** Overwritten by mistake * **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a paramter in partition_pdf. diff --git a/example-docs/language-docs/eng_spa_mult.ppt b/example-docs/language-docs/eng_spa_mult.ppt index 43ebc3657..d19bfc3bf 100644 Binary files a/example-docs/language-docs/eng_spa_mult.ppt and b/example-docs/language-docs/eng_spa_mult.ppt differ diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py index a29909d48..61c172ab4 100644 --- a/test_unstructured/partition/pptx/test_ppt.py +++ b/test_unstructured/partition/pptx/test_ppt.py @@ -7,7 +7,7 @@ from pytest_mock import MockFixture from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from unstructured.chunking.title import chunk_by_title -from unstructured.documents.elements import ListItem, NarrativeText, Title +from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Title from unstructured.partition.ppt import partition_ppt from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA @@ -181,9 +181,17 @@ def test_add_chunking_strategy_by_title_on_partition_ppt(): assert chunk_elements == chunks -def test_partition_ppt_element_metadata_has_languages(): - elements = partition_ppt(example_doc_path("fake-power-point.ppt")) +def test_partition_ppt_params(): + """Integration test of params: languages, include_page_breaks, and include_slide_notes.""" + elements = partition_ppt( + 
example_doc_path("language-docs/eng_spa_mult.ppt"), + include_page_breaks=True, + include_slide_notes=True, + ) assert elements[0].metadata.languages == ["eng"] + assert any(isinstance(element, PageBreak) for element in elements) + # The example doc contains a slide note with the text "This is a slide note." + assert any(element.text == "This is a slide note." for element in elements) def test_partition_ppt_respects_detect_language_per_element(): diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index e9ada569c..c2428a7f9 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -24,6 +24,7 @@ def partition_ppt( file: Optional[IO[bytes]] = None, include_page_breaks: bool = False, include_metadata: bool = True, + include_slide_notes: Optional[bool] = None, infer_table_structure: bool = True, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, @@ -44,6 +45,8 @@ def partition_ppt( A file-like object using "rb" mode --> open(filename, "rb"). include_page_breaks If True, includes a PageBreak element between slides + include_slide_notes + If True, includes the slide notes as elements infer_table_structure If True, any Table elements that are extracted will also have a metadata field named "text_as_html" where the table's text content is rendered into an html string. 
@@ -102,11 +105,13 @@ def partition_ppt( pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx") elements = partition_pptx( filename=pptx_filename, + detect_language_per_element=detect_language_per_element, + include_page_breaks=include_page_breaks, + include_slide_notes=include_slide_notes, infer_table_structure=infer_table_structure, + languages=languages, metadata_filename=metadata_filename, metadata_last_modified=metadata_last_modified or last_modification_date, - languages=languages, - detect_language_per_element=detect_language_per_element, starting_page_number=starting_page_number, ) diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 58020ea07..7cc1924fd 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -92,7 +92,7 @@ def partition_pptx( date_from_file_object: bool = False, detect_language_per_element: bool = False, include_page_breaks: bool = True, - include_slide_notes: bool = False, + include_slide_notes: Optional[bool] = None, infer_table_structure: bool = True, languages: Optional[list[str]] = ["auto"], metadata_filename: Optional[str] = None, @@ -376,7 +376,7 @@ class PptxPartitionerOptions: file: Optional[IO[bytes]], file_path: Optional[str], include_page_breaks: bool, - include_slide_notes: bool, + include_slide_notes: Optional[bool], infer_table_structure: bool, metadata_file_path: Optional[str], metadata_last_modified: Optional[str], @@ -413,7 +413,7 @@ class PptxPartitionerOptions: @lazyproperty def include_slide_notes(self) -> bool: """When True, also partition any text found in slide notes as part of each slide.""" - return self._include_slide_notes + return False if self._include_slide_notes is None else self._include_slide_notes def increment_page_number(self) -> Iterator[PageBreak]: """Increment page-number by 1 and generate a PageBreak element if enabled."""