fix: ppt parameters include_page_breaks and include_slide_notes (#2996)

Pass the parameters `include_slide_notes` and `include_page_breaks` to
`partition_pptx` from `partition_ppt`.

Also update the .ppt example doc we use for testing so it has slide
notes and a PageBreak (and a second page).
This commit is contained in:
John 2024-05-10 12:57:36 -05:00 committed by GitHub
parent 293a4a1152
commit 593aa47802
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 22 additions and 8 deletions

View File

@ -10,6 +10,7 @@
* **Add missing starting_page_num param to partition_image**
* **Make the filename and file params for partition_image and partition_pdf match the other partitioners**
* **Fix include_slide_notes and include_page_breaks params in partition_ppt**
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.

View File

@ -7,7 +7,7 @@ from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ListItem, NarrativeText, Title
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Title
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
@ -181,9 +181,17 @@ def test_add_chunking_strategy_by_title_on_partition_ppt():
assert chunk_elements == chunks
def test_partition_ppt_element_metadata_has_languages():
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
def test_partition_ppt_params():
"""Integration test of params: languages, include_page_break, and include_slide_notes."""
elements = partition_ppt(
example_doc_path("language-docs/eng_spa_mult.ppt"),
include_page_breaks=True,
include_slide_notes=True,
)
assert elements[0].metadata.languages == ["eng"]
assert any(isinstance(element, PageBreak) for element in elements)
# The example doc contains a slide note with the text "This is a slide note."
assert any(element.text == "This is a slide note." for element in elements)
def test_partition_ppt_respects_detect_language_per_element():

View File

@ -24,6 +24,7 @@ def partition_ppt(
file: Optional[IO[bytes]] = None,
include_page_breaks: bool = False,
include_metadata: bool = True,
include_slide_notes: Optional[bool] = None,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
@ -44,6 +45,8 @@ def partition_ppt(
A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks
If True, includes a PageBreak element between slides
include_slide_notes
If True, includes the slide notes as elements
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
@ -102,11 +105,13 @@ def partition_ppt(
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
elements = partition_pptx(
filename=pptx_filename,
detect_language_per_element=detect_language_per_element,
include_page_breaks=include_page_breaks,
include_slide_notes=include_slide_notes,
infer_table_structure=infer_table_structure,
languages=languages,
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
)

View File

@ -92,7 +92,7 @@ def partition_pptx(
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
include_page_breaks: bool = True,
include_slide_notes: bool = False,
include_slide_notes: Optional[bool] = None,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None,
@ -376,7 +376,7 @@ class PptxPartitionerOptions:
file: Optional[IO[bytes]],
file_path: Optional[str],
include_page_breaks: bool,
include_slide_notes: bool,
include_slide_notes: Optional[bool],
infer_table_structure: bool,
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
@ -413,7 +413,7 @@ class PptxPartitionerOptions:
@lazyproperty
def include_slide_notes(self) -> bool:
"""When True, also partition any text found in slide notes as part of each slide."""
return self._include_slide_notes
return False if self._include_slide_notes is None else self._include_slide_notes
def increment_page_number(self) -> Iterator[PageBreak]:
"""Increment page-number by 1 and generate a PageBreak element if enabled."""