fix: ppt parameters include_page_breaks and include_slide_notes (#2996)

Pass the parameters `include_slide_notes` and `include_page_breaks` to
`partition_pptx` from `partition_ppt`.

Also update the .ppt example doc we use for testing so that it has slide
notes and a PageBreak (and a second page).
This commit is contained in:
John 2024-05-10 12:57:36 -05:00 committed by GitHub
parent 293a4a1152
commit 593aa47802
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 22 additions and 8 deletions

View File

@ -10,6 +10,7 @@
* **Add missing starting_page_num param to partition_image** * **Add missing starting_page_num param to partition_image**
* **Make the filename and file params for partition_image and partition_pdf match the other partitioners** * **Make the filename and file params for partition_image and partition_pdf match the other partitioners**
* **Fix include_slide_notes and include_page_breaks params in partition_ppt**
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake * **Re-apply: skip accuracy calculation feature** Overwritten by mistake
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf. * **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.

View File

@ -7,7 +7,7 @@ from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ListItem, NarrativeText, Title from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Title
from unstructured.partition.ppt import partition_ppt from unstructured.partition.ppt import partition_ppt
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
@ -181,9 +181,17 @@ def test_add_chunking_strategy_by_title_on_partition_ppt():
assert chunk_elements == chunks assert chunk_elements == chunks
def test_partition_ppt_element_metadata_has_languages(): def test_partition_ppt_params():
elements = partition_ppt(example_doc_path("fake-power-point.ppt")) """Integration test of params: languages, include_page_break, and include_slide_notes."""
elements = partition_ppt(
example_doc_path("language-docs/eng_spa_mult.ppt"),
include_page_breaks=True,
include_slide_notes=True,
)
assert elements[0].metadata.languages == ["eng"] assert elements[0].metadata.languages == ["eng"]
assert any(isinstance(element, PageBreak) for element in elements)
# The example doc contains a slide note with the text "This is a slide note."
assert any(element.text == "This is a slide note." for element in elements)
def test_partition_ppt_respects_detect_language_per_element(): def test_partition_ppt_respects_detect_language_per_element():

View File

@ -24,6 +24,7 @@ def partition_ppt(
file: Optional[IO[bytes]] = None, file: Optional[IO[bytes]] = None,
include_page_breaks: bool = False, include_page_breaks: bool = False,
include_metadata: bool = True, include_metadata: bool = True,
include_slide_notes: Optional[bool] = None,
infer_table_structure: bool = True, infer_table_structure: bool = True,
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
@ -44,6 +45,8 @@ def partition_ppt(
A file-like object using "rb" mode --> open(filename, "rb"). A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks include_page_breaks
If True, includes a PageBreak element between slides If True, includes a PageBreak element between slides
include_slide_notes
If True, includes the slide notes as element
infer_table_structure infer_table_structure
If True, any Table elements that are extracted will also have a metadata field If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string. named "text_as_html" where the table's text content is rendered into an html string.
@ -102,11 +105,13 @@ def partition_ppt(
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx") pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
elements = partition_pptx( elements = partition_pptx(
filename=pptx_filename, filename=pptx_filename,
detect_language_per_element=detect_language_per_element,
include_page_breaks=include_page_breaks,
include_slide_notes=include_slide_notes,
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
languages=languages,
metadata_filename=metadata_filename, metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
) )

View File

@ -92,7 +92,7 @@ def partition_pptx(
date_from_file_object: bool = False, date_from_file_object: bool = False,
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
include_page_breaks: bool = True, include_page_breaks: bool = True,
include_slide_notes: bool = False, include_slide_notes: Optional[bool] = None,
infer_table_structure: bool = True, infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"], languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
@ -376,7 +376,7 @@ class PptxPartitionerOptions:
file: Optional[IO[bytes]], file: Optional[IO[bytes]],
file_path: Optional[str], file_path: Optional[str],
include_page_breaks: bool, include_page_breaks: bool,
include_slide_notes: bool, include_slide_notes: Optional[bool],
infer_table_structure: bool, infer_table_structure: bool,
metadata_file_path: Optional[str], metadata_file_path: Optional[str],
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
@ -413,7 +413,7 @@ class PptxPartitionerOptions:
@lazyproperty @lazyproperty
def include_slide_notes(self) -> bool: def include_slide_notes(self) -> bool:
"""When True, also partition any text found in slide notes as part of each slide.""" """When True, also partition any text found in slide notes as part of each slide."""
return self._include_slide_notes return False if self._include_slide_notes is None else self._include_slide_notes
def increment_page_number(self) -> Iterator[PageBreak]: def increment_page_number(self) -> Iterator[PageBreak]:
"""Increment page-number by 1 and generate a PageBreak element if enabled.""" """Increment page-number by 1 and generate a PageBreak element if enabled."""