mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 19:16:03 +00:00
fix: ppt parameters include_page_breaks and include_slide_notes (#2996)
Pass the parameters `include_slide_notes` and `include_page_breaks` to `partition_pptx` from `partition_ppt`. Also update the .ppt example doc we use for testing so it has slide notes and a PageBreak (and a second page).
This commit is contained in:
parent
293a4a1152
commit
593aa47802
@ -10,6 +10,7 @@
|
||||
|
||||
* **Add missing starting_page_num param to partition_image**
|
||||
* **Make the filename and file params for partition_image and partition_pdf match the other partitioners**
|
||||
* **Fix include_slide_notes and include_page_breaks params in partition_ppt**
|
||||
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
|
||||
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
|
||||
|
||||
|
||||
Binary file not shown.
@ -7,7 +7,7 @@ from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import ListItem, NarrativeText, Title
|
||||
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Title
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
@ -181,9 +181,17 @@ def test_add_chunking_strategy_by_title_on_partition_ppt():
|
||||
assert chunk_elements == chunks
|
||||
|
||||
|
||||
def test_partition_ppt_element_metadata_has_languages():
|
||||
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
|
||||
def test_partition_ppt_params():
|
||||
"""Integration test of params: languages, include_page_breaks, and include_slide_notes."""
|
||||
elements = partition_ppt(
|
||||
example_doc_path("language-docs/eng_spa_mult.ppt"),
|
||||
include_page_breaks=True,
|
||||
include_slide_notes=True,
|
||||
)
|
||||
assert elements[0].metadata.languages == ["eng"]
|
||||
assert any(isinstance(element, PageBreak) for element in elements)
|
||||
# The example doc contains a slide note with the text "This is a slide note."
|
||||
assert any(element.text == "This is a slide note." for element in elements)
|
||||
|
||||
|
||||
def test_partition_ppt_respects_detect_language_per_element():
|
||||
|
||||
@ -24,6 +24,7 @@ def partition_ppt(
|
||||
file: Optional[IO[bytes]] = None,
|
||||
include_page_breaks: bool = False,
|
||||
include_metadata: bool = True,
|
||||
include_slide_notes: Optional[bool] = None,
|
||||
infer_table_structure: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
@ -44,6 +45,8 @@ def partition_ppt(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_page_breaks
|
||||
If True, includes a PageBreak element between slides
|
||||
include_slide_notes
|
||||
If True, includes the slide notes as elements
|
||||
infer_table_structure
|
||||
If True, any Table elements that are extracted will also have a metadata field
|
||||
named "text_as_html" where the table's text content is rendered into an html string.
|
||||
@ -102,11 +105,13 @@ def partition_ppt(
|
||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||
elements = partition_pptx(
|
||||
filename=pptx_filename,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
include_page_breaks=include_page_breaks,
|
||||
include_slide_notes=include_slide_notes,
|
||||
infer_table_structure=infer_table_structure,
|
||||
languages=languages,
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
starting_page_number=starting_page_number,
|
||||
)
|
||||
|
||||
|
||||
@ -92,7 +92,7 @@ def partition_pptx(
|
||||
date_from_file_object: bool = False,
|
||||
detect_language_per_element: bool = False,
|
||||
include_page_breaks: bool = True,
|
||||
include_slide_notes: bool = False,
|
||||
include_slide_notes: Optional[bool] = None,
|
||||
infer_table_structure: bool = True,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
metadata_filename: Optional[str] = None,
|
||||
@ -376,7 +376,7 @@ class PptxPartitionerOptions:
|
||||
file: Optional[IO[bytes]],
|
||||
file_path: Optional[str],
|
||||
include_page_breaks: bool,
|
||||
include_slide_notes: bool,
|
||||
include_slide_notes: Optional[bool],
|
||||
infer_table_structure: bool,
|
||||
metadata_file_path: Optional[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
@ -413,7 +413,7 @@ class PptxPartitionerOptions:
|
||||
@lazyproperty
|
||||
def include_slide_notes(self) -> bool:
|
||||
"""When True, also partition any text found in slide notes as part of each slide."""
|
||||
return self._include_slide_notes
|
||||
return False if self._include_slide_notes is None else self._include_slide_notes
|
||||
|
||||
def increment_page_number(self) -> Iterator[PageBreak]:
|
||||
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user