mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-06 12:02:12 +00:00
fix: ppt parameters include_page_breaks and include_slide_notes (#2996)
Pass the parameters `include_slide_notes` and `include_page_breaks` to `partition_pptx` from `partition_ppt`. Also update the .ppt example doc we use for testing so it has slide notes and a PageBreak (and a second page).
This commit is contained in:
parent
293a4a1152
commit
593aa47802
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
* **Add missing starting_page_num param to partition_image**
|
* **Add missing starting_page_num param to partition_image**
|
||||||
* **Make the filename and file params for partition_image and partition_pdf match the other partitioners**
|
* **Make the filename and file params for partition_image and partition_pdf match the other partitioners**
|
||||||
|
* **Fix include_slide_notes and include_page_breaks params in partition_ppt**
|
||||||
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
|
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
|
||||||
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
|
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@ -7,7 +7,7 @@ from pytest_mock import MockFixture
|
|||||||
|
|
||||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
from unstructured.documents.elements import ListItem, NarrativeText, Title
|
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Title
|
||||||
from unstructured.partition.ppt import partition_ppt
|
from unstructured.partition.ppt import partition_ppt
|
||||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||||
|
|
||||||
@ -181,9 +181,17 @@ def test_add_chunking_strategy_by_title_on_partition_ppt():
|
|||||||
assert chunk_elements == chunks
|
assert chunk_elements == chunks
|
||||||
|
|
||||||
|
|
||||||
def test_partition_ppt_element_metadata_has_languages():
|
def test_partition_ppt_params():
|
||||||
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
|
"""Integration test of params: languages, include_page_break, and include_slide_notes."""
|
||||||
|
elements = partition_ppt(
|
||||||
|
example_doc_path("language-docs/eng_spa_mult.ppt"),
|
||||||
|
include_page_breaks=True,
|
||||||
|
include_slide_notes=True,
|
||||||
|
)
|
||||||
assert elements[0].metadata.languages == ["eng"]
|
assert elements[0].metadata.languages == ["eng"]
|
||||||
|
assert any(isinstance(element, PageBreak) for element in elements)
|
||||||
|
# The example doc contains a slide note with the text "This is a slide note."
|
||||||
|
assert any(element.text == "This is a slide note." for element in elements)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_ppt_respects_detect_language_per_element():
|
def test_partition_ppt_respects_detect_language_per_element():
|
||||||
|
|||||||
@ -24,6 +24,7 @@ def partition_ppt(
|
|||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
|
include_slide_notes: Optional[bool] = None,
|
||||||
infer_table_structure: bool = True,
|
infer_table_structure: bool = True,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
@ -44,6 +45,8 @@ def partition_ppt(
|
|||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
include_page_breaks
|
include_page_breaks
|
||||||
If True, includes a PageBreak element between slides
|
If True, includes a PageBreak element between slides
|
||||||
|
include_slide_notes
|
||||||
|
If True, includes the slide notes as element
|
||||||
infer_table_structure
|
infer_table_structure
|
||||||
If True, any Table elements that are extracted will also have a metadata field
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
named "text_as_html" where the table's text content is rendered into an html string.
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
@ -102,11 +105,13 @@ def partition_ppt(
|
|||||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||||
elements = partition_pptx(
|
elements = partition_pptx(
|
||||||
filename=pptx_filename,
|
filename=pptx_filename,
|
||||||
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
include_page_breaks=include_page_breaks,
|
||||||
|
include_slide_notes=include_slide_notes,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
|
languages=languages,
|
||||||
metadata_filename=metadata_filename,
|
metadata_filename=metadata_filename,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
languages=languages,
|
|
||||||
detect_language_per_element=detect_language_per_element,
|
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -92,7 +92,7 @@ def partition_pptx(
|
|||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
include_page_breaks: bool = True,
|
include_page_breaks: bool = True,
|
||||||
include_slide_notes: bool = False,
|
include_slide_notes: Optional[bool] = None,
|
||||||
infer_table_structure: bool = True,
|
infer_table_structure: bool = True,
|
||||||
languages: Optional[list[str]] = ["auto"],
|
languages: Optional[list[str]] = ["auto"],
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
@ -376,7 +376,7 @@ class PptxPartitionerOptions:
|
|||||||
file: Optional[IO[bytes]],
|
file: Optional[IO[bytes]],
|
||||||
file_path: Optional[str],
|
file_path: Optional[str],
|
||||||
include_page_breaks: bool,
|
include_page_breaks: bool,
|
||||||
include_slide_notes: bool,
|
include_slide_notes: Optional[bool],
|
||||||
infer_table_structure: bool,
|
infer_table_structure: bool,
|
||||||
metadata_file_path: Optional[str],
|
metadata_file_path: Optional[str],
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
@ -413,7 +413,7 @@ class PptxPartitionerOptions:
|
|||||||
@lazyproperty
|
@lazyproperty
|
||||||
def include_slide_notes(self) -> bool:
|
def include_slide_notes(self) -> bool:
|
||||||
"""When True, also partition any text found in slide notes as part of each slide."""
|
"""When True, also partition any text found in slide notes as part of each slide."""
|
||||||
return self._include_slide_notes
|
return False if self._include_slide_notes is None else self._include_slide_notes
|
||||||
|
|
||||||
def increment_page_number(self) -> Iterator[PageBreak]:
|
def increment_page_number(self) -> Iterator[PageBreak]:
|
||||||
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
|
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user