From 06c85235ee8f014eae417b44ca17872f13960280 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Fri, 4 Oct 2024 21:41:10 -0700 Subject: [PATCH] rfctr(ppt): remove double-decoration (#3701) Somehow this slipped through the earlier PR removing double-decoration from PPTX. Remove the decorators from PPT (because it is a delegating partitioner) and let the decorators on the principal partitioner (`partition_pptx()`) do that work. --- CHANGELOG.md | 5 ++-- unstructured/__version__.py | 2 +- unstructured/partition/ppt.py | 51 ++++++----------------------------- 3 files changed, 12 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e1fbb6a5..6f6595a3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.14-dev12 +## 0.15.14-dev13 ### Enhancements @@ -13,10 +13,11 @@ * **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which lead to error when mapping parent id. * **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned. * **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners. -* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners. +* **Remove double-decoration for PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners. 
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners. * **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`. * **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG. +* **Remove double-decoration for PPT.** Remove decorators from the delegating PPT partitioner. ## 0.15.13 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 97d7bc2e3..a855a3dd4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev12" # pragma: no cover +__version__ = "0.15.14-dev13" # pragma: no cover diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index f8f831624..420560f18 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -4,31 +4,18 @@ import os import tempfile from typing import IO, Any, Optional -from unstructured.chunking import add_chunking_strategy -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.documents.elements import Element from unstructured.file_utils.model import FileType from unstructured.partition.common.common import convert_office_doc, exactly_one from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.pptx import partition_pptx 
-from unstructured.partition.utils.constants import PartitionStrategy -@process_metadata() -@add_metadata_with_filetype(FileType.PPT) -@add_chunking_strategy def partition_ppt( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, - include_page_breaks: bool = False, - include_slide_notes: Optional[bool] = None, - infer_table_structure: bool = True, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - starting_page_number: int = 1, - strategy: str = PartitionStrategy.FAST, **kwargs: Any, ) -> list[Element]: """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements. @@ -39,29 +26,11 @@ def partition_ppt( A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). - include_page_breaks - If True, includes a PageBreak element between slides - include_slide_notes - If True, includes the slide notes as element - infer_table_structure - If True, any Table elements that are extracted will also have a metadata field - named "text_as_html" where the table's text content is rendered into an html string. - I.e., rows and cells are preserved. - Whether True or False, the "text" field is always present in any Table element - and is the text content of the table (no structure). metadata_last_modified The last modified date for the document. - languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. - starting_page_number - Indicates what page number should be assigned to the first slide in the presentation. 
- This information will be reflected in elements' metadata and can be be especially - useful when partitioning a document that is part of a larger document. + + Note that all arguments valid on `partition_pptx()` are also valid here and will be passed + along to the `partition_pptx()` function. """ # -- Verify that only one of the arguments was provided exactly_one(filename=filename, file=file) @@ -92,17 +61,13 @@ def partition_ppt( target_filter="Impress MS PowerPoint 2007 XML", ) pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx") + elements = partition_pptx( filename=pptx_filename, - detect_language_per_element=detect_language_per_element, - include_page_breaks=include_page_breaks, - include_slide_notes=include_slide_notes, - infer_table_structure=infer_table_structure, - languages=languages, - metadata_filename=metadata_filename, + metadata_filename=metadata_filename or filename, + metadata_file_type=FileType.PPT, metadata_last_modified=metadata_last_modified or last_modified, - starting_page_number=starting_page_number, - strategy=strategy, + **kwargs, ) # -- Remove tmp.name from filename if parsing file