mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-21 22:40:43 +00:00
rfctr(ppt): remove double-decoration (#3701)
This slipped through the earlier PR that removed double-decoration from PPTX. Remove the decorators from PPT (because it is a delegating partitioner) and rely on the decorators on the partitioner it delegates to (`partition_pptx()`) to do that work.
This commit is contained in:
parent
27fa2a39d8
commit
06c85235ee
@ -1,4 +1,4 @@
|
||||
## 0.15.14-dev12
|
||||
## 0.15.14-dev13
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -13,10 +13,11 @@
|
||||
* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements passed to `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
|
||||
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
|
||||
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
|
||||
* **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`.
|
||||
* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG.
|
||||
* **Remove double-decoration for PPT.** Remove decorators from the delegating PPT partitioner.
|
||||
|
||||
## 0.15.13
|
||||
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.14-dev12" # pragma: no cover
|
||||
__version__ = "0.15.14-dev13" # pragma: no cover
|
||||
|
@ -4,31 +4,18 @@ import os
|
||||
import tempfile
|
||||
from typing import IO, Any, Optional
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import convert_office_doc, exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PPT)
|
||||
@add_chunking_strategy
|
||||
def partition_ppt(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
include_page_breaks: bool = False,
|
||||
include_slide_notes: Optional[bool] = None,
|
||||
infer_table_structure: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
starting_page_number: int = 1,
|
||||
strategy: str = PartitionStrategy.FAST,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
|
||||
@ -39,29 +26,11 @@ def partition_ppt(
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_page_breaks
|
||||
If True, includes a PageBreak element between slides
|
||||
include_slide_notes
|
||||
If True, includes the slide notes as elements
|
||||
infer_table_structure
|
||||
If True, any Table elements that are extracted will also have a metadata field
|
||||
named "text_as_html" where the table's text content is rendered into an html string.
|
||||
I.e., rows and cells are preserved.
|
||||
Whether True or False, the "text" field is always present in any Table element
|
||||
and is the text content of the table (no structure).
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
starting_page_number
|
||||
Indicates what page number should be assigned to the first slide in the presentation.
|
||||
This information will be reflected in elements' metadata and can be especially
|
||||
useful when partitioning a document that is part of a larger document.
|
||||
|
||||
Note that all arguments valid on `partition_pptx()` are also valid here and will be passed
|
||||
along to the `partition_pptx()` function.
|
||||
"""
|
||||
# -- Verify that only one of the arguments was provided
|
||||
exactly_one(filename=filename, file=file)
|
||||
@ -92,17 +61,13 @@ def partition_ppt(
|
||||
target_filter="Impress MS PowerPoint 2007 XML",
|
||||
)
|
||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||
|
||||
elements = partition_pptx(
|
||||
filename=pptx_filename,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
include_page_breaks=include_page_breaks,
|
||||
include_slide_notes=include_slide_notes,
|
||||
infer_table_structure=infer_table_structure,
|
||||
languages=languages,
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_filename=metadata_filename or filename,
|
||||
metadata_file_type=FileType.PPT,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# -- Remove tmp.name from filename if parsing file
|
||||
|
Loading…
x
Reference in New Issue
Block a user