mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-22 14:59:46 +00:00
rfctr(ppt): remove double-decoration (#3701)
Somehow this slipped through the earlier PR that removed double-decoration from PPTX. Remove the decorators from PPT (because it is a delegating partitioner) and let the decorators on the partitioner it delegates to (`partition_pptx()`) do that work.
This commit is contained in:
parent
27fa2a39d8
commit
06c85235ee
@ -1,4 +1,4 @@
|
|||||||
## 0.15.14-dev12
|
## 0.15.14-dev13
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -13,10 +13,11 @@
|
|||||||
* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
|
* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
|
||||||
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
|
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
|
||||||
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
|
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
|
||||||
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
* **Remove double-decoration for PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
||||||
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
|
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
|
||||||
* **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`.
|
* **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`.
|
||||||
* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG.
|
* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG.
|
||||||
|
* **Remove double-decoration for PPT.** Remove decorators from the delegating PPT partitioner.
|
||||||
|
|
||||||
## 0.15.13
|
## 0.15.13
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.15.14-dev12" # pragma: no cover
|
__version__ = "0.15.14-dev13" # pragma: no cover
|
||||||
|
@ -4,31 +4,18 @@ import os
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import IO, Any, Optional
|
from typing import IO, Any, Optional
|
||||||
|
|
||||||
from unstructured.chunking import add_chunking_strategy
|
from unstructured.documents.elements import Element
|
||||||
from unstructured.documents.elements import Element, process_metadata
|
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common.common import convert_office_doc, exactly_one
|
from unstructured.partition.common.common import convert_office_doc, exactly_one
|
||||||
from unstructured.partition.common.metadata import get_last_modified_date
|
from unstructured.partition.common.metadata import get_last_modified_date
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
|
||||||
|
|
||||||
|
|
||||||
@process_metadata()
|
|
||||||
@add_metadata_with_filetype(FileType.PPT)
|
|
||||||
@add_chunking_strategy
|
|
||||||
def partition_ppt(
|
def partition_ppt(
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
include_page_breaks: bool = False,
|
|
||||||
include_slide_notes: Optional[bool] = None,
|
|
||||||
infer_table_structure: bool = True,
|
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
languages: Optional[list[str]] = ["auto"],
|
|
||||||
detect_language_per_element: bool = False,
|
|
||||||
starting_page_number: int = 1,
|
|
||||||
strategy: str = PartitionStrategy.FAST,
|
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
|
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
|
||||||
@ -39,29 +26,11 @@ def partition_ppt(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
include_page_breaks
|
|
||||||
If True, includes a PageBreak element between slides
|
|
||||||
include_slide_notes
|
|
||||||
If True, includes the slide notes as element
|
|
||||||
infer_table_structure
|
|
||||||
If True, any Table elements that are extracted will also have a metadata field
|
|
||||||
named "text_as_html" where the table's text content is rendered into an html string.
|
|
||||||
I.e., rows and cells are preserved.
|
|
||||||
Whether True or False, the "text" field is always present in any Table element
|
|
||||||
and is the text content of the table (no structure).
|
|
||||||
metadata_last_modified
|
metadata_last_modified
|
||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
languages
|
|
||||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
Note that all arguments valid on `partition_pptx()` are also valid here and will be passed
|
||||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
along to the `partition_pptx()` function.
|
||||||
in either language.
|
|
||||||
Additional Parameters:
|
|
||||||
detect_language_per_element
|
|
||||||
Detect language per element instead of at the document level.
|
|
||||||
starting_page_number
|
|
||||||
Indicates what page number should be assigned to the first slide in the presentation.
|
|
||||||
This information will be reflected in elements' metadata and can be especially
|
|
||||||
useful when partitioning a document that is part of a larger document.
|
|
||||||
"""
|
"""
|
||||||
# -- Verify that only one of the arguments was provided
|
# -- Verify that only one of the arguments was provided
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
@ -92,17 +61,13 @@ def partition_ppt(
|
|||||||
target_filter="Impress MS PowerPoint 2007 XML",
|
target_filter="Impress MS PowerPoint 2007 XML",
|
||||||
)
|
)
|
||||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||||
|
|
||||||
elements = partition_pptx(
|
elements = partition_pptx(
|
||||||
filename=pptx_filename,
|
filename=pptx_filename,
|
||||||
detect_language_per_element=detect_language_per_element,
|
metadata_filename=metadata_filename or filename,
|
||||||
include_page_breaks=include_page_breaks,
|
metadata_file_type=FileType.PPT,
|
||||||
include_slide_notes=include_slide_notes,
|
|
||||||
infer_table_structure=infer_table_structure,
|
|
||||||
languages=languages,
|
|
||||||
metadata_filename=metadata_filename,
|
|
||||||
metadata_last_modified=metadata_last_modified or last_modified,
|
metadata_last_modified=metadata_last_modified or last_modified,
|
||||||
starting_page_number=starting_page_number,
|
**kwargs,
|
||||||
strategy=strategy,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# -- Remove tmp.name from filename if parsing file
|
# -- Remove tmp.name from filename if parsing file
|
||||||
|
Loading…
x
Reference in New Issue
Block a user