mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
rfctr(part): remove double-decoration 2 (#3686)
**Summary** Install new `@apply_metadata()` on PPTX, TSV, XLSX, and XML and remove decoration from PPT. **Additional Context** - Alphabetical order turns out to be hard, so this is the remaining "easy" delegating partitioner and the remaining principal partitioners. - Replace use of `@process_metadata()` and `@add_metadata_with_filetype()` decorators with `@apply_metadata()` on principal partitioners (those that do not delegate to other partitioners). - Remove all decorators from delegating partitioners (PPT in this case); this removes the "double-decorating".
This commit is contained in:
parent
bba60260b2
commit
17092198d0
@ -1,4 +1,4 @@
|
||||
## 0.15.14-dev7
|
||||
## 0.15.14-dev8
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
|
||||
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
|
||||
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
||||
|
||||
## 0.15.13
|
||||
|
||||
|
@ -30,11 +30,6 @@ def test_partition_ppt_from_filename():
|
||||
assert {element.metadata.detection_origin for element in elements} == {"pptx"}
|
||||
|
||||
|
||||
def test_partition_ppt_from_filename_with_metadata_filename():
|
||||
elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test")
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_ppt_raises_with_missing_file():
|
||||
with pytest.raises(ValueError):
|
||||
partition_ppt(example_doc_path("doesnt-exist.ppt"))
|
||||
@ -67,6 +62,38 @@ def test_partition_ppt_raises_when_neither_file_path_or_file_is_provided():
|
||||
partition_ppt()
|
||||
|
||||
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_ppt_from_filename_gets_filename_from_filename_arg():
|
||||
elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename == "fake-power-point.ppt" for e in elements)
|
||||
|
||||
|
||||
def test_partition_ppt_from_file_gets_filename_None():
|
||||
with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
|
||||
elements = partition_ppt(file=f)
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_ppt_from_filename_prefers_metadata_filename():
|
||||
elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test")
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_ppt_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
|
||||
elements = partition_ppt(file=f, metadata_filename="test")
|
||||
|
||||
assert all(e.metadata.filename == "test" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -22,6 +22,7 @@ from test_unstructured.unit_utils import (
|
||||
assert_round_trips_through_JSON,
|
||||
example_doc_path,
|
||||
function_mock,
|
||||
property_mock,
|
||||
)
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import (
|
||||
@ -351,7 +352,7 @@ def test_partition_pptx_from_file_prefers_metadata_last_modified():
|
||||
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
# -- .metadata.languages -------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_pptx_element_metadata_has_languages():
|
||||
@ -374,7 +375,7 @@ def test_partition_pptx_respects_detect_language_per_element():
|
||||
|
||||
def test_partition_pptx_raises_TypeError_for_invalid_languages():
|
||||
with pytest.raises(TypeError):
|
||||
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore
|
||||
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng")
|
||||
|
||||
|
||||
# == downstream behaviors ========================================================================
|
||||
@ -492,7 +493,7 @@ def test_partition_pptx_hierarchy_sample_document():
|
||||
test_cases = [
|
||||
(0, None, "b2859226ba1f9243fb3f1b2ace889f43"),
|
||||
(1, "b2859226ba1f9243fb3f1b2ace889f43", "d13f8827e94541c8b818b0df8f942526"),
|
||||
(None, None, "1ffd3151819e594553e6b540e19e6c36"),
|
||||
(None, None, "cbb95b030de22979af6bfa42969c8202"),
|
||||
(0, None, "e535f799d1f0e79d6777efa873a16ce1"),
|
||||
(0, "e535f799d1f0e79d6777efa873a16ce1", "f02bbfb417ad60daa2ba35080e96262f"),
|
||||
(0, "e535f799d1f0e79d6777efa873a16ce1", "414dfce72ea53cd4649176af0d62a4c1"),
|
||||
@ -500,7 +501,7 @@ def test_partition_pptx_hierarchy_sample_document():
|
||||
(1, "414dfce72ea53cd4649176af0d62a4c1", "a33333f527851f700ca175acd04b8a2c"),
|
||||
(2, "a33333f527851f700ca175acd04b8a2c", "6f1b87689e4da2b0fb865bc5f92d5702"),
|
||||
(0, "e535f799d1f0e79d6777efa873a16ce1", "3f58e0be3b8e8b15cba7adc4eae68586"),
|
||||
(None, None, "1ffd3151819e594553e6b540e19e6c36"),
|
||||
(None, None, "e5de1b503e64da424fb7d8113371e16d"),
|
||||
(0, None, "8319096532fe2e55f66c491ea8313150"),
|
||||
(0, "8319096532fe2e55f66c491ea8313150", "17a7e78277ab131a627cb4538bab7390"),
|
||||
(0, "8319096532fe2e55f66c491ea8313150", "41a9e1d0390f4edd77181142ceae51bc"),
|
||||
@ -514,7 +515,7 @@ def test_partition_pptx_hierarchy_sample_document():
|
||||
(1, "7f647b1f0f20c3db40c36ab57d9a5550", "6ec455f5f19782facf184886876c9a66"),
|
||||
(2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
|
||||
(0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
|
||||
(None, None, "1ffd3151819e594553e6b540e19e6c36"),
|
||||
(None, None, "4120066d251ba675ade42e8a167ca61f"),
|
||||
(None, None, "2ed3bd10daace79ac129cbf8faf22bfc"),
|
||||
(0, None, "fd08cacbaddafee5cbacc02528536ee5"),
|
||||
]
|
||||
@ -545,8 +546,6 @@ def opts_args() -> dict[str, Any]:
|
||||
"include_page_breaks": True,
|
||||
"include_slide_notes": False,
|
||||
"infer_table_structure": True,
|
||||
"metadata_file_path": None,
|
||||
"metadata_last_modified": None,
|
||||
"strategy": "fast",
|
||||
}
|
||||
|
||||
@ -632,15 +631,7 @@ class DescribePptxPartitionerOptions:
|
||||
|
||||
# -- .last_modified --------------------------
|
||||
|
||||
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
|
||||
opts = PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.last_modified == "2024-03-05T17:02:53"
|
||||
|
||||
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
|
||||
def it_gets_last_modified_from_the_filesystem_when_a_path_is_provided(
|
||||
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
|
||||
):
|
||||
opts_args["file_path"] = "a/b/spreadsheet.pptx"
|
||||
@ -665,21 +656,11 @@ class DescribePptxPartitionerOptions:
|
||||
|
||||
# -- .metadata_file_path ---------------------
|
||||
|
||||
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = "x/y/z.pptx"
|
||||
opts_args["metadata_file_path"] = "a/b/c.pptx"
|
||||
opts = PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == "a/b/c.pptx"
|
||||
|
||||
@pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
|
||||
def and_it_falls_back_to_the_document_file_path_otherwise(
|
||||
def it_uses_the_filename_argument_when_provided(
|
||||
self, file_path: str | None, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = file_path
|
||||
opts_args["metadata_file_path"] = None
|
||||
opts = PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == file_path
|
||||
@ -769,9 +750,11 @@ class DescribePptxPartitionerOptions:
|
||||
|
||||
# -- .table_metadata -------------------------
|
||||
|
||||
def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
|
||||
opts_args["metadata_file_path"] = "d/e/f.pptx"
|
||||
opts_args["metadata_last_modified"] = "2024-04-02T19:51:55"
|
||||
def it_can_create_table_metadata(
|
||||
self, last_modified_prop_: Mock, metadata_file_path_prop_: Mock, opts_args: dict[str, Any]
|
||||
):
|
||||
metadata_file_path_prop_.return_value = "d/e/f.pptx"
|
||||
last_modified_prop_.return_value = "2024-04-02T19:51:55"
|
||||
opts = PptxPartitionerOptions(**opts_args)
|
||||
# -- move to the first slide --
|
||||
list(opts.increment_page_number())
|
||||
@ -786,9 +769,11 @@ class DescribePptxPartitionerOptions:
|
||||
|
||||
# -- .text_metadata -------------------------
|
||||
|
||||
def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
|
||||
opts_args["metadata_file_path"] = "d/e/f.pptx"
|
||||
opts_args["metadata_last_modified"] = "2024-04-02T19:56:40"
|
||||
def it_can_create_text_metadata(
|
||||
self, last_modified_prop_: Mock, metadata_file_path_prop_: Mock, opts_args: dict[str, Any]
|
||||
):
|
||||
metadata_file_path_prop_.return_value = "d/e/f.pptx"
|
||||
last_modified_prop_.return_value = "2024-04-02T19:56:40"
|
||||
opts = PptxPartitionerOptions(**opts_args)
|
||||
# -- move to the first slide --
|
||||
list(opts.increment_page_number())
|
||||
@ -806,3 +791,11 @@ class DescribePptxPartitionerOptions:
|
||||
@pytest.fixture()
|
||||
def get_last_modified_date_(self, request: FixtureRequest):
|
||||
return function_mock(request, "unstructured.partition.pptx.get_last_modified_date")
|
||||
|
||||
@pytest.fixture()
|
||||
def last_modified_prop_(self, request: FixtureRequest):
|
||||
return property_mock(request, PptxPartitionerOptions, "last_modified")
|
||||
|
||||
@pytest.fixture()
|
||||
def metadata_file_path_prop_(self, request: FixtureRequest):
|
||||
return property_mock(request, PptxPartitionerOptions, "metadata_file_path")
|
||||
|
@ -321,15 +321,6 @@ def test_partition_xlsx_with_more_than_1k_cells():
|
||||
class Describe_XlsxPartitionerOptions:
|
||||
"""Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects."""
|
||||
|
||||
@pytest.mark.parametrize("arg_value", [True, False])
|
||||
def it_knows_whether_to_detect_language_for_each_element_individually(
|
||||
self, arg_value: bool, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["detect_language_per_element"] = arg_value
|
||||
opts = _XlsxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.detect_language_per_element is arg_value
|
||||
|
||||
@pytest.mark.parametrize("arg_value", [True, False])
|
||||
def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet(
|
||||
self, arg_value: bool, opts_args: dict[str, Any]
|
||||
@ -366,37 +357,20 @@ class Describe_XlsxPartitionerOptions:
|
||||
|
||||
assert opts.infer_table_structure is arg_value
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg_value", "expected_value"),
|
||||
[(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])],
|
||||
)
|
||||
def it_knows_what_languages_the_caller_expects_to_appear_in_the_text(
|
||||
self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["languages"] = arg_value
|
||||
opts = _XlsxPartitionerOptions(**opts_args)
|
||||
# -- .last_modified --------------------------------------------------------------------------
|
||||
|
||||
assert opts.languages == expected_value
|
||||
|
||||
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
|
||||
opts = _XlsxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.last_modified == "2024-03-05T17:02:53"
|
||||
|
||||
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
|
||||
def it_gets_last_modified_from_the_filesystem_when_a_path_is_provided(
|
||||
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
|
||||
):
|
||||
filesystem_last_modified = "2024-04-02T20:32:35"
|
||||
opts_args["file_path"] = "a/b/spreadsheet.xlsx"
|
||||
get_last_modified_date_.return_value = "2024-04-02T20:32:35"
|
||||
get_last_modified_date_.return_value = filesystem_last_modified
|
||||
opts = _XlsxPartitionerOptions(**opts_args)
|
||||
|
||||
last_modified = opts.last_modified
|
||||
|
||||
get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx")
|
||||
assert last_modified == "2024-04-02T20:32:35"
|
||||
assert last_modified == filesystem_last_modified
|
||||
|
||||
def but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
@ -409,24 +383,13 @@ class Describe_XlsxPartitionerOptions:
|
||||
|
||||
assert last_modified is None
|
||||
|
||||
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
# -- .metadata_file_path ---------------------------------------------------------------------
|
||||
|
||||
def it_uses_the_file_path_argument_when_provided(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = "x/y/z.xlsx"
|
||||
opts_args["metadata_file_path"] = "a/b/c.xlsx"
|
||||
opts = _XlsxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == "a/b/c.xlsx"
|
||||
|
||||
@pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None])
|
||||
def and_it_falls_back_to_the_document_file_path_otherwise(
|
||||
self, file_path: str | None, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = file_path
|
||||
opts_args["metadata_file_path"] = None
|
||||
opts = _XlsxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == file_path
|
||||
assert opts.metadata_file_path == "x/y/z.xlsx"
|
||||
|
||||
# -- fixtures --------------------------------------------------------------------------------
|
||||
|
||||
@ -442,15 +405,11 @@ class Describe_XlsxPartitionerOptions:
|
||||
compact for testing purposes.
|
||||
"""
|
||||
return {
|
||||
"detect_language_per_element": False,
|
||||
"file": None,
|
||||
"file_path": None,
|
||||
"file": None,
|
||||
"find_subtable": True,
|
||||
"include_header": False,
|
||||
"infer_table_structure": True,
|
||||
"languages": ["auto"],
|
||||
"metadata_file_path": None,
|
||||
"metadata_last_modified": None,
|
||||
}
|
||||
|
||||
|
||||
|
@ -114,6 +114,18 @@ def test_partition_xml_from_file_rb_with_tags_raises_encoding_error():
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_xml_gets_the_XML_mime_type_in_metadata_filetype():
|
||||
XML_MIME_TYPE = "application/xml"
|
||||
elements = partition_xml(example_doc_path("factbook.xml"))
|
||||
assert all(e.metadata.filetype == XML_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{XML_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.14-dev7" # pragma: no cover
|
||||
__version__ = "0.15.14-dev8" # pragma: no cover
|
||||
|
@ -20,6 +20,7 @@ DETECTION_ORIGIN: str = "csv"
|
||||
@add_chunking_strategy
|
||||
def partition_csv(
|
||||
filename: str | None = None,
|
||||
*,
|
||||
file: IO[bytes] | None = None,
|
||||
encoding: str | None = None,
|
||||
include_header: bool = False,
|
||||
|
@ -8,7 +8,7 @@ from __future__ import annotations
|
||||
|
||||
import io
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, Any, Iterator, Optional, Protocol, Sequence
|
||||
from typing import IO, Any, Iterator, Protocol, Sequence
|
||||
|
||||
import pptx
|
||||
from pptx.presentation import Presentation
|
||||
@ -32,13 +32,10 @@ from unstructured.documents.elements import (
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import convert_ms_office_table_to_text
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
||||
from unstructured.partition.text_type import (
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
@ -80,20 +77,15 @@ class AbstractPicturePartitioner(Protocol):
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PPTX)
|
||||
@apply_metadata(FileType.PPTX)
|
||||
@add_chunking_strategy
|
||||
def partition_pptx(
|
||||
filename: Optional[str] = None,
|
||||
filename: str | None = None,
|
||||
*,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
detect_language_per_element: bool = False,
|
||||
file: IO[bytes] | None = None,
|
||||
include_page_breaks: bool = True,
|
||||
include_slide_notes: Optional[bool] = None,
|
||||
include_slide_notes: bool | None = None,
|
||||
infer_table_structure: bool = True,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
starting_page_number: int = 1,
|
||||
strategy: str = PartitionStrategy.FAST,
|
||||
**kwargs: Any,
|
||||
@ -108,12 +100,6 @@ def partition_pptx(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_page_breaks
|
||||
If True, includes a PageBreak element between slides
|
||||
metadata_filename
|
||||
The filename to use for the metadata. Relevant because partition_ppt() converts its
|
||||
(legacy) .ppt document to .pptx before partition. We want the filename of the original
|
||||
.ppt source file in the metadata.
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
include_slide_notes
|
||||
If True, includes the slide notes as element
|
||||
infer_table_structure
|
||||
@ -122,13 +108,6 @@ def partition_pptx(
|
||||
I.e., rows and cells are preserved.
|
||||
Whether True or False, the "text" field is always present in any Table element
|
||||
and is the text content of the table (no structure).
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
starting_page_number
|
||||
Indicates what page number should be assigned to the first slide in the presentation.
|
||||
This information will be reflected in elements' metadata and can be especially
|
||||
@ -140,19 +119,11 @@ def partition_pptx(
|
||||
include_page_breaks=include_page_breaks,
|
||||
include_slide_notes=include_slide_notes,
|
||||
infer_table_structure=infer_table_structure,
|
||||
metadata_file_path=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
strategy=strategy,
|
||||
starting_page_number=starting_page_number,
|
||||
)
|
||||
|
||||
elements = _PptxPartitioner.iter_presentation_elements(opts)
|
||||
elements = apply_lang_metadata(
|
||||
elements=elements,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
)
|
||||
return list(elements)
|
||||
return list(_PptxPartitioner.iter_presentation_elements(opts))
|
||||
|
||||
|
||||
class _PptxPartitioner:
|
||||
@ -321,7 +292,7 @@ class _PptxPartitioner:
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
)
|
||||
|
||||
def _order_shapes(self, slide: Slide) -> tuple[Optional[Shape], Sequence[BaseShape]]:
|
||||
def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]:
|
||||
"""Orders the shapes on `slide` from top to bottom and left to right.
|
||||
|
||||
Returns the title shape if it exists and the ordered shapes."""
|
||||
@ -365,13 +336,11 @@ class PptxPartitionerOptions:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
file: Optional[IO[bytes]],
|
||||
file_path: Optional[str],
|
||||
file: IO[bytes] | None,
|
||||
file_path: str | None,
|
||||
include_page_breaks: bool,
|
||||
include_slide_notes: Optional[bool],
|
||||
include_slide_notes: bool | None,
|
||||
infer_table_structure: bool,
|
||||
metadata_file_path: Optional[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
strategy: str,
|
||||
starting_page_number: int = 1,
|
||||
):
|
||||
@ -380,8 +349,6 @@ class PptxPartitionerOptions:
|
||||
self._include_page_breaks = include_page_breaks
|
||||
self._include_slide_notes = include_slide_notes
|
||||
self._infer_table_structure = infer_table_structure
|
||||
self._metadata_file_path = metadata_file_path
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._strategy = strategy
|
||||
# -- options object maintains page-number state --
|
||||
self._page_counter = starting_page_number - 1
|
||||
@ -417,7 +384,9 @@ class PptxPartitionerOptions:
|
||||
yield PageBreak(
|
||||
"",
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
metadata=ElementMetadata(last_modified=self.last_modified),
|
||||
metadata=ElementMetadata(
|
||||
last_modified=self.last_modified, page_number=self.page_number - 1
|
||||
),
|
||||
)
|
||||
|
||||
@lazyproperty
|
||||
@ -426,27 +395,19 @@ class PptxPartitionerOptions:
|
||||
return self._infer_table_structure
|
||||
|
||||
@lazyproperty
|
||||
def last_modified(self) -> Optional[str]:
|
||||
def last_modified(self) -> str | None:
|
||||
"""The best last-modified date available, None if no sources are available."""
|
||||
# -- Value explicitly specified by caller takes precedence. This is used for example when
|
||||
# -- this file was converted from another format, and any last-modified date for the file
|
||||
# -- would be just now.
|
||||
if self._metadata_last_modified:
|
||||
return self._metadata_last_modified
|
||||
if not self._file_path:
|
||||
return None
|
||||
|
||||
if self._file_path:
|
||||
return (
|
||||
None
|
||||
if is_temp_file_path(self._file_path)
|
||||
else get_last_modified_date(self._file_path)
|
||||
)
|
||||
|
||||
return None
|
||||
return (
|
||||
None if is_temp_file_path(self._file_path) else get_last_modified_date(self._file_path)
|
||||
)
|
||||
|
||||
@lazyproperty
|
||||
def metadata_file_path(self) -> str | None:
|
||||
"""The best available file-path for this document or `None` if unavailable."""
|
||||
return self._metadata_file_path or self._file_path
|
||||
return self._file_path
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
|
@ -6,36 +6,24 @@ import pandas as pd
|
||||
from lxml.html.soupparser import fromstring as soupparser_fromstring
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Table,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import (
|
||||
exactly_one,
|
||||
spooled_to_bytes_io_if_needed,
|
||||
)
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
||||
|
||||
DETECTION_ORIGIN: str = "tsv"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.TSV)
|
||||
@apply_metadata(FileType.TSV)
|
||||
@add_chunking_strategy
|
||||
def partition_tsv(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
include_header: bool = False,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
# NOTE (jennings) partition_tsv generates a single TableElement
|
||||
# so detect_language_per_element is not included as a param
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions TSV files into document elements.
|
||||
@ -48,17 +36,9 @@ def partition_tsv(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_header
|
||||
Determines whether or not header info is included in text and metadata.text_as_html.
|
||||
metadata_last_modified
|
||||
The day of the last modification.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
last_modified = get_last_modified_date(filename) if filename else None
|
||||
|
||||
header = 0 if include_header else None
|
||||
|
||||
if filename:
|
||||
@ -75,14 +55,9 @@ def partition_tsv(
|
||||
|
||||
metadata = ElementMetadata(
|
||||
text_as_html=html_text,
|
||||
filename=metadata_filename or filename,
|
||||
last_modified=metadata_last_modified or last_modified,
|
||||
languages=languages,
|
||||
filename=filename,
|
||||
last_modified=get_last_modified_date(filename) if filename else None,
|
||||
)
|
||||
metadata.detection_origin = DETECTION_ORIGIN
|
||||
|
||||
elements = apply_lang_metadata(
|
||||
[Table(text=text, metadata=metadata)],
|
||||
languages=languages,
|
||||
)
|
||||
return list(elements)
|
||||
return [Table(text=text, metadata=metadata)]
|
||||
|
@ -22,12 +22,9 @@ from unstructured.documents.elements import (
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_possible_narrative_text,
|
||||
@ -41,19 +38,15 @@ _CellCoordinate: TypeAlias = "tuple[int, int]"
|
||||
DETECTION_ORIGIN: str = "xlsx"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.XLSX)
|
||||
@apply_metadata(FileType.XLSX)
|
||||
@add_chunking_strategy
|
||||
def partition_xlsx(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
infer_table_structure: bool = True,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
include_header: bool = False,
|
||||
find_subtable: bool = True,
|
||||
include_header: bool = False,
|
||||
infer_table_structure: bool = True,
|
||||
starting_page_number: int = 1,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
@ -71,28 +64,15 @@ def partition_xlsx(
|
||||
I.e., rows and cells are preserved.
|
||||
Whether True or False, the "text" field is always present in any Table element
|
||||
and is the text content of the table (no structure).
|
||||
languages
|
||||
User defined value for metadata.languages if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
metadata_last_modified
|
||||
The day of the last modification
|
||||
include_header
|
||||
Determines whether or not header info is included in text and metadata.text_as_html
|
||||
"""
|
||||
opts = _XlsxPartitionerOptions(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
file=file,
|
||||
file_path=filename,
|
||||
file=file,
|
||||
find_subtable=find_subtable,
|
||||
include_header=include_header,
|
||||
infer_table_structure=infer_table_structure,
|
||||
languages=languages,
|
||||
metadata_file_path=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
)
|
||||
|
||||
elements: list[Element] = []
|
||||
@ -151,13 +131,6 @@ def partition_xlsx(
|
||||
element.metadata = _get_metadata(sheet_name, page_number, opts)
|
||||
elements.append(element)
|
||||
|
||||
elements = list(
|
||||
apply_lang_metadata(
|
||||
elements=elements,
|
||||
languages=opts.languages,
|
||||
detect_language_per_element=opts.detect_language_per_element,
|
||||
),
|
||||
)
|
||||
return elements
|
||||
|
||||
|
||||
@ -167,30 +140,17 @@ class _XlsxPartitionerOptions:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
detect_language_per_element: bool,
|
||||
file: Optional[IO[bytes]],
|
||||
file_path: Optional[str],
|
||||
file: Optional[IO[bytes]],
|
||||
find_subtable: bool,
|
||||
include_header: bool,
|
||||
infer_table_structure: bool,
|
||||
languages: Optional[list[str]],
|
||||
metadata_file_path: Optional[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
):
|
||||
self._detect_language_per_element = detect_language_per_element
|
||||
self._file = file
|
||||
self._file_path = file_path
|
||||
self._file = file
|
||||
self._find_subtable = find_subtable
|
||||
self._include_header = include_header
|
||||
self._infer_table_structure = infer_table_structure
|
||||
self._languages = languages
|
||||
self._metadata_file_path = metadata_file_path
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
|
||||
@lazyproperty
|
||||
def detect_language_per_element(self) -> bool:
|
||||
"""When True, detect language on element-by-element basis instead of document level."""
|
||||
return self._detect_language_per_element
|
||||
|
||||
@lazyproperty
|
||||
def find_subtable(self) -> bool:
|
||||
@ -215,31 +175,15 @@ class _XlsxPartitionerOptions:
|
||||
"""True when partitioner should compute and apply `text_as_html` metadata."""
|
||||
return self._infer_table_structure
|
||||
|
||||
@lazyproperty
|
||||
def languages(self) -> Optional[list[str]]:
|
||||
"""User-specified language(s) of this document.
|
||||
|
||||
When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple
|
||||
language codes indicate text could be in any of those languages.
|
||||
"""
|
||||
return self._languages
|
||||
|
||||
@lazyproperty
|
||||
def last_modified(self) -> Optional[str]:
|
||||
"""The best last-modified date available, None if no sources are available."""
|
||||
# -- value explicitly specified by caller takes precedence --
|
||||
if self._metadata_last_modified:
|
||||
return self._metadata_last_modified
|
||||
|
||||
if self._file_path:
|
||||
return get_last_modified_date(self._file_path)
|
||||
|
||||
return None
|
||||
return get_last_modified_date(self._file_path) if self._file_path else None
|
||||
|
||||
@lazyproperty
|
||||
def metadata_file_path(self) -> str | None:
|
||||
"""The best available file-path for this document or `None` if unavailable."""
|
||||
return self._metadata_file_path or self._file_path
|
||||
return self._file_path
|
||||
|
||||
@lazyproperty
|
||||
def sheets(self) -> dict[str, pd.DataFrame]:
|
||||
|
@ -2,92 +2,34 @@ from __future__ import annotations
|
||||
|
||||
import copy
|
||||
from io import BytesIO
|
||||
from typing import IO, Any, Iterator, Optional, cast
|
||||
from typing import IO, Any, Iterator, cast
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Text,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Text
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import (
|
||||
exactly_one,
|
||||
spooled_to_bytes_io_if_needed,
|
||||
)
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
||||
from unstructured.partition.text import element_from_text
|
||||
|
||||
DETECTION_ORIGIN: str = "xml"
|
||||
|
||||
|
||||
def get_leaf_elements(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
text: Optional[str] = None,
|
||||
xml_path: Optional[str] = None,
|
||||
) -> Iterator[Optional[str]]:
|
||||
"""Get leaf elements from the XML tree defined in filename, file, or text."""
|
||||
exactly_one(filename=filename, file=file, text=text)
|
||||
if filename:
|
||||
return _get_leaf_elements(filename, xml_path=xml_path)
|
||||
elif file:
|
||||
return _get_leaf_elements(file=spooled_to_bytes_io_if_needed(file), xml_path=xml_path)
|
||||
else:
|
||||
b = BytesIO(bytes(cast(str, text), encoding="utf-8"))
|
||||
return _get_leaf_elements(b, xml_path=xml_path)
|
||||
|
||||
|
||||
def _get_leaf_elements(
|
||||
file: str | IO[bytes],
|
||||
xml_path: Optional[str] = None,
|
||||
) -> Iterator[Optional[str]]:
|
||||
"""Parse the XML tree in a memory efficient manner if possible."""
|
||||
element_stack: list[etree._Element] = [] # pyright: ignore[reportPrivateUsage]
|
||||
|
||||
element_iterator = etree.iterparse(file, events=("start", "end"), resolve_entities=False)
|
||||
# NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream
|
||||
# elements through in a memory efficient way, so we bite the bullet and load it all into
|
||||
# memory.
|
||||
if xml_path is not None:
|
||||
_, element = next(element_iterator)
|
||||
compiled_path = etree.XPath(xml_path)
|
||||
element_iterator = (("end", el) for el in compiled_path(element))
|
||||
|
||||
for event, element in element_iterator:
|
||||
if event == "start":
|
||||
element_stack.append(element)
|
||||
|
||||
if event == "end":
|
||||
if element.text is not None and element.text.strip():
|
||||
yield element.text
|
||||
|
||||
element.clear()
|
||||
|
||||
while element_stack and element_stack[-1].getparent() is None:
|
||||
element_stack.pop()
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.XML)
|
||||
@apply_metadata(FileType.XML)
|
||||
@add_chunking_strategy
|
||||
def partition_xml(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
text: Optional[str] = None,
|
||||
filename: str | None = None,
|
||||
*,
|
||||
file: IO[bytes] | None = None,
|
||||
text: str | None = None,
|
||||
encoding: str | None = None,
|
||||
xml_keep_tags: bool = False,
|
||||
xml_path: Optional[str] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
encoding: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
xml_path: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an XML document into its document elements.
|
||||
@ -100,32 +42,20 @@ def partition_xml(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
text
|
||||
The text of the XML file.
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
xml_keep_tags
|
||||
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
||||
the text from within the tags.
|
||||
xml_path
|
||||
The xml_path to use for extracting the text. Only used if xml_keep_tags=False.
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
metadata_last_modified
|
||||
The day of the last modification.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file, text=text)
|
||||
|
||||
elements: list[Element] = []
|
||||
|
||||
last_modification_date = get_last_modified_date(filename) if filename else None
|
||||
|
||||
metadata = ElementMetadata(
|
||||
filename=metadata_filename or filename,
|
||||
last_modified=metadata_last_modified or last_modification_date,
|
||||
filename=filename, last_modified=get_last_modified_date(filename) if filename else None
|
||||
)
|
||||
metadata.detection_origin = DETECTION_ORIGIN
|
||||
|
||||
@ -153,11 +83,48 @@ def partition_xml(
|
||||
element.metadata = copy.deepcopy(metadata)
|
||||
elements.append(element)
|
||||
|
||||
elements = list(
|
||||
apply_lang_metadata(
|
||||
elements=elements,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
),
|
||||
)
|
||||
return elements
|
||||
|
||||
|
||||
def get_leaf_elements(
|
||||
filename: str | None, file: IO[bytes] | None, text: str | None, xml_path: str | None
|
||||
) -> Iterator[str | None]:
|
||||
"""Get leaf elements from the XML tree defined in filename, file, or text."""
|
||||
exactly_one(filename=filename, file=file, text=text)
|
||||
if filename:
|
||||
return _get_leaf_elements(filename, xml_path=xml_path)
|
||||
elif file:
|
||||
return _get_leaf_elements(file=spooled_to_bytes_io_if_needed(file), xml_path=xml_path)
|
||||
else:
|
||||
b = BytesIO(bytes(cast(str, text), encoding="utf-8"))
|
||||
return _get_leaf_elements(b, xml_path=xml_path)
|
||||
|
||||
|
||||
def _get_leaf_elements(
|
||||
file: str | IO[bytes],
|
||||
xml_path: str | None,
|
||||
) -> Iterator[str | None]:
|
||||
"""Parse the XML tree in a memory efficient manner if possible."""
|
||||
element_stack: list[etree._Element] = [] # pyright: ignore[reportPrivateUsage]
|
||||
|
||||
element_iterator = etree.iterparse(file, events=("start", "end"), resolve_entities=False)
|
||||
# NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream
|
||||
# elements through in a memory efficient way, so we bite the bullet and load it all into
|
||||
# memory.
|
||||
if xml_path is not None:
|
||||
_, element = next(element_iterator)
|
||||
compiled_path = etree.XPath(xml_path)
|
||||
element_iterator = (("end", el) for el in compiled_path(element))
|
||||
|
||||
for event, element in element_iterator:
|
||||
if event == "start":
|
||||
element_stack.append(element)
|
||||
|
||||
if event == "end":
|
||||
if element.text is not None and element.text.strip():
|
||||
yield element.text
|
||||
|
||||
element.clear()
|
||||
|
||||
while element_stack and element_stack[-1].getparent() is None:
|
||||
element_stack.pop()
|
||||
|
Loading…
x
Reference in New Issue
Block a user