rfctr(part): remove double-decoration 2 (#3686)

**Summary**
Install new `@apply_metadata()` on PPTX, TSV, XLSX, and XML and remove
decoration from PPT.

**Additional Context**
- Alphabetical order turns out to be hard, so this is the remaining
"easy" delegating partitioner and the remaining principal partitioners.
- Replace use of `@process_metadata()` and
`@add_metadata_with_filetype()` decorators with `@apply_metadata()` on
principal partitioners (those that do not delegate to other
partitioners).
- Remove all decorators from delegating partitioners (PPT in this case);
this removes the "double-decorating".
This commit is contained in:
Steve Canny 2024-10-02 11:52:59 -07:00 committed by GitHub
parent bba60260b2
commit 17092198d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 179 additions and 339 deletions

View File

@ -1,4 +1,4 @@
## 0.15.14-dev7
## 0.15.14-dev8
### Enhancements
@ -13,6 +13,7 @@
* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the elements passed to `assign_and_map_hash_ids` can contain duplicate element instances, which leads to an error when mapping parent ids.
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
## 0.15.13

View File

@ -30,11 +30,6 @@ def test_partition_ppt_from_filename():
assert {element.metadata.detection_origin for element in elements} == {"pptx"}
def test_partition_ppt_from_filename_with_metadata_filename():
elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_ppt_raises_with_missing_file():
with pytest.raises(ValueError):
partition_ppt(example_doc_path("doesnt-exist.ppt"))
@ -67,6 +62,38 @@ def test_partition_ppt_raises_when_neither_file_path_or_file_is_provided():
partition_ppt()
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_ppt_from_filename_gets_filename_from_filename_arg():
    """Partitioning by path records "fake-power-point.ppt" as `.metadata.filename`."""
    ppt_path = example_doc_path("fake-power-point.ppt")

    elements = partition_ppt(ppt_path)

    assert len(elements) > 0
    filenames = [e.metadata.filename for e in elements]
    assert all(name == "fake-power-point.ppt" for name in filenames)
def test_partition_ppt_from_file_gets_filename_None():
    """Partitioning from an open file (no path given) leaves `.metadata.filename` as None."""
    ppt_path = example_doc_path("fake-power-point.ppt")
    with open(ppt_path, "rb") as ppt_file:
        elements = partition_ppt(file=ppt_file)

    assert len(elements) > 0
    filenames = [e.metadata.filename for e in elements]
    assert all(name is None for name in filenames)
def test_partition_ppt_from_filename_prefers_metadata_filename():
    """A `metadata_filename` argument overrides the path-derived `.metadata.filename`."""
    ppt_path = example_doc_path("fake-power-point.ppt")

    elements = partition_ppt(ppt_path, metadata_filename="test")

    assert len(elements) > 0
    filenames = [e.metadata.filename for e in elements]
    assert all(name == "test" for name in filenames)
def test_partition_ppt_from_file_prefers_metadata_filename():
    """A `metadata_filename` argument is used when partitioning from an open file."""
    with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
        elements = partition_ppt(file=f, metadata_filename="test")

    # -- `all()` is True for an empty list; require at least one element so the filename
    # -- assertion below cannot pass vacuously (matches the sibling filename tests).
    assert len(elements) > 0
    assert all(e.metadata.filename == "test" for e in elements)
# -- .metadata.last_modified ---------------------------------------------------------------------

View File

@ -22,6 +22,7 @@ from test_unstructured.unit_utils import (
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
property_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
@ -351,7 +352,7 @@ def test_partition_pptx_from_file_prefers_metadata_last_modified():
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
# ------------------------------------------------------------------------------------------------
# -- .metadata.languages -------------------------------------------------------------------------
def test_partition_pptx_element_metadata_has_languages():
@ -374,7 +375,7 @@ def test_partition_pptx_respects_detect_language_per_element():
def test_partition_pptx_raises_TypeError_for_invalid_languages():
with pytest.raises(TypeError):
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng")
# == downstream behaviors ========================================================================
@ -492,7 +493,7 @@ def test_partition_pptx_hierarchy_sample_document():
test_cases = [
(0, None, "b2859226ba1f9243fb3f1b2ace889f43"),
(1, "b2859226ba1f9243fb3f1b2ace889f43", "d13f8827e94541c8b818b0df8f942526"),
(None, None, "1ffd3151819e594553e6b540e19e6c36"),
(None, None, "cbb95b030de22979af6bfa42969c8202"),
(0, None, "e535f799d1f0e79d6777efa873a16ce1"),
(0, "e535f799d1f0e79d6777efa873a16ce1", "f02bbfb417ad60daa2ba35080e96262f"),
(0, "e535f799d1f0e79d6777efa873a16ce1", "414dfce72ea53cd4649176af0d62a4c1"),
@ -500,7 +501,7 @@ def test_partition_pptx_hierarchy_sample_document():
(1, "414dfce72ea53cd4649176af0d62a4c1", "a33333f527851f700ca175acd04b8a2c"),
(2, "a33333f527851f700ca175acd04b8a2c", "6f1b87689e4da2b0fb865bc5f92d5702"),
(0, "e535f799d1f0e79d6777efa873a16ce1", "3f58e0be3b8e8b15cba7adc4eae68586"),
(None, None, "1ffd3151819e594553e6b540e19e6c36"),
(None, None, "e5de1b503e64da424fb7d8113371e16d"),
(0, None, "8319096532fe2e55f66c491ea8313150"),
(0, "8319096532fe2e55f66c491ea8313150", "17a7e78277ab131a627cb4538bab7390"),
(0, "8319096532fe2e55f66c491ea8313150", "41a9e1d0390f4edd77181142ceae51bc"),
@ -514,7 +515,7 @@ def test_partition_pptx_hierarchy_sample_document():
(1, "7f647b1f0f20c3db40c36ab57d9a5550", "6ec455f5f19782facf184886876c9a66"),
(2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
(0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
(None, None, "1ffd3151819e594553e6b540e19e6c36"),
(None, None, "4120066d251ba675ade42e8a167ca61f"),
(None, None, "2ed3bd10daace79ac129cbf8faf22bfc"),
(0, None, "fd08cacbaddafee5cbacc02528536ee5"),
]
@ -545,8 +546,6 @@ def opts_args() -> dict[str, Any]:
"include_page_breaks": True,
"include_slide_notes": False,
"infer_table_structure": True,
"metadata_file_path": None,
"metadata_last_modified": None,
"strategy": "fast",
}
@ -632,15 +631,7 @@ class DescribePptxPartitionerOptions:
# -- .last_modified --------------------------
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
opts = PptxPartitionerOptions(**opts_args)
assert opts.last_modified == "2024-03-05T17:02:53"
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
def it_gets_last_modified_from_the_filesystem_when_a_path_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
):
opts_args["file_path"] = "a/b/spreadsheet.pptx"
@ -665,21 +656,11 @@ class DescribePptxPartitionerOptions:
# -- .metadata_file_path ---------------------
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["file_path"] = "x/y/z.pptx"
opts_args["metadata_file_path"] = "a/b/c.pptx"
opts = PptxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == "a/b/c.pptx"
@pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
def and_it_falls_back_to_the_document_file_path_otherwise(
def it_uses_the_filename_argument_when_provided(
self, file_path: str | None, opts_args: dict[str, Any]
):
opts_args["file_path"] = file_path
opts_args["metadata_file_path"] = None
opts = PptxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == file_path
@ -769,9 +750,11 @@ class DescribePptxPartitionerOptions:
# -- .table_metadata -------------------------
def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
opts_args["metadata_file_path"] = "d/e/f.pptx"
opts_args["metadata_last_modified"] = "2024-04-02T19:51:55"
def it_can_create_table_metadata(
self, last_modified_prop_: Mock, metadata_file_path_prop_: Mock, opts_args: dict[str, Any]
):
metadata_file_path_prop_.return_value = "d/e/f.pptx"
last_modified_prop_.return_value = "2024-04-02T19:51:55"
opts = PptxPartitionerOptions(**opts_args)
# -- move to the first slide --
list(opts.increment_page_number())
@ -786,9 +769,11 @@ class DescribePptxPartitionerOptions:
# -- .text_metadata -------------------------
def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
opts_args["metadata_file_path"] = "d/e/f.pptx"
opts_args["metadata_last_modified"] = "2024-04-02T19:56:40"
def it_can_create_text_metadata(
self, last_modified_prop_: Mock, metadata_file_path_prop_: Mock, opts_args: dict[str, Any]
):
metadata_file_path_prop_.return_value = "d/e/f.pptx"
last_modified_prop_.return_value = "2024-04-02T19:56:40"
opts = PptxPartitionerOptions(**opts_args)
# -- move to the first slide --
list(opts.increment_page_number())
@ -806,3 +791,11 @@ class DescribePptxPartitionerOptions:
@pytest.fixture()
def get_last_modified_date_(self, request: FixtureRequest):
return function_mock(request, "unstructured.partition.pptx.get_last_modified_date")
@pytest.fixture()
def last_modified_prop_(self, request: FixtureRequest):
return property_mock(request, PptxPartitionerOptions, "last_modified")
@pytest.fixture()
def metadata_file_path_prop_(self, request: FixtureRequest):
return property_mock(request, PptxPartitionerOptions, "metadata_file_path")

View File

@ -321,15 +321,6 @@ def test_partition_xlsx_with_more_than_1k_cells():
class Describe_XlsxPartitionerOptions:
"""Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects."""
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_detect_language_for_each_element_individually(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["detect_language_per_element"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.detect_language_per_element is arg_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet(
self, arg_value: bool, opts_args: dict[str, Any]
@ -366,37 +357,20 @@ class Describe_XlsxPartitionerOptions:
assert opts.infer_table_structure is arg_value
@pytest.mark.parametrize(
("arg_value", "expected_value"),
[(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])],
)
def it_knows_what_languages_the_caller_expects_to_appear_in_the_text(
self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
):
opts_args["languages"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
# -- .last_modified --------------------------------------------------------------------------
assert opts.languages == expected_value
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.last_modified == "2024-03-05T17:02:53"
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
def it_gets_last_modified_from_the_filesystem_when_a_path_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
):
filesystem_last_modified = "2024-04-02T20:32:35"
opts_args["file_path"] = "a/b/spreadsheet.xlsx"
get_last_modified_date_.return_value = "2024-04-02T20:32:35"
get_last_modified_date_.return_value = filesystem_last_modified
opts = _XlsxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx")
assert last_modified == "2024-04-02T20:32:35"
assert last_modified == filesystem_last_modified
def but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided(
self, opts_args: dict[str, Any]
@ -409,24 +383,13 @@ class Describe_XlsxPartitionerOptions:
assert last_modified is None
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
self, opts_args: dict[str, Any]
):
# -- .metadata_file_path ---------------------------------------------------------------------
def it_uses_the_file_path_argument_when_provided(self, opts_args: dict[str, Any]):
opts_args["file_path"] = "x/y/z.xlsx"
opts_args["metadata_file_path"] = "a/b/c.xlsx"
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == "a/b/c.xlsx"
@pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None])
def and_it_falls_back_to_the_document_file_path_otherwise(
self, file_path: str | None, opts_args: dict[str, Any]
):
opts_args["file_path"] = file_path
opts_args["metadata_file_path"] = None
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == file_path
assert opts.metadata_file_path == "x/y/z.xlsx"
# -- fixtures --------------------------------------------------------------------------------
@ -442,15 +405,11 @@ class Describe_XlsxPartitionerOptions:
compact for testing purposes.
"""
return {
"detect_language_per_element": False,
"file": None,
"file_path": None,
"file": None,
"find_subtable": True,
"include_header": False,
"infer_table_structure": True,
"languages": ["auto"],
"metadata_file_path": None,
"metadata_last_modified": None,
}

View File

@ -114,6 +114,18 @@ def test_partition_xml_from_file_rb_with_tags_raises_encoding_error():
)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_xml_gets_the_XML_mime_type_in_metadata_filetype():
    """Every element produced by `partition_xml()` carries the XML MIME-type as its filetype."""
    XML_MIME_TYPE = "application/xml"

    elements = partition_xml(example_doc_path("factbook.xml"))

    # -- `all()` is True for an empty list, so require at least one element --
    assert len(elements) > 0
    # -- collect the offending values so the failure message reports an actual mismatch
    # -- rather than whatever the first element happens to contain
    mismatched = [e.metadata.filetype for e in elements if e.metadata.filetype != XML_MIME_TYPE]
    assert not mismatched, (
        f"Expected all elements to have '{XML_MIME_TYPE}' as their filetype, but got:"
        f" {repr(mismatched[0])}"
    )
# -- .metadata.last_modified ---------------------------------------------------------------------

View File

@ -1 +1 @@
__version__ = "0.15.14-dev7" # pragma: no cover
__version__ = "0.15.14-dev8" # pragma: no cover

View File

@ -20,6 +20,7 @@ DETECTION_ORIGIN: str = "csv"
@add_chunking_strategy
def partition_csv(
filename: str | None = None,
*,
file: IO[bytes] | None = None,
encoding: str | None = None,
include_header: bool = False,

View File

@ -8,7 +8,7 @@ from __future__ import annotations
import io
from tempfile import SpooledTemporaryFile
from typing import IO, Any, Iterator, Optional, Protocol, Sequence
from typing import IO, Any, Iterator, Protocol, Sequence
import pptx
from pptx.presentation import Presentation
@ -32,13 +32,10 @@ from unstructured.documents.elements import (
Table,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import convert_ms_office_table_to_text
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.text_type import (
is_email_address,
is_possible_narrative_text,
@ -80,20 +77,15 @@ class AbstractPicturePartitioner(Protocol):
# ================================================================================================
@process_metadata()
@add_metadata_with_filetype(FileType.PPTX)
@apply_metadata(FileType.PPTX)
@add_chunking_strategy
def partition_pptx(
filename: Optional[str] = None,
filename: str | None = None,
*,
file: Optional[IO[bytes]] = None,
detect_language_per_element: bool = False,
file: IO[bytes] | None = None,
include_page_breaks: bool = True,
include_slide_notes: Optional[bool] = None,
include_slide_notes: bool | None = None,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
strategy: str = PartitionStrategy.FAST,
**kwargs: Any,
@ -108,12 +100,6 @@ def partition_pptx(
A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks
If True, includes a PageBreak element between slides
metadata_filename
The filename to use for the metadata. Relevant because partition_ppt() converts its
(legacy) .ppt document to .pptx before partition. We want the filename of the original
.ppt source file in the metadata.
metadata_last_modified
The last modified date for the document.
include_slide_notes
If True, includes the slide notes as element
infer_table_structure
@ -122,13 +108,6 @@ def partition_pptx(
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
starting_page_number
Indicates what page number should be assigned to the first slide in the presentation.
This information will be reflected in elements' metadata and can be be especially
@ -140,19 +119,11 @@ def partition_pptx(
include_page_breaks=include_page_breaks,
include_slide_notes=include_slide_notes,
infer_table_structure=infer_table_structure,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
strategy=strategy,
starting_page_number=starting_page_number,
)
elements = _PptxPartitioner.iter_presentation_elements(opts)
elements = apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
return list(elements)
return list(_PptxPartitioner.iter_presentation_elements(opts))
class _PptxPartitioner:
@ -321,7 +292,7 @@ class _PptxPartitioner:
detection_origin=DETECTION_ORIGIN,
)
def _order_shapes(self, slide: Slide) -> tuple[Optional[Shape], Sequence[BaseShape]]:
def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]:
"""Orders the shapes on `slide` from top to bottom and left to right.
Returns the title shape if it exists and the ordered shapes."""
@ -365,13 +336,11 @@ class PptxPartitionerOptions:
def __init__(
self,
*,
file: Optional[IO[bytes]],
file_path: Optional[str],
file: IO[bytes] | None,
file_path: str | None,
include_page_breaks: bool,
include_slide_notes: Optional[bool],
include_slide_notes: bool | None,
infer_table_structure: bool,
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
strategy: str,
starting_page_number: int = 1,
):
@ -380,8 +349,6 @@ class PptxPartitionerOptions:
self._include_page_breaks = include_page_breaks
self._include_slide_notes = include_slide_notes
self._infer_table_structure = infer_table_structure
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._strategy = strategy
# -- options object maintains page-number state --
self._page_counter = starting_page_number - 1
@ -417,7 +384,9 @@ class PptxPartitionerOptions:
yield PageBreak(
"",
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(last_modified=self.last_modified),
metadata=ElementMetadata(
last_modified=self.last_modified, page_number=self.page_number - 1
),
)
@lazyproperty
@ -426,27 +395,19 @@ class PptxPartitionerOptions:
return self._infer_table_structure
@lazyproperty
def last_modified(self) -> Optional[str]:
def last_modified(self) -> str | None:
"""The best last-modified date available, None if no sources are available."""
# -- Value explicitly specified by caller takes precedence. This is used for example when
# -- this file was converted from another format, and any last-modified date for the file
# -- would be just now.
if self._metadata_last_modified:
return self._metadata_last_modified
if not self._file_path:
return None
if self._file_path:
return (
None
if is_temp_file_path(self._file_path)
else get_last_modified_date(self._file_path)
)
return None
return (
None if is_temp_file_path(self._file_path) else get_last_modified_date(self._file_path)
)
@lazyproperty
def metadata_file_path(self) -> str | None:
"""The best available file-path for this document or `None` if unavailable."""
return self._metadata_file_path or self._file_path
return self._file_path
@property
def page_number(self) -> int:

View File

@ -6,36 +6,24 @@ import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import (
Element,
ElementMetadata,
Table,
process_metadata,
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import (
exactly_one,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
DETECTION_ORIGIN: str = "tsv"
@process_metadata()
@add_metadata_with_filetype(FileType.TSV)
@apply_metadata(FileType.TSV)
@add_chunking_strategy
def partition_tsv(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
include_header: bool = False,
languages: Optional[list[str]] = ["auto"],
# NOTE (jennings) partition_tsv generates a single TableElement
# so detect_language_per_element is not included as a param
**kwargs: Any,
) -> list[Element]:
"""Partitions TSV files into document elements.
@ -48,17 +36,9 @@ def partition_tsv(
A file-like object using "rb" mode --> open(filename, "rb").
include_header
Determines whether or not header info info is included in text and medatada.text_as_html.
metadata_last_modified
The day of the last modification.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
"""
exactly_one(filename=filename, file=file)
last_modified = get_last_modified_date(filename) if filename else None
header = 0 if include_header else None
if filename:
@ -75,14 +55,9 @@ def partition_tsv(
metadata = ElementMetadata(
text_as_html=html_text,
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modified,
languages=languages,
filename=filename,
last_modified=get_last_modified_date(filename) if filename else None,
)
metadata.detection_origin = DETECTION_ORIGIN
elements = apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
)
return list(elements)
return [Table(text=text, metadata=metadata)]

View File

@ -22,12 +22,9 @@ from unstructured.documents.elements import (
Table,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
@ -41,19 +38,15 @@ _CellCoordinate: TypeAlias = "tuple[int, int]"
DETECTION_ORIGIN: str = "xlsx"
@process_metadata()
@add_metadata_with_filetype(FileType.XLSX)
@apply_metadata(FileType.XLSX)
@add_chunking_strategy
def partition_xlsx(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
metadata_last_modified: Optional[str] = None,
include_header: bool = False,
find_subtable: bool = True,
include_header: bool = False,
infer_table_structure: bool = True,
starting_page_number: int = 1,
**kwargs: Any,
) -> list[Element]:
@ -71,28 +64,15 @@ def partition_xlsx(
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for metadata.languages if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
metadata_last_modified
The day of the last modification
include_header
Determines whether or not header info is included in text and medatada.text_as_html
"""
opts = _XlsxPartitionerOptions(
detect_language_per_element=detect_language_per_element,
file=file,
file_path=filename,
file=file,
find_subtable=find_subtable,
include_header=include_header,
infer_table_structure=infer_table_structure,
languages=languages,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
)
elements: list[Element] = []
@ -151,13 +131,6 @@ def partition_xlsx(
element.metadata = _get_metadata(sheet_name, page_number, opts)
elements.append(element)
elements = list(
apply_lang_metadata(
elements=elements,
languages=opts.languages,
detect_language_per_element=opts.detect_language_per_element,
),
)
return elements
@ -167,30 +140,17 @@ class _XlsxPartitionerOptions:
def __init__(
self,
*,
detect_language_per_element: bool,
file: Optional[IO[bytes]],
file_path: Optional[str],
file: Optional[IO[bytes]],
find_subtable: bool,
include_header: bool,
infer_table_structure: bool,
languages: Optional[list[str]],
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
):
self._detect_language_per_element = detect_language_per_element
self._file = file
self._file_path = file_path
self._file = file
self._find_subtable = find_subtable
self._include_header = include_header
self._infer_table_structure = infer_table_structure
self._languages = languages
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
@lazyproperty
def detect_language_per_element(self) -> bool:
"""When True, detect language on element-by-element basis instead of document level."""
return self._detect_language_per_element
@lazyproperty
def find_subtable(self) -> bool:
@ -215,31 +175,15 @@ class _XlsxPartitionerOptions:
"""True when partitioner should compute and apply `text_as_html` metadata."""
return self._infer_table_structure
@lazyproperty
def languages(self) -> Optional[list[str]]:
"""User-specified language(s) of this document.
When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple
language codes indicate text could be in any of those languages.
"""
return self._languages
@lazyproperty
def last_modified(self) -> Optional[str]:
"""The best last-modified date available, None if no sources are available."""
# -- value explicitly specified by caller takes precedence --
if self._metadata_last_modified:
return self._metadata_last_modified
if self._file_path:
return get_last_modified_date(self._file_path)
return None
return get_last_modified_date(self._file_path) if self._file_path else None
@lazyproperty
def metadata_file_path(self) -> str | None:
"""The best available file-path for this document or `None` if unavailable."""
return self._metadata_file_path or self._file_path
return self._file_path
@lazyproperty
def sheets(self) -> dict[str, pd.DataFrame]:

View File

@ -2,92 +2,34 @@ from __future__ import annotations
import copy
from io import BytesIO
from typing import IO, Any, Iterator, Optional, cast
from typing import IO, Any, Iterator, cast
from lxml import etree
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import (
Element,
ElementMetadata,
Text,
process_metadata,
)
from unstructured.documents.elements import Element, ElementMetadata, Text
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import (
exactly_one,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.text import element_from_text
DETECTION_ORIGIN: str = "xml"
def get_leaf_elements(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
text: Optional[str] = None,
xml_path: Optional[str] = None,
) -> Iterator[Optional[str]]:
"""Get leaf elements from the XML tree defined in filename, file, or text."""
exactly_one(filename=filename, file=file, text=text)
if filename:
return _get_leaf_elements(filename, xml_path=xml_path)
elif file:
return _get_leaf_elements(file=spooled_to_bytes_io_if_needed(file), xml_path=xml_path)
else:
b = BytesIO(bytes(cast(str, text), encoding="utf-8"))
return _get_leaf_elements(b, xml_path=xml_path)
def _get_leaf_elements(
file: str | IO[bytes],
xml_path: Optional[str] = None,
) -> Iterator[Optional[str]]:
"""Parse the XML tree in a memory efficient manner if possible."""
element_stack: list[etree._Element] = [] # pyright: ignore[reportPrivateUsage]
element_iterator = etree.iterparse(file, events=("start", "end"), resolve_entities=False)
# NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream
# elements through in a memory efficient way, so we bite the bullet and load it all into
# memory.
if xml_path is not None:
_, element = next(element_iterator)
compiled_path = etree.XPath(xml_path)
element_iterator = (("end", el) for el in compiled_path(element))
for event, element in element_iterator:
if event == "start":
element_stack.append(element)
if event == "end":
if element.text is not None and element.text.strip():
yield element.text
element.clear()
while element_stack and element_stack[-1].getparent() is None:
element_stack.pop()
@process_metadata()
@add_metadata_with_filetype(FileType.XML)
@apply_metadata(FileType.XML)
@add_chunking_strategy
def partition_xml(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
text: Optional[str] = None,
filename: str | None = None,
*,
file: IO[bytes] | None = None,
text: str | None = None,
encoding: str | None = None,
xml_keep_tags: bool = False,
xml_path: Optional[str] = None,
metadata_filename: Optional[str] = None,
encoding: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
xml_path: str | None = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions an XML document into its document elements.
@ -100,32 +42,20 @@ def partition_xml(
A file-like object using "rb" mode --> open(filename, "rb").
text
The text of the XML file.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags.
xml_path
The xml_path to use for extracting the text. Only used if xml_keep_tags=False.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
metadata_last_modified
The day of the last modification.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
"""
exactly_one(filename=filename, file=file, text=text)
elements: list[Element] = []
last_modification_date = get_last_modified_date(filename) if filename else None
metadata = ElementMetadata(
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
filename=filename, last_modified=get_last_modified_date(filename) if filename else None
)
metadata.detection_origin = DETECTION_ORIGIN
@ -153,11 +83,48 @@ def partition_xml(
element.metadata = copy.deepcopy(metadata)
elements.append(element)
elements = list(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
return elements
def get_leaf_elements(
    filename: str | None, file: IO[bytes] | None, text: str | None, xml_path: str | None
) -> Iterator[str | None]:
    """Return an iterator over leaf-element text for the XML given as a path, file, or string.

    The three source arguments are first checked with `exactly_one()`. The selected source is
    then forwarded, along with `xml_path`, to `_get_leaf_elements()`.
    """
    exactly_one(filename=filename, file=file, text=text)

    if filename:
        return _get_leaf_elements(filename, xml_path=xml_path)

    if file:
        source = spooled_to_bytes_io_if_needed(file)
        return _get_leaf_elements(file=source, xml_path=xml_path)

    # -- neither a path nor a file was usable; wrap the UTF-8 encoded text in a byte stream --
    encoded_text = bytes(cast(str, text), encoding="utf-8")
    return _get_leaf_elements(BytesIO(encoded_text), xml_path=xml_path)
def _get_leaf_elements(
    file: str | IO[bytes],
    xml_path: str | None,
) -> Iterator[str | None]:
    """Generate the non-blank text of elements parsed from `file`.

    `file` is passed straight to `etree.iterparse()` with entity resolution disabled. When
    `xml_path` is provided it is compiled as an XPath expression and only the elements it selects
    are visited; otherwise elements are visited as the parser reports them.
    """
    open_elements: list[etree._Element] = []  # pyright: ignore[reportPrivateUsage]
    events = etree.iterparse(file, events=("start", "end"), resolve_entities=False)

    # NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream
    # elements through in a memory efficient way, so we bite the bullet and load it all into
    # memory.
    if xml_path is not None:
        # -- take the element from the first parse event and evaluate the XPath against it --
        _, first_element = next(events)
        select = etree.XPath(xml_path)
        # -- present each match as an "end" event so the loop below handles both cases --
        events = (("end", matched) for matched in select(first_element))

    for kind, node in events:
        if kind == "start":
            open_elements.append(node)
        elif kind == "end":
            node_text = node.text
            if node_text is not None and node_text.strip():
                yield node_text
            # -- discard the element's content once its text has been emitted --
            node.clear()

        # -- drop stacked elements that no longer have a parent --
        while open_elements and open_elements[-1].getparent() is None:
            open_elements.pop()