mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-01 12:53:58 +00:00
rfctr(xlsx): extract _XlsxPartitionerOptions (#2838)
**Summary** As an initial step in reducing the complexity of the monolithic `partition_xlsx()` function, extract all argument-handling to a separate `_XlsxPartitionerOptions` object which can be fully covered by isolated unit tests. **Additional Context** This code was from a prior XLSX bug-fix branch that did not get committed because of time constraints. I wanted to revisit it here because I need the benefits of this as part of some new work on PPTX that will require a separate options object that can be passed to delegate objects. This approach was incubated in the chunking context and has produced a lot of opportunities there to decompose the logic into smaller components that are more understandable and isolated-test-able, without having to pass an extended list of option values in every sub-call. As well as decluttering the code, this removes coupling where the caller needs to know which options a subroutine might need to reference.
This commit is contained in:
parent
e49c35933d
commit
1ce60f2bba
@ -1,4 +1,4 @@
|
|||||||
## 0.13.1-dev1
|
## 0.13.1-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ from __future__ import annotations
|
|||||||
import io
|
import io
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import cast
|
from typing import Any, cast
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pandas.testing as pdt
|
import pandas.testing as pdt
|
||||||
@ -19,13 +19,20 @@ from test_unstructured.partition.test_constants import (
|
|||||||
EXPECTED_TEXT_XLSX,
|
EXPECTED_TEXT_XLSX,
|
||||||
EXPECTED_TITLE,
|
EXPECTED_TITLE,
|
||||||
)
|
)
|
||||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
from test_unstructured.unit_utils import (
|
||||||
|
FixtureRequest,
|
||||||
|
Mock,
|
||||||
|
assert_round_trips_through_JSON,
|
||||||
|
example_doc_path,
|
||||||
|
function_mock,
|
||||||
|
)
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
from unstructured.documents.elements import ListItem, Table, Text, Title
|
from unstructured.documents.elements import ListItem, Table, Text, Title
|
||||||
from unstructured.partition.xlsx import (
|
from unstructured.partition.xlsx import (
|
||||||
_CellCoordinate,
|
_CellCoordinate,
|
||||||
_ConnectedComponent,
|
_ConnectedComponent,
|
||||||
_SubtableParser,
|
_SubtableParser,
|
||||||
|
_XlsxPartitionerOptions,
|
||||||
partition_xlsx,
|
partition_xlsx,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -116,13 +123,7 @@ def test_partition_xlsx_from_filename_with_metadata_filename():
|
|||||||
assert elements[0].metadata.filename == "test"
|
assert elements[0].metadata.filename == "test"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("infer_table_structure", [True, False])
|
||||||
"infer_table_structure",
|
|
||||||
[
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_partition_xlsx_infer_table_structure(infer_table_structure: bool):
|
def test_partition_xlsx_infer_table_structure(infer_table_structure: bool):
|
||||||
elements = partition_xlsx(
|
elements = partition_xlsx(
|
||||||
"example-docs/stanley-cups.xlsx", infer_table_structure=infer_table_structure
|
"example-docs/stanley-cups.xlsx", infer_table_structure=infer_table_structure
|
||||||
@ -395,6 +396,176 @@ def test_partition_xlsx_with_more_than_1k_cells():
|
|||||||
# ------------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_XlsxPartitionerOptions:
|
||||||
|
"""Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects."""
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("arg_value", [True, False])
|
||||||
|
def it_knows_whether_to_detect_language_for_each_element_individually(
|
||||||
|
self, arg_value: bool, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["detect_language_per_element"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.detect_language_per_element is arg_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("arg_value", [True, False])
|
||||||
|
def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet(
|
||||||
|
self, arg_value: bool, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["find_subtable"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.find_subtable is arg_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(("arg_value", "expected_value"), [(True, 0), (False, None)])
|
||||||
|
def it_knows_the_header_row_index_for_Pandas(
|
||||||
|
self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["include_header"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.header_row_idx == expected_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("arg_value", [True, False])
|
||||||
|
def it_knows_whether_to_include_column_headings_in_Table_text_as_html(
|
||||||
|
self, arg_value: bool, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["include_header"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.include_header is arg_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("arg_value", [True, False])
|
||||||
|
def it_knows_whether_to_include_metadata_on_elements(
|
||||||
|
self, arg_value: bool, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["include_metadata"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.include_metadata is arg_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("arg_value", [True, False])
|
||||||
|
def it_knows_whether_to_include_text_as_html_in_Table_metadata(
|
||||||
|
self, arg_value: bool, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["infer_table_structure"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.infer_table_structure is arg_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("arg_value", "expected_value"),
|
||||||
|
[(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])],
|
||||||
|
)
|
||||||
|
def it_knows_what_languages_the_caller_expects_to_appear_in_the_text(
|
||||||
|
self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["languages"] = arg_value
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.languages == expected_value
|
||||||
|
|
||||||
|
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
|
||||||
|
self, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.last_modified == "2024-03-05T17:02:53"
|
||||||
|
|
||||||
|
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
|
||||||
|
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
|
||||||
|
):
|
||||||
|
opts_args["file_path"] = "a/b/spreadsheet.xlsx"
|
||||||
|
get_last_modified_date_.return_value = "2024-04-02T20:32:35"
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
last_modified = opts.last_modified
|
||||||
|
|
||||||
|
get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx")
|
||||||
|
assert last_modified == "2024-04-02T20:32:35"
|
||||||
|
|
||||||
|
def and_it_falls_back_to_the_last_modified_date_of_the_open_file_when_a_file_is_provided(
|
||||||
|
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
|
||||||
|
):
|
||||||
|
file = io.BytesIO(b"abcdefg")
|
||||||
|
opts_args["file"] = file
|
||||||
|
opts_args["date_from_file_object"] = True
|
||||||
|
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
last_modified = opts.last_modified
|
||||||
|
|
||||||
|
get_last_modified_date_from_file_.assert_called_once_with(file)
|
||||||
|
assert last_modified == "2024-04-02T20:42:07"
|
||||||
|
|
||||||
|
def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
|
||||||
|
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
|
||||||
|
):
|
||||||
|
file = io.BytesIO(b"abcdefg")
|
||||||
|
opts_args["file"] = file
|
||||||
|
opts_args["date_from_file_object"] = False
|
||||||
|
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
last_modified = opts.last_modified
|
||||||
|
|
||||||
|
get_last_modified_date_from_file_.assert_not_called()
|
||||||
|
assert last_modified is None
|
||||||
|
|
||||||
|
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
|
||||||
|
self, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["file_path"] = "x/y/z.xlsx"
|
||||||
|
opts_args["metadata_file_path"] = "a/b/c.xlsx"
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.metadata_file_path == "a/b/c.xlsx"
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None])
|
||||||
|
def and_it_falls_back_to_the_document_file_path_otherwise(
|
||||||
|
self, file_path: str | None, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts_args["file_path"] = file_path
|
||||||
|
opts_args["metadata_file_path"] = None
|
||||||
|
opts = _XlsxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.metadata_file_path == file_path
|
||||||
|
|
||||||
|
# -- fixtures --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def get_last_modified_date_(self, request: FixtureRequest):
|
||||||
|
return function_mock(request, "unstructured.partition.xlsx.get_last_modified_date")
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def get_last_modified_date_from_file_(self, request: FixtureRequest):
|
||||||
|
return function_mock(
|
||||||
|
request, "unstructured.partition.xlsx.get_last_modified_date_from_file"
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def opts_args(self) -> dict[str, Any]:
|
||||||
|
"""All default arguments for `_XlsxPartitionerOptions`.
|
||||||
|
|
||||||
|
Individual argument values can be changed to suit each test. Makes construction of opts more
|
||||||
|
compact for testing purposes.
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"date_from_file_object": False,
|
||||||
|
"detect_language_per_element": False,
|
||||||
|
"file": None,
|
||||||
|
"file_path": None,
|
||||||
|
"find_subtable": True,
|
||||||
|
"include_header": False,
|
||||||
|
"include_metadata": True,
|
||||||
|
"infer_table_structure": True,
|
||||||
|
"languages": ["auto"],
|
||||||
|
"metadata_file_path": None,
|
||||||
|
"metadata_last_modified": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class Describe_ConnectedComponent:
|
class Describe_ConnectedComponent:
|
||||||
"""Unit-test suite for `unstructured.partition.xlsx._ConnectedComponent` objects."""
|
"""Unit-test suite for `unstructured.partition.xlsx._ConnectedComponent` objects."""
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.13.1-dev1" # pragma: no cover
|
__version__ = "0.13.1-dev2" # pragma: no cover
|
||||||
|
@ -88,39 +88,28 @@ def partition_xlsx(
|
|||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
infer last_modified metadata from bytes, otherwise set it to None.
|
infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
"""
|
"""
|
||||||
last_modification_date = None
|
opts = _XlsxPartitionerOptions(
|
||||||
header = 0 if include_header else None
|
date_from_file_object=date_from_file_object,
|
||||||
|
detect_language_per_element=detect_language_per_element,
|
||||||
sheets: dict[str, pd.DataFrame] = {}
|
file=file,
|
||||||
if filename:
|
file_path=filename,
|
||||||
sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType]
|
find_subtable=find_subtable,
|
||||||
filename, sheet_name=None, header=header
|
include_header=include_header,
|
||||||
|
include_metadata=include_metadata,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
|
languages=languages,
|
||||||
|
metadata_file_path=metadata_filename,
|
||||||
|
metadata_last_modified=metadata_last_modified,
|
||||||
)
|
)
|
||||||
last_modification_date = get_last_modified_date(filename)
|
|
||||||
|
|
||||||
elif file:
|
|
||||||
if isinstance(file, SpooledTemporaryFile):
|
|
||||||
file.seek(0)
|
|
||||||
f = io.BytesIO(file.read())
|
|
||||||
else:
|
|
||||||
f = file
|
|
||||||
sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType]
|
|
||||||
f, sheet_name=None, header=header
|
|
||||||
)
|
|
||||||
last_modification_date = (
|
|
||||||
get_last_modified_date_from_file(file) if date_from_file_object else None
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError("Either 'filename' or 'file' argument must be specified")
|
|
||||||
|
|
||||||
elements: list[Element] = []
|
elements: list[Element] = []
|
||||||
for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1):
|
for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1):
|
||||||
if not find_subtable:
|
if not opts.find_subtable:
|
||||||
html_text = (
|
html_text = (
|
||||||
sheet.to_html( # pyright: ignore[reportUnknownMemberType]
|
sheet.to_html( # pyright: ignore[reportUnknownMemberType]
|
||||||
index=False, header=include_header, na_rep=""
|
index=False, header=opts.include_header, na_rep=""
|
||||||
)
|
)
|
||||||
if infer_table_structure
|
if opts.infer_table_structure
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
# XXX: `html_text` can be `None`. What happens on this call in that case?
|
# XXX: `html_text` can be `None`. What happens on this call in that case?
|
||||||
@ -131,13 +120,13 @@ def partition_xlsx(
|
|||||||
).text_content(),
|
).text_content(),
|
||||||
)
|
)
|
||||||
|
|
||||||
if include_metadata:
|
if opts.include_metadata:
|
||||||
metadata = ElementMetadata(
|
metadata = ElementMetadata(
|
||||||
text_as_html=html_text,
|
text_as_html=html_text,
|
||||||
page_name=sheet_name,
|
page_name=sheet_name,
|
||||||
page_number=page_number,
|
page_number=page_number,
|
||||||
filename=metadata_filename or filename,
|
filename=opts.metadata_file_path,
|
||||||
last_modified=metadata_last_modified or last_modification_date,
|
last_modified=opts.last_modified,
|
||||||
)
|
)
|
||||||
metadata.detection_origin = DETECTION_ORIGIN
|
metadata.detection_origin = DETECTION_ORIGIN
|
||||||
else:
|
else:
|
||||||
@ -149,17 +138,11 @@ def partition_xlsx(
|
|||||||
for component in _ConnectedComponents.from_worksheet_df(sheet):
|
for component in _ConnectedComponents.from_worksheet_df(sheet):
|
||||||
subtable_parser = _SubtableParser(component.subtable)
|
subtable_parser = _SubtableParser(component.subtable)
|
||||||
|
|
||||||
metadata = _get_metadata(
|
metadata = _get_metadata(sheet_name, page_number, opts)
|
||||||
include_metadata,
|
|
||||||
sheet_name,
|
|
||||||
page_number,
|
|
||||||
metadata_filename or filename,
|
|
||||||
metadata_last_modified or last_modification_date,
|
|
||||||
)
|
|
||||||
|
|
||||||
# -- emit each leading single-cell row as its own `Text`-subtype element --
|
# -- emit each leading single-cell row as its own `Text`-subtype element --
|
||||||
for content in subtable_parser.iter_leading_single_cell_rows_texts():
|
for content in subtable_parser.iter_leading_single_cell_rows_texts():
|
||||||
element = _check_content_element_type(str(content))
|
element = _create_element(str(content))
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
elements.append(element)
|
elements.append(element)
|
||||||
|
|
||||||
@ -167,7 +150,7 @@ def partition_xlsx(
|
|||||||
core_table = subtable_parser.core_table
|
core_table = subtable_parser.core_table
|
||||||
if core_table is not None:
|
if core_table is not None:
|
||||||
html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType]
|
html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType]
|
||||||
index=False, header=include_header, na_rep=""
|
index=False, header=opts.include_header, na_rep=""
|
||||||
)
|
)
|
||||||
text = cast(
|
text = cast(
|
||||||
str,
|
str,
|
||||||
@ -177,27 +160,144 @@ def partition_xlsx(
|
|||||||
)
|
)
|
||||||
element = Table(text=text)
|
element = Table(text=text)
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
element.metadata.text_as_html = html_text if infer_table_structure else None
|
element.metadata.text_as_html = (
|
||||||
|
html_text if opts.infer_table_structure else None
|
||||||
|
)
|
||||||
elements.append(element)
|
elements.append(element)
|
||||||
|
|
||||||
# -- no core-table is emitted if it's empty (all rows are single-cell rows) --
|
# -- no core-table is emitted if it's empty (all rows are single-cell rows) --
|
||||||
|
|
||||||
# -- emit each trailing single-cell row as its own `Text`-subtype element --
|
# -- emit each trailing single-cell row as its own `Text`-subtype element --
|
||||||
for content in subtable_parser.iter_trailing_single_cell_rows_texts():
|
for content in subtable_parser.iter_trailing_single_cell_rows_texts():
|
||||||
element = _check_content_element_type(str(content))
|
element = _create_element(str(content))
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
elements.append(element)
|
elements.append(element)
|
||||||
|
|
||||||
elements = list(
|
elements = list(
|
||||||
apply_lang_metadata(
|
apply_lang_metadata(
|
||||||
elements=elements,
|
elements=elements,
|
||||||
languages=languages,
|
languages=opts.languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=opts.detect_language_per_element,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
|
class _XlsxPartitionerOptions:
|
||||||
|
"""Encapsulates partitioning option validation, computation, and application of defaults."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
date_from_file_object: bool,
|
||||||
|
detect_language_per_element: bool,
|
||||||
|
file: Optional[IO[bytes]],
|
||||||
|
file_path: Optional[str],
|
||||||
|
find_subtable: bool,
|
||||||
|
include_header: bool,
|
||||||
|
include_metadata: bool,
|
||||||
|
infer_table_structure: bool,
|
||||||
|
languages: Optional[list[str]],
|
||||||
|
metadata_file_path: Optional[str],
|
||||||
|
metadata_last_modified: Optional[str],
|
||||||
|
):
|
||||||
|
self._date_from_file_object = date_from_file_object
|
||||||
|
self._detect_language_per_element = detect_language_per_element
|
||||||
|
self._file = file
|
||||||
|
self._file_path = file_path
|
||||||
|
self._find_subtable = find_subtable
|
||||||
|
self._include_header = include_header
|
||||||
|
self._include_metadata = include_metadata
|
||||||
|
self._infer_table_structure = infer_table_structure
|
||||||
|
self._languages = languages
|
||||||
|
self._metadata_file_path = metadata_file_path
|
||||||
|
self._metadata_last_modified = metadata_last_modified
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def detect_language_per_element(self) -> bool:
|
||||||
|
"""When True, detect language on element-by-element basis instead of document level."""
|
||||||
|
return self._detect_language_per_element
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def find_subtable(self) -> bool:
|
||||||
|
"""True when partitioner should detect and emit separate `Table` elements for subtables.
|
||||||
|
|
||||||
|
A subtable is (roughly) a contiguous rectangle of populated cells bounded by empty rows.
|
||||||
|
"""
|
||||||
|
return self._find_subtable
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def header_row_idx(self) -> int | None:
|
||||||
|
"""The index of the row Pandas should treat as column-headings. Either 0 or None."""
|
||||||
|
return 0 if self._include_header else None
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def include_header(self) -> bool:
|
||||||
|
"""True when column headers should be included in tables."""
|
||||||
|
return self._include_header
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def include_metadata(self) -> bool:
|
||||||
|
"""True when partitioner should apply metadata to emitted elements."""
|
||||||
|
return self._include_metadata
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def infer_table_structure(self) -> bool:
|
||||||
|
"""True when partitioner should compute and apply `text_as_html` metadata."""
|
||||||
|
return self._infer_table_structure
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def languages(self) -> Optional[list[str]]:
|
||||||
|
"""User-specified language(s) of this document.
|
||||||
|
|
||||||
|
When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple
|
||||||
|
language codes indicate text could be in any of those languages.
|
||||||
|
"""
|
||||||
|
return self._languages
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def last_modified(self) -> Optional[str]:
|
||||||
|
"""The best last-modified date available, None if no sources are available."""
|
||||||
|
# -- value explicitly specified by caller takes precedence --
|
||||||
|
if self._metadata_last_modified:
|
||||||
|
return self._metadata_last_modified
|
||||||
|
|
||||||
|
if self._file_path:
|
||||||
|
return get_last_modified_date(self._file_path)
|
||||||
|
|
||||||
|
if self._file:
|
||||||
|
return (
|
||||||
|
get_last_modified_date_from_file(self._file)
|
||||||
|
if self._date_from_file_object
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def metadata_file_path(self) -> str | None:
|
||||||
|
"""The best available file-path for this document or `None` if unavailable."""
|
||||||
|
return self._metadata_file_path or self._file_path
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def sheets(self) -> dict[str, pd.DataFrame]:
|
||||||
|
"""The spreadsheet worksheets, each as a data-frame mapped by sheet-name."""
|
||||||
|
if file_path := self._file_path:
|
||||||
|
return pd.read_excel( # pyright: ignore[reportUnknownMemberType]
|
||||||
|
file_path, sheet_name=None, header=self.header_row_idx
|
||||||
|
)
|
||||||
|
|
||||||
|
if f := self._file:
|
||||||
|
if isinstance(f, SpooledTemporaryFile):
|
||||||
|
f.seek(0)
|
||||||
|
f = io.BytesIO(f.read())
|
||||||
|
return pd.read_excel( # pyright: ignore[reportUnknownMemberType]
|
||||||
|
f, sheet_name=None, header=self.header_row_idx
|
||||||
|
)
|
||||||
|
|
||||||
|
raise ValueError("Either 'filename' or 'file' argument must be specified.")
|
||||||
|
|
||||||
|
|
||||||
class _ConnectedComponent:
|
class _ConnectedComponent:
|
||||||
"""A collection of cells that are "2d-connected" in a worksheet.
|
"""A collection of cells that are "2d-connected" in a worksheet.
|
||||||
|
|
||||||
@ -423,7 +523,7 @@ class _SubtableParser:
|
|||||||
return tuple(reversed(list(iter_trailing_single_cell_row_indices())))
|
return tuple(reversed(list(iter_trailing_single_cell_row_indices())))
|
||||||
|
|
||||||
|
|
||||||
def _check_content_element_type(text: str) -> Element:
|
def _create_element(text: str) -> Element:
|
||||||
"""Create `Text`-subtype document element appropriate to `text`."""
|
"""Create `Text`-subtype document element appropriate to `text`."""
|
||||||
if is_bulleted_text(text):
|
if is_bulleted_text(text):
|
||||||
return ListItem(text=clean_bullets(text))
|
return ListItem(text=clean_bullets(text))
|
||||||
@ -438,20 +538,16 @@ def _check_content_element_type(text: str) -> Element:
|
|||||||
|
|
||||||
|
|
||||||
def _get_metadata(
|
def _get_metadata(
|
||||||
include_metadata: bool = True,
|
sheet_name: str, page_number: int, opts: _XlsxPartitionerOptions
|
||||||
sheet_name: Optional[str] = None,
|
|
||||||
page_number: Optional[int] = -1,
|
|
||||||
filename: Optional[str] = None,
|
|
||||||
last_modification_date: Optional[str] = None,
|
|
||||||
) -> ElementMetadata:
|
) -> ElementMetadata:
|
||||||
"""Returns metadata depending on `include_metadata` flag"""
|
"""Returns metadata depending on `include_metadata` flag"""
|
||||||
if include_metadata:
|
return (
|
||||||
metadata = ElementMetadata(
|
ElementMetadata(
|
||||||
page_name=sheet_name,
|
page_name=sheet_name,
|
||||||
page_number=page_number,
|
page_number=page_number,
|
||||||
filename=filename,
|
filename=opts.metadata_file_path,
|
||||||
last_modified=last_modification_date,
|
last_modified=opts.last_modified,
|
||||||
|
)
|
||||||
|
if opts.include_metadata
|
||||||
|
else ElementMetadata()
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
metadata = ElementMetadata()
|
|
||||||
return metadata
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user