mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00

**Summary** Relax table-segregation rule applied during chunking such that a `Table` and `Text`-subtype elements can be combined into a single chunk when the chunking window allows. **Additional Context** Until now, `Table` elements have always been segregated during chunking, i.e. a chunk that contained a table would never contain any other element. In certain scenarios, especially when a large chunking window of say 2000 characters is used, this behavior can reduce retrieval effectiveness by isolating the table from surrounding context. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
169 lines
8.1 KiB
Python
169 lines
8.1 KiB
Python
"""Test suite for the `unstructured.chunking.basic` module.
|
||
|
||
That module implements the baseline chunking strategy. The baseline strategy has all behaviors
|
||
shared by all chunking strategies and no extra rules like preserving section or page boundaries.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from typing import Any
|
||
|
||
import pytest
|
||
|
||
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
|
||
from unstructured.chunking.basic import chunk_elements
|
||
from unstructured.documents.elements import CompositeElement, Text, Title
|
||
from unstructured.partition.docx import partition_docx
|
||
|
||
|
||
def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_partition_function():
    """Partitioning with `chunking_strategy="basic"` produces basic-strategy chunks.

    Partitions the one-page handbook DOCX example through `partition_docx()` and compares the
    result to the expected sequence of `CompositeElement` chunks. Going through the partitioner
    (rather than calling `chunk_elements()` directly) exercises the chunking decorator.
    """
    # -- expected chunks, in document order --
    expected_chunks = [
        CompositeElement(
            "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION"
            "\n\nA. PURPOSE"
        ),
        CompositeElement(
            "The United States Trustee appoints and supervises standing trustees and monitors and"
            " supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C."
            " § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586,"
            " establishes or clarifies the position of the United States Trustee Program (Program)"
            " on the duties owed by a standing trustee to the debtors, creditors, other parties in"
            " interest, and the United States Trustee. The Handbook does not present a full and"
        ),
        CompositeElement(
            "complete statement of the law; it should not be used as a substitute for legal"
            " research and analysis. The standing trustee must be familiar with relevant"
            " provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules),"
            " any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586,"
            " 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips"
            " identified in this Handbook but these are not considered mandatory."
        ),
        CompositeElement(
            "Nothing in this Handbook should be construed to excuse the standing trustee from"
            " complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and"
            " orders of the court. The standing trustee should notify the United States Trustee"
            " whenever the provision of the Handbook conflicts with the local rules or orders of"
            " the court. The standing trustee is accountable for all duties set forth in this"
            " Handbook, but need not personally perform any duty unless otherwise indicated. All"
        ),
        CompositeElement(
            "statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101"
            " et seq., unless otherwise indicated."
        ),
        CompositeElement(
            "This Handbook does not create additional rights against the standing trustee or"
            " United States Trustee in favor of other parties.\n\nB. ROLE OF THE UNITED STATES"
            " TRUSTEE"
        ),
        CompositeElement(
            "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the"
            " responsibilities for daytoday administration of cases. Debtors, creditors, and"
            " third parties with adverse interests to the trustee were concerned that the court,"
            " which previously appointed and supervised the trustee, would not impartially"
            " adjudicate their rights as adversaries of that trustee. To address these concerns,"
            " judicial and administrative functions within the bankruptcy system were bifurcated."
        ),
        CompositeElement(
            "Many administrative functions formerly performed by the court were placed within the"
            " Department of Justice through the creation of the Program. Among the administrative"
            " functions assigned to the United States Trustee were the appointment and supervision"
            " of chapter 13 trustees./ This Handbook is issued under the authority of the"
            " Program’s enabling statutes.\n\nC. STATUTORY DUTIES OF A STANDING TRUSTEE"
        ),
        CompositeElement(
            "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The"
            " standing trustee is more than a mere disbursing agent. The standing trustee must"
            " be personally involved in the trustee operation. If the standing trustee is or"
            " becomes unable to perform the duties and responsibilities of a standing trustee,"
            " the standing trustee must immediately advise the United States Trustee."
            " 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)."
        ),
        CompositeElement(
            "Although this Handbook is not intended to be a complete statutory reference, the"
            " standing trustee’s primary statutory duties are set forth in 11 U.S.C. § 1302, which"
            " incorporates by reference some of the duties of chapter 7 trustees found in"
            " 11 U.S.C. § 704. These duties include, but are not limited to, the"
            " following:\n\nCopyright"
        ),
    ]

    actual_chunks = partition_docx("example-docs/handbook-1p.docx", chunking_strategy="basic")

    assert actual_chunks == expected_chunks
|
||
|
||
|
||
def test_it_chunks_elements_when_the_user_already_has_them():
    """`chunk_elements()` can be called directly on already-partitioned elements.

    With `max_characters=64` the title becomes its own chunk and the longer text is divided
    into two chunks, the division falling between words.
    """
    heading = Title("Introduction")
    paragraph = Text(
        # --------------------------------------------------------- 64 -v
        "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
        " porta volutpat.",
    )
    expected_chunks = [
        CompositeElement("Introduction"),
        # -- the 64-character limit lands inside "rhoncus"; the split backs up to the
        # -- preceding word boundary instead of cutting the word --
        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing elit. In"),
        CompositeElement("rhoncus ipsum sed lectus porta volutpat."),
    ]

    chunks = chunk_elements([heading, paragraph], max_characters=64)

    assert chunks == expected_chunks
|
||
|
||
|
||
def test_it_includes_original_elements_as_metadata_when_requested():
    """Each chunk records its source elements in `.metadata.orig_elements` when asked to.

    Three elements chunked with `max_characters=70` and `include_orig_elements=True` produce
    two chunks: the first combines the title with the first text element, the second holds the
    remaining text element.
    """
    title = Title("Introduction")
    first_para = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    second_para = Text("In rhoncus ipsum sed lectus porta volutpat.")

    chunks = chunk_elements(
        [title, first_para, second_para], max_characters=70, include_orig_elements=True
    )

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks
    # -- first chunk: title and first paragraph, joined by a blank line --
    assert first_chunk == CompositeElement(
        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
    )
    assert first_chunk.metadata.orig_elements == [title, first_para]
    # -- second chunk: the remaining paragraph on its own --
    assert second_chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
    assert second_chunk.metadata.orig_elements == [second_para]
|
||
|
||
|
||
# ------------------------------------------------------------------------------------------------
|
||
# UNIT TESTS
|
||
# ------------------------------------------------------------------------------------------------
|
||
|
||
|
||
class Describe_chunk_elements:
    """Unit-test suite for `unstructured.chunking.basic.chunk_elements()` function."""

    @pytest.mark.parametrize(
        ("kwargs", "expected_value"),
        [
            ({"include_orig_elements": True}, True),
            ({"include_orig_elements": False}, False),
            ({"include_orig_elements": None}, True),
            ({}, True),
        ],
    )
    def it_supports_the_include_orig_elements_option(
        self, kwargs: dict[str, Any], expected_value: bool, _chunk_elements_: Mock
    ):
        """`include_orig_elements` is accepted and reaches the options passed downstream.

        The parametrized cases cover an explicit `True`, an explicit `False`, `None`, and
        omitting the argument; the last two are expected to resolve to `True`.
        """
        # -- an unknown keyword argument would raise here, so this call also proves
        # -- `chunk_elements()` accepts "include_orig_elements" --
        chunk_elements([], **kwargs)

        # -- the mocked `_chunk_elements()` receives exactly two positional args; the
        # -- second is the options object under inspection --
        _elements, chunking_opts = _chunk_elements_.call_args.args
        assert chunking_opts.include_orig_elements is expected_value

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _chunk_elements_(self, request: FixtureRequest):
        """Mock replacing `unstructured.chunking.basic._chunk_elements` for the test."""
        mock_target = "unstructured.chunking.basic._chunk_elements"
        return function_mock(request, mock_target)
|