diff --git a/CHANGELOG.md b/CHANGELOG.md
index 539ac63a8..02ef78a2b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.2-dev6
+## 0.15.2-dev7
### Enhancements
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index ab90d38d1..d5e7d6c5b 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -4,7 +4,7 @@
from __future__ import annotations
-from typing import Any, Optional, Sequence
+from typing import Any, Sequence
import pytest
@@ -65,7 +65,7 @@ class DescribeChunkingOptions:
("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)]
)
def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining(
- self, combine_text_under_n_chars: Optional[int], expected_value: int
+ self, combine_text_under_n_chars: int | None, expected_value: int
):
"""Subclasses can store `combine_text_under_n_chars` but must validate and enable it.
@@ -153,107 +153,6 @@ class DescribeChunkingOptions:
assert ChunkingOptions().text_separator == "\n\n"
-class Describe_TextSplitter:
- """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
-
- def it_splits_on_a_preferred_separator_when_it_can(self):
- opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
- split = _TextSplitter(opts)
- text = (
- "Lorem ipsum dolor amet consectetur adipiscing. \n "
- "In rhoncus ipsum sed lectus porta."
- )
-
- s, remainder = split(text)
-
- # -- trailing whitespace is stripped from split --
- assert s == "Lorem ipsum dolor amet consectetur adipiscing."
- # -- leading whitespace is stripped from remainder
- # -- overlap is separated by single space
- # -- overlap-prefix is computed on arbitrary character boundary
- # -- overlap-prefix len includes space separator (text portion is one less than specified)
- assert remainder == "ipiscing. In rhoncus ipsum sed lectus porta."
- # --
- s, remainder = split(remainder)
- assert s == "ipiscing. In rhoncus ipsum sed lectus porta."
- assert remainder == ""
-
- def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
- opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
- split = _TextSplitter(opts)
- text = (
- "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
- " volutpat."
- )
-
- s, remainder = split(text)
- assert s == "Lorem ipsum dolor amet consectetur"
- assert remainder == "nsectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat."
- # --
- s, remainder = split(remainder)
- assert s == "nsectetur adipiscing. In rhoncus ipsum"
- assert remainder == "cus ipsum sed lectus porta volutpat."
- # --
- s, remainder = split(remainder)
- assert s == "cus ipsum sed lectus porta volutpat."
- assert remainder == ""
-
- def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
- opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
- split = _TextSplitter(opts)
- text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
-
- s, remainder = split(text)
- assert s == "Loremipsumdolorametconsectetur"
- assert remainder == "onsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
- # --
- s, remainder = split(remainder)
- assert s == "onsecteturadipiscingelit. In"
- assert remainder == "gelit. In rhoncus ipsum sed lectus porta."
- # --
- s, remainder = split(remainder)
- assert s == "gelit. In rhoncus ipsum sed"
- assert remainder == "ipsum sed lectus porta."
-
- @pytest.mark.parametrize(
- "text",
- [
- "Lorem ipsum dolor amet consectetur adipiscing.", # 46-chars
- "Lorem ipsum dolor.", # 18-chars
- ],
- )
- def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
- opts = ChunkingOptions(max_characters=46, overlap=10)
- split = _TextSplitter(opts)
-
- s, remainder = split(text)
-
- assert s == text
- assert remainder == ""
-
- def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
- opts = ChunkingOptions(max_characters=38, overlap=10)
- split = _TextSplitter(opts)
- text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
-
- s, _ = split(text)
-
- assert s == "Loremipsumdolorametconsecteturadipisci"
- assert len(s) == 38
-
- @pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
- def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
- opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
- split = _TextSplitter(opts)
- text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
- # |-------------------------------------------------^ 50-chars
-
- s, remainder = split(text)
-
- assert s == "Lorem ipsum dolor amet consectetur adipiscing."
- assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
-
-
# ================================================================================================
# PRE-CHUNKER
# ================================================================================================
@@ -305,6 +204,169 @@ class DescribePreChunker:
next(pre_chunk_iter)
+class DescribePreChunkBuilder:
+ """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
+
+ def it_is_empty_on_construction(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
+
+ assert builder._text_length == 0
+ assert builder._remaining_space == 50
+
+ def it_accumulates_elements_added_to_it(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+
+ builder.add_element(Title("Introduction"))
+ assert builder._text_length == 12
+ assert builder._remaining_space == 136
+
+ builder.add_element(
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ )
+ assert builder._text_length == 112
+ assert builder._remaining_space == 36
+
+ @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
+ def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ assert builder.will_fit(element)
+
+ @pytest.mark.parametrize(
+ ("existing_element", "next_element"),
+ [
+ (Text("abcd"), Table("Fruits\nMango")),
+ (Text("abcd"), Text("abcd " * 200)),
+ (Table("Heading\nCell text"), Table("Fruits\nMango")),
+ (Table("Heading\nCell text"), Text("abcd " * 200)),
+ ],
+ )
+ def but_not_when_it_already_contains_an_element_of_any_kind(
+ self, existing_element: Element, next_element: Element
+ ):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ builder.add_element(existing_element)
+
+ assert not builder.will_fit(next_element)
+
+ @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
+ def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ builder.add_element(Table("Heading\nCell text"))
+
+ assert not builder.will_fit(element)
+
+ def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
+ builder.add_element(
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
+ )
+
+ assert not builder.will_fit(Text("In rhoncus ipsum."))
+
+ def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
+ builder.add_element(
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
+ )
+
+ # -- 55 + 2 (separator) + 44 == 101 --
+ assert not builder.will_fit(
+ Text("In rhoncus ipsum sed lectus portos volutpat.") # 44-chars
+ )
+
+ def but_it_will_fit_an_element_that_fits(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
+ builder.add_element(
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
+ )
+
+ # -- 55 + 2 (separator) + 43 == 100 --
+ assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
+
+ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+ builder.add_element(Title("Introduction"))
+ builder.add_element(
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ )
+
+ pre_chunk = next(builder.flush())
+
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Introduction"),
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ ]
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+
+ def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+ builder.add_element(Table("Heading\nCell text"))
+
+ pre_chunk = next(builder.flush())
+
+ # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
+ # -- be exhausted before clearing out the old elements and a new pre-chunk can be
+ # -- accumulated immediately (first `next()` call is required however, to advance to the
+ # -- yield statement).
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+ # -- pre-chunk is a `TablePreChunk` --
+ assert isinstance(pre_chunk, TablePreChunk)
+ assert pre_chunk._table == Table("Heading\nCell text")
+
+ def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+
+ pre_chunks = list(builder.flush())
+
+ assert pre_chunks == []
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+
+ def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
+ opts = ChunkingOptions(overlap=15, overlap_all=True)
+ builder = PreChunkBuilder(opts=opts)
+
+ builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
+ pre_chunk = list(builder.flush())[0]
+
+ assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit."
+
+ builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat."))
+ pre_chunk = list(builder.flush())[0]
+
+ assert pre_chunk._text == "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
+
+ builder.add_element(Text("Donec semper facilisis metus finibus."))
+ pre_chunk = list(builder.flush())[0]
+
+ assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
+
+ def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
+ builder.add_element(Text("abcde"))
+ builder.add_element(Text("fghij"))
+
+ # -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
+ # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
+ assert builder._text_length == 12
+ # -- ._remaining_space is reduced by the length (2) of the trailing separator which would
+ # -- go between the current text and that of the next element if one was added.
+ # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
+ assert builder._remaining_space == 36
+
+
# ================================================================================================
# PRE-CHUNK SUBTYPES
# ================================================================================================
@@ -1032,171 +1094,114 @@ class DescribeTextPreChunk:
# ================================================================================================
-# PRE-CHUNKING ACCUMULATORS
+# PRE-CHUNK SPLITTERS
# ================================================================================================
-class DescribePreChunkBuilder:
- """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
+class Describe_TextSplitter:
+ """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
- def it_is_empty_on_construction(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
-
- assert builder._text_length == 0
- assert builder._remaining_space == 50
-
- def it_accumulates_elements_added_to_it(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
-
- builder.add_element(Title("Introduction"))
- assert builder._text_length == 12
- assert builder._remaining_space == 136
-
- builder.add_element(
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
+ def it_splits_on_a_preferred_separator_when_it_can(self):
+ opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
+ split = _TextSplitter(opts)
+ text = (
+ "Lorem ipsum dolor amet consectetur adipiscing. \n "
+ "In rhoncus ipsum sed lectus porta."
)
- assert builder._text_length == 112
- assert builder._remaining_space == 36
- @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
- def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
- builder = PreChunkBuilder(opts=ChunkingOptions())
- assert builder.will_fit(element)
+ s, remainder = split(text)
+
+ # -- trailing whitespace is stripped from split --
+ assert s == "Lorem ipsum dolor amet consectetur adipiscing."
+ # -- leading whitespace is stripped from remainder
+ # -- overlap is separated by single space
+ # -- overlap-prefix is computed on arbitrary character boundary
+ # -- overlap-prefix len includes space separator (text portion is one less than specified)
+ assert remainder == "ipiscing. In rhoncus ipsum sed lectus porta."
+ # --
+ s, remainder = split(remainder)
+ assert s == "ipiscing. In rhoncus ipsum sed lectus porta."
+ assert remainder == ""
+
+ def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
+ opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
+ split = _TextSplitter(opts)
+ text = (
+ "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
+ " volutpat."
+ )
+
+ s, remainder = split(text)
+ assert s == "Lorem ipsum dolor amet consectetur"
+ assert remainder == "nsectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat."
+ # --
+ s, remainder = split(remainder)
+ assert s == "nsectetur adipiscing. In rhoncus ipsum"
+ assert remainder == "cus ipsum sed lectus porta volutpat."
+ # --
+ s, remainder = split(remainder)
+ assert s == "cus ipsum sed lectus porta volutpat."
+ assert remainder == ""
+
+ def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
+ opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
+ split = _TextSplitter(opts)
+ text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
+
+ s, remainder = split(text)
+ assert s == "Loremipsumdolorametconsectetur"
+ assert remainder == "onsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
+ # --
+ s, remainder = split(remainder)
+ assert s == "onsecteturadipiscingelit. In"
+ assert remainder == "gelit. In rhoncus ipsum sed lectus porta."
+ # --
+ s, remainder = split(remainder)
+ assert s == "gelit. In rhoncus ipsum sed"
+ assert remainder == "ipsum sed lectus porta."
@pytest.mark.parametrize(
- ("existing_element", "next_element"),
+ "text",
[
- (Text("abcd"), Table("Fruits\nMango")),
- (Text("abcd"), Text("abcd " * 200)),
- (Table("Heading\nCell text"), Table("Fruits\nMango")),
- (Table("Heading\nCell text"), Text("abcd " * 200)),
+ "Lorem ipsum dolor amet consectetur adipiscing.", # 46-chars
+ "Lorem ipsum dolor.", # 18-chars
],
)
- def but_not_when_it_already_contains_an_element_of_any_kind(
- self, existing_element: Element, next_element: Element
- ):
- builder = PreChunkBuilder(opts=ChunkingOptions())
- builder.add_element(existing_element)
+ def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
+ opts = ChunkingOptions(max_characters=46, overlap=10)
+ split = _TextSplitter(opts)
- assert not builder.will_fit(next_element)
+ s, remainder = split(text)
- @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
- def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
- builder = PreChunkBuilder(opts=ChunkingOptions())
- builder.add_element(Table("Heading\nCell text"))
+ assert s == text
+ assert remainder == ""
- assert not builder.will_fit(element)
+ def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
+ opts = ChunkingOptions(max_characters=38, overlap=10)
+ split = _TextSplitter(opts)
+ text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
- def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
- builder.add_element(
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
- )
+ s, _ = split(text)
- assert not builder.will_fit(Text("In rhoncus ipsum."))
+ assert s == "Loremipsumdolorametconsecteturadipisci"
+ assert len(s) == 38
- def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
- builder.add_element(
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
- )
+ @pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
+ def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
+ opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
+ split = _TextSplitter(opts)
+ text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
+ # |-------------------------------------------------^ 50-chars
- # -- 55 + 2 (separator) + 44 == 101 --
- assert not builder.will_fit(
- Text("In rhoncus ipsum sed lectus portos volutpat.") # 44-chars
- )
+ s, remainder = split(text)
- def but_it_will_fit_an_element_that_fits(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
- builder.add_element(
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
- )
+ assert s == "Lorem ipsum dolor amet consectetur adipiscing."
+ assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
- # -- 55 + 2 (separator) + 43 == 100 --
- assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
- def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
- builder.add_element(Title("Introduction"))
- builder.add_element(
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
- )
-
- pre_chunk = next(builder.flush())
-
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Introduction"),
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
- ]
- assert builder._text_length == 0
- assert builder._remaining_space == 150
-
- def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
- builder.add_element(Table("Heading\nCell text"))
-
- pre_chunk = next(builder.flush())
-
- # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
- # -- be exhausted before clearing out the old elements and a new pre-chunk can be
- # -- accumulated immediately (first `next()` call is required however, to advance to the
- # -- yield statement).
- assert builder._text_length == 0
- assert builder._remaining_space == 150
- # -- pre-chunk is a `TablePreChunk` --
- assert isinstance(pre_chunk, TablePreChunk)
- assert pre_chunk._table == Table("Heading\nCell text")
-
- def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
-
- pre_chunks = list(builder.flush())
-
- assert pre_chunks == []
- assert builder._text_length == 0
- assert builder._remaining_space == 150
-
- def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
- opts = ChunkingOptions(overlap=15, overlap_all=True)
- builder = PreChunkBuilder(opts=opts)
-
- builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
- pre_chunk = list(builder.flush())[0]
-
- assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit."
-
- builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat."))
- pre_chunk = list(builder.flush())[0]
-
- assert pre_chunk._text == "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
-
- builder.add_element(Text("Donec semper facilisis metus finibus."))
- pre_chunk = list(builder.flush())[0]
-
- assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
-
- def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
- builder.add_element(Text("abcde"))
- builder.add_element(Text("fghij"))
-
- # -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
- # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
- assert builder._text_length == 12
- # -- ._remaining_space is reduced by the length (2) of the trailing separator which would
- # -- go between the current text and that of the next element if one was added.
- # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
- assert builder._remaining_space == 36
+# ================================================================================================
+# PRE-CHUNK COMBINER
+# ================================================================================================
class DescribePreChunkCombiner:
diff --git a/test_unstructured/common/__init__.py b/test_unstructured/common/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py
new file mode 100644
index 000000000..618277449
--- /dev/null
+++ b/test_unstructured/common/test_html_table.py
@@ -0,0 +1,33 @@
+"""Unit-test suite for the `unstructured.common.html_table` module."""
+
+from __future__ import annotations
+
+from unstructured.common.html_table import htmlify_matrix_of_cell_texts
+
+
+class Describe_htmlify_matrix_of_cell_texts:
+ """Unit-test suite for `unstructured.common.html_table.htmlify_matrix_of_cell_texts()`."""
+
+ def test_htmlify_matrix_handles_empty_cells(self):
+ assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
+ "<table>"
+ "<tr><td>cell1</td><td/><td>cell3</td></tr>"
+ "<tr><td/><td>cell5</td><td/></tr>"
+ "</table>"
+ )
+
+ def test_htmlify_matrix_handles_special_characters(self):
+ assert htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
+ "<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
+ )
+
+ def test_htmlify_matrix_handles_multiple_rows_and_cells(self):
+ assert htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
+ "<table>"
+ "<tr><td>cell1</td><td>cell2</td></tr>"
+ "<tr><td>cell3</td><td>cell4</td></tr>"
+ "</table>"
+ )
+
+ def test_htmlify_matrix_handles_empty_matrix(self):
+ assert htmlify_matrix_of_cell_texts([]) == ""
diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py
index b87582161..8d8f5a7eb 100644
--- a/test_unstructured/test_utils.py
+++ b/test_unstructured/test_utils.py
@@ -339,30 +339,6 @@ def test_validate_date_args_raises_for_invalid_formats(date):
assert utils.validate_date_args(date)
-def test_htmlify_matrix_handles_empty_cells():
- assert utils.htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
- ""
- )
-
-
-def test_htmlify_matrix_handles_special_characters():
- assert utils.htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
- ""
- )
-
-
-def test_htmlify_matrix_handles_multiple_rows_and_cells():
- assert utils.htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
- "| cell1 | cell2 |
"
- "| cell3 | cell4 |
"
- )
-
-
-def test_htmlify_matrix_handles_empty_matrix():
- assert utils.htmlify_matrix_of_cell_texts([]) == ""
-
-
def test_only_returns_singleton_iterable():
singleton_iterable = [42]
result = utils.only(singleton_iterable)
diff --git a/typings/lxml/_types.pyi b/typings/lxml/_types.pyi
index 615c09e5c..377a44f64 100644
--- a/typings/lxml/_types.pyi
+++ b/typings/lxml/_types.pyi
@@ -6,7 +6,7 @@ from typing import Any, Callable, Collection, Protocol, TypeVar
from typing_extensions import TypeAlias
-from .etree import QName, _Element, _ElementTree
+from .etree import HTMLParser, QName, XMLParser, _Element, _ElementTree
_ET = TypeVar("_ET", bound=_Element, default=_Element)
_ET_co = TypeVar("_ET_co", bound=_Element, default=_Element, covariant=True)
@@ -30,5 +30,8 @@ _TextArg: TypeAlias = str | bytes | QName
_XPathObject = Any
+# The basic parsers bundled in lxml.etree
+_DefEtreeParsers = XMLParser[_ET_co] | HTMLParser[_ET_co]
+
class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...
diff --git a/typings/lxml/etree/_element.pyi b/typings/lxml/etree/_element.pyi
index 7afd99601..b5a91abaa 100644
--- a/typings/lxml/etree/_element.pyi
+++ b/typings/lxml/etree/_element.pyi
@@ -2,11 +2,12 @@
from __future__ import annotations
-from typing import Collection, Generic, Iterator, TypeVar, overload
+from typing import Collection, Generic, Iterable, Iterator, TypeVar, overload
from typing_extensions import Self
from .. import _types as _t
+from ._module_misc import CDATA, QName
_T = TypeVar("_T")
@@ -23,6 +24,12 @@ class _Element:
def get(self, key: _t._AttrName) -> str | None: ...
@overload
def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
+ @overload
+ def iter(self, *tags: _t._TagSelector) -> Iterator[Self]: ...
+ @overload
+ def iter(
+ self, *, tag: _t._TagSelector | Iterable[_t._TagSelector] | None = None
+ ) -> Iterator[Self]: ...
def iterancestors(
self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
) -> Iterator[Self]: ...
@@ -39,8 +46,12 @@ class _Element:
def tag(self) -> str: ...
@property
def tail(self) -> str | None: ...
+ @tail.setter
+ def tail(self, value: str | CDATA | None) -> None: ...
@property
def text(self) -> str | None: ...
+ @text.setter
+ def text(self, value: str | QName | CDATA | None) -> None: ...
def xpath(
self,
_path: str,
diff --git a/typings/lxml/etree/_module_misc.pyi b/typings/lxml/etree/_module_misc.pyi
index 9da021f0c..3b758d055 100644
--- a/typings/lxml/etree/_module_misc.pyi
+++ b/typings/lxml/etree/_module_misc.pyi
@@ -2,4 +2,7 @@
from __future__ import annotations
+class CDATA:
+ def __init__(self, data: str) -> None: ...
+
class QName: ...
diff --git a/typings/lxml/etree/_parser.pyi b/typings/lxml/etree/_parser.pyi
index df0c67470..41e090283 100644
--- a/typings/lxml/etree/_parser.pyi
+++ b/typings/lxml/etree/_parser.pyi
@@ -1,8 +1,16 @@
+# pyright: reportPrivateUsage=false
+
from __future__ import annotations
+from typing import Generic
+
+from .._types import _ET_co
from ._classlookup import ElementClassLookup
-class HTMLParser:
+# Includes most stuff in _BaseParser
+class _FeedParser(Generic[_ET_co]): ...
+
+class HTMLParser(_FeedParser[_ET_co]):
def __init__(
self,
*,
@@ -20,7 +28,7 @@ class HTMLParser:
) -> None: ...
def set_element_class_lookup(self, lookup: ElementClassLookup | None = None) -> None: ...
-class XMLParser:
+class XMLParser(_FeedParser[_ET_co]):
def __init__(
self,
*,
diff --git a/typings/lxml/html/__init__.pyi b/typings/lxml/html/__init__.pyi
new file mode 100644
index 000000000..e3e41e95f
--- /dev/null
+++ b/typings/lxml/html/__init__.pyi
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from ._element import (
+ HtmlElement as HtmlElement,
+)
+from ._parse import (
+ fragment_fromstring as fragment_fromstring,
+)
diff --git a/typings/lxml/html/_parse.pyi b/typings/lxml/html/_parse.pyi
new file mode 100644
index 000000000..162453da1
--- /dev/null
+++ b/typings/lxml/html/_parse.pyi
@@ -0,0 +1,20 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .._types import _DefEtreeParsers
+from ._element import HtmlElement
+
+if TYPE_CHECKING:
+ from typing_extensions import TypeAlias
+
+_HtmlElemParser: TypeAlias = _DefEtreeParsers[HtmlElement]
+
+def fragment_fromstring(
+ html: str,
+ create_parent: bool = False,
+ base_url: str | None = None,
+ parser: _HtmlElemParser | None = None,
+) -> HtmlElement: ...
diff --git a/typings/pandas/__init__.pyi b/typings/pandas/__init__.pyi
index cc25d44ab..4f822bbeb 100644
--- a/typings/pandas/__init__.pyi
+++ b/typings/pandas/__init__.pyi
@@ -1,8 +1,5 @@
from __future__ import annotations
-from pandas.core.api import (
- DataFrame as DataFrame,
-)
-from pandas.io.api import (
- read_csv as read_csv,
-)
+from pandas.core.api import DataFrame as DataFrame
+from pandas.io.api import read_csv as read_csv
+from pandas.io.api import read_excel as read_excel
diff --git a/typings/pandas/_typing.pyi b/typings/pandas/_typing.pyi
new file mode 100644
index 000000000..329f02852
--- /dev/null
+++ b/typings/pandas/_typing.pyi
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from os import PathLike
+from typing import Protocol, TypeVar
+
+from typing_extensions import TypeAlias
+
+AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
+FilePath: TypeAlias = str | PathLike[str]
+S1 = TypeVar("S1")
+
+class BaseBuffer(Protocol):
+ @property
+ def mode(self) -> str: ...
+ def seek(self, __offset: int, __whence: int = ...) -> int: ...
+ def seekable(self) -> bool: ...
+ def tell(self) -> int: ...
+
+class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]):
+ def read(self, __n: int = ...) -> AnyStr_cov: ...
diff --git a/typings/pandas/core/frame.pyi b/typings/pandas/core/frame.pyi
index ea4c9f5f2..21bb0f3f5 100644
--- a/typings/pandas/core/frame.pyi
+++ b/typings/pandas/core/frame.pyi
@@ -1,9 +1,29 @@
+# pyright: reportPrivateUsage=false
+
from __future__ import annotations
+from typing import Any, Hashable, Iterable
+
+from pandas.core.indexing import _iLocIndexer
+from pandas.core.series import Series
+
class DataFrame:
+ def __getitem__(self, key: Iterable[Hashable] | slice) -> DataFrame: ...
+ def __len__(self) -> int: ...
+ @property
+ def T(self) -> DataFrame: ...
+ @property
+ def iloc(self) -> _iLocIndexerFrame: ...
+ def isna(self) -> DataFrame: ...
+ def iterrows(self) -> Iterable[tuple[Hashable, Series[Any]]]: ...
+ @property
+ def shape(self) -> tuple[int, int]: ...
def to_html(
self,
index: bool = ...,
header: bool = ...,
na_rep: str = ...,
) -> str: ...
+
+class _iLocIndexerFrame(_iLocIndexer):
+ def __getitem__(self, idx: Any) -> DataFrame: ...
diff --git a/typings/pandas/io/api.pyi b/typings/pandas/io/api.pyi
index 0c267998d..7fd6e4ec7 100644
--- a/typings/pandas/io/api.pyi
+++ b/typings/pandas/io/api.pyi
@@ -1,5 +1,4 @@
from __future__ import annotations
-from pandas.io.parsers import (
- read_csv as read_csv,
-)
+from pandas.io.excel import read_excel as read_excel
+from pandas.io.parsers import read_csv as read_csv
diff --git a/typings/pandas/io/excel/__init__.pyi b/typings/pandas/io/excel/__init__.pyi
new file mode 100644
index 000000000..4e157a8fa
--- /dev/null
+++ b/typings/pandas/io/excel/__init__.pyi
@@ -0,0 +1 @@
+from pandas.io.excel._base import read_excel as read_excel
diff --git a/typings/pandas/io/excel/_base.pyi b/typings/pandas/io/excel/_base.pyi
new file mode 100644
index 000000000..959d02cef
--- /dev/null
+++ b/typings/pandas/io/excel/_base.pyi
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from typing import Sequence
+
+from pandas._typing import FilePath, ReadBuffer
+from pandas.core.frame import DataFrame
+
+def read_excel(
+ io: FilePath | ReadBuffer[bytes],
+ sheet_name: None,
+ *,
+ header: int | Sequence[int] | None = ...,
+) -> dict[str, DataFrame]: ...
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index b17a9bb28..56ad20d66 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.2-dev6" # pragma: no cover
+__version__ = "0.15.2-dev7" # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index 7f28dd3b3..5d735d626 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -4,7 +4,7 @@ from __future__ import annotations
import collections
import copy
-from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, cast
+from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
import regex
from typing_extensions import Self, TypeAlias
@@ -21,14 +21,16 @@ from unstructured.documents.elements import (
)
from unstructured.utils import lazyproperty
-# -- CONSTANTS -----------------------------------
+# ================================================================================================
+# MODEL
+# ================================================================================================
CHUNK_MAX_CHARS_DEFAULT: int = 500
"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
Provided for reference only, for example so the ingest CLI can advertise the default value in its
UI. External chunking-related functions (e.g. in ingest or decorators) should use
-`max_characters: Optional[int] = None` and not apply this default themselves. Only
+`max_characters: int | None = None` and not apply this default themselves. Only
`ChunkingOptions.max_characters` should apply a default value.
"""
@@ -38,9 +40,6 @@ CHUNK_MULTI_PAGE_DEFAULT: bool = True
Only operative for "by_title" chunking strategy.
"""
-
-# -- TYPES ---------------------------------------
-
BoundaryPredicate: TypeAlias = Callable[[Element], bool]
"""Detects when element represents crossing a semantic boundary like section or page."""
@@ -237,122 +236,6 @@ class ChunkingOptions:
)
-class _TextSplitter:
- """Provides a text-splitting function configured on construction.
-
- Text is split on the best-available separator, falling-back from the preferred separator
- through a sequence of alternate separators.
-
- - The separator is removed by splitting so only whitespace strings are suitable separators.
- - A "blank-line" ("\n\n") is unlikely to occur in an element as it would have been used as an
- element boundary during partitioning.
-
- This is a *callable* object. Constructing it essentially produces a function:
-
- split = _TextSplitter(opts)
- fragment, remainder = split(s)
-
- This allows it to be configured with length-options etc. on construction and used throughout a
- chunking operation on a given element-stream.
- """
-
- def __init__(self, opts: ChunkingOptions):
- self._opts = opts
-
- def __call__(self, s: str) -> tuple[str, str]:
- """Return pair of strings split from `s` on the best match of configured patterns.
-
- The first string is the split, the second is the remainder of the string. The split string
- will never be longer than `maxlen`. The separators are tried in order until a match is
- found. The last separator is "" which matches between any two characters so there will
- always be a split.
-
- The separator is removed and does not appear in the split or remainder.
-
- An `s` that is already less than the maximum length is returned unchanged with no remainder.
- This allows this function to be called repeatedly with the remainder until it is consumed
- and returns a remainder of "".
- """
- maxlen = self._opts.hard_max
-
- if len(s) <= maxlen:
- return s, ""
-
- for p, sep_len in self._patterns:
- # -- length of separator must be added to include that separator when it happens to be
- # -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
- fragment, remainder = self._split_from_maxlen(p, sep_len, s)
- if (
- # -- no available split with this separator --
- not fragment
- # -- split did not progress, consuming part of the string --
- or len(remainder) >= len(s)
- ):
- continue
- return fragment.rstrip(), remainder.lstrip()
-
- # -- the terminal "" pattern is not actually executed via regex since its implementation is
- # -- trivial and provides a hard back-stop here in this method. No separator is used between
- # -- tail and remainder on arb-char split.
- return s[:maxlen].rstrip(), s[maxlen - self._opts.overlap :].lstrip()
-
- @lazyproperty
- def _patterns(self) -> tuple[tuple[regex.Pattern[str], int], ...]:
- """Sequence of (pattern, len) pairs to match against.
-
- Patterns appear in order of preference, those following are "fall-back" patterns to be used
- if no match of a prior pattern is found.
-
- NOTE these regexes search *from the end of the string*, which is what the "(?r)" bit
- specifies. This is much more efficient than starting at the beginning of the string which
- could result in hundreds of matches before the desired one.
- """
- separators = self._opts.text_splitting_separators
- return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
-
- def _split_from_maxlen(
- self, pattern: regex.Pattern[str], sep_len: int, s: str
- ) -> tuple[str, str]:
- """Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
-
- Returns `"", s` if no suitable match was found. Also returns `"", s` if splitting on this
- separator produces a split shorter than the required overlap (which would produce an
- infinite loop).
-
- `split` will never be longer than `maxlen` and there is no longer split available using
- `pattern`.
-
- The separator is removed and does not appear in either the split or remainder.
- """
- maxlen, overlap = self._opts.hard_max, self._opts.overlap
-
- # -- A split not longer than overlap will not progress (infinite loop). On the right side,
- # -- need to extend search range to include a separator located exactly at maxlen.
- match = pattern.search(s, pos=overlap + 1, endpos=maxlen + sep_len)
- if match is None:
- return "", s
-
- # -- characterize match location
- match_start, match_end = match.span()
- # -- matched separator is replaced by single-space in overlap string --
- separator = " "
-
- # -- in multi-space situation, fragment may have trailing whitespace because match is from
- # -- right to left
- fragment = s[:match_start].rstrip()
- # -- remainder can have leading space when match is on "\n" followed by spaces --
- raw_remainder = s[match_end:].lstrip()
-
- if overlap <= len(separator):
- return fragment, raw_remainder
-
- # -- compute overlap --
- tail_len = overlap - len(separator)
- tail = fragment[-tail_len:].lstrip()
- overlapped_remainder = tail + separator + raw_remainder
- return fragment, overlapped_remainder
-
-
# ================================================================================================
# PRE-CHUNKER
# ================================================================================================
@@ -428,6 +311,121 @@ class PreChunker:
return any(semantic_boundaries)
+class PreChunkBuilder:
+ """An element accumulator suitable for incrementally forming a pre-chunk.
+
+ Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
+ the next element in the element stream.
+
+ `.flush()` is used to build a PreChunk object from the accumulated elements. This method
+ returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
+ used like so:
+
+ yield from builder.flush()
+
+ If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
+ clears the elements it contains so it is ready to build the next pre-chunk.
+ """
+
+ def __init__(self, opts: ChunkingOptions) -> None:
+ self._opts = opts
+ self._separator_len = len(opts.text_separator)
+ self._elements: list[Element] = []
+
+ # -- overlap is only between pre-chunks so starts empty --
+ self._overlap_prefix: str = ""
+ # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
+ self._text_segments: list[str] = []
+ # -- combined length of text-segments, not including separators --
+ self._text_len: int = 0
+
+ def add_element(self, element: Element) -> None:
+ """Add `element` to this section."""
+ self._elements.append(element)
+ if element.text:
+ self._text_segments.append(element.text)
+ self._text_len += len(element.text)
+
+ def flush(self) -> Iterator[PreChunk]:
+ """Generate zero-or-one `PreChunk` object and clear the accumulator.
+
+ Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
+ boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
+ stream.
+ """
+ if not self._elements:
+ return
+
+ pre_chunk = (
+ TablePreChunk(self._elements[0], self._overlap_prefix, self._opts)
+ if isinstance(self._elements[0], Table)
+ # -- copy list, don't use original or it may change contents as builder proceeds --
+ else TextPreChunk(list(self._elements), self._overlap_prefix, self._opts)
+ )
+ # -- clear builder before yield so we're not sensitive to the timing of how/when this
+ # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
+ self._reset_state(pre_chunk.overlap_tail)
+ yield pre_chunk
+
+ def will_fit(self, element: Element) -> bool:
+ """True when `element` can be added to this prechunk without violating its limits.
+
+ There are several limits:
+ - A `Table` element will never fit with any other element. It will only fit in an empty
+ pre-chunk.
+ - No element will fit in a pre-chunk that already contains a `Table` element.
+ - A text-element will not fit in a pre-chunk that already exceeds the soft-max
+ (aka. new_after_n_chars).
+ - A text-element will not fit when together with the elements already present it would
+ exceed the hard-max (aka. max_characters).
+ """
+ # -- an empty pre-chunk will accept any element (including an oversized-element) --
+ if len(self._elements) == 0:
+ return True
+ # -- a `Table` will not fit in a non-empty pre-chunk --
+ if isinstance(element, Table):
+ return False
+ # -- no element will fit in a pre-chunk that already contains a `Table` element --
+ if isinstance(self._elements[0], Table):
+ return False
+ # -- a pre-chunk that already exceeds the soft-max is considered "full" --
+ if self._text_length > self._opts.soft_max:
+ return False
+ # -- don't add an element if it would increase total size beyond the hard-max --
+ return not self._remaining_space < len(element.text)
+
+ @property
+ def _remaining_space(self) -> int:
+ """Maximum text-length of an element that can be added without exceeding maxlen."""
+ # -- include length of trailing separator that will go before next element text --
+ separators_len = self._separator_len * len(self._text_segments)
+ return self._opts.hard_max - self._text_len - separators_len
+
+ def _reset_state(self, overlap_prefix: str) -> None:
+ """Set working-state values back to "empty", ready to accumulate next pre-chunk."""
+ self._overlap_prefix = overlap_prefix
+ self._elements.clear()
+ self._text_segments = [overlap_prefix] if overlap_prefix else []
+ self._text_len = len(overlap_prefix)
+
+ @property
+ def _text_length(self) -> int:
+ """Length of the text in this pre-chunk.
+
+ This value represents the chunk-size that would result if this pre-chunk was flushed in its
+ current state. In particular, it does not include the length of a trailing separator (since
+ that would only appear if an additional element was added).
+
+ Not suitable for judging remaining space, use `._remaining_space` for that value.
+ """
+ # -- number of text separators present in joined text of elements. This includes only
+ # -- separators *between* text segments, not one at the end. Note there are zero separators
+ # -- for both 0 and 1 text-segments.
+ n = len(self._text_segments)
+ separator_count = n - 1 if n else 0
+ return self._text_len + (separator_count * self._separator_len)
+
+
# ================================================================================================
# PRE-CHUNK SUB-TYPES
# ================================================================================================
@@ -793,126 +791,129 @@ class TextPreChunk:
# ================================================================================================
-# PRE-CHUNKING ACCUMULATORS
-# ------------------------------------------------------------------------------------------------
-# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger
-# pre-chunk and combined-pre-chunk items central to unstructured chunking.
+# PRE-CHUNK SPLITTERS
# ================================================================================================
-class PreChunkBuilder:
- """An element accumulator suitable for incrementally forming a pre-chunk.
+class _TextSplitter:
+ """Provides a text-splitting function configured on construction.
- Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
- the next element in the element stream.
+ Text is split on the best-available separator, falling-back from the preferred separator
+ through a sequence of alternate separators.
- `.flush()` is used to build a PreChunk object from the accumulated elements. This method
- returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
- used like so:
+ - The separator is removed by splitting so only whitespace strings are suitable separators.
+ - A "blank-line" ("\n\n") is unlikely to occur in an element as it would have been used as an
+ element boundary during partitioning.
- yield from builder.flush()
+ This is a *callable* object. Constructing it essentially produces a function:
- If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
- clears the elements it contains so it is ready to build the next pre-chunk.
+ split = _TextSplitter(opts)
+ fragment, remainder = split(s)
+
+ This allows it to be configured with length-options etc. on construction and used throughout a
+ chunking operation on a given element-stream.
"""
- def __init__(self, opts: ChunkingOptions) -> None:
+ def __init__(self, opts: ChunkingOptions):
self._opts = opts
- self._separator_len = len(opts.text_separator)
- self._elements: list[Element] = []
- # -- overlap is only between pre-chunks so starts empty --
- self._overlap_prefix: str = ""
- # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
- self._text_segments: list[str] = []
- # -- combined length of text-segments, not including separators --
- self._text_len: int = 0
+ def __call__(self, s: str) -> tuple[str, str]:
+ """Return pair of strings split from `s` on the best match of configured patterns.
- def add_element(self, element: Element) -> None:
- """Add `element` to this section."""
- self._elements.append(element)
- if element.text:
- self._text_segments.append(element.text)
- self._text_len += len(element.text)
+ The first string is the split, the second is the remainder of the string. The split string
+ will never be longer than `maxlen`. The separators are tried in order until a match is
+ found. The last separator is "" which matches between any two characters so there will
+ always be a split.
- def flush(self) -> Iterator[PreChunk]:
- """Generate zero-or-one `PreChunk` object and clear the accumulator.
+ The separator is removed and does not appear in the split or remainder.
- Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
- boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
- stream.
+ An `s` that is no longer than the maximum length is returned unchanged with no remainder.
+ This allows this function to be called repeatedly with the remainder until it is consumed
+ and returns a remainder of "".
"""
- if not self._elements:
- return
+ maxlen = self._opts.hard_max
- pre_chunk = (
- TablePreChunk(self._elements[0], self._overlap_prefix, self._opts)
- if isinstance(self._elements[0], Table)
- # -- copy list, don't use original or it may change contents as builder proceeds --
- else TextPreChunk(list(self._elements), self._overlap_prefix, self._opts)
- )
- # -- clear builder before yield so we're not sensitive to the timing of how/when this
- # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
- self._reset_state(pre_chunk.overlap_tail)
- yield pre_chunk
+ if len(s) <= maxlen:
+ return s, ""
- def will_fit(self, element: Element) -> bool:
- """True when `element` can be added to this prechunk without violating its limits.
+ for p, sep_len in self._patterns:
+ # -- length of separator must be added to include that separator when it happens to be
+ # -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
+ fragment, remainder = self._split_from_maxlen(p, sep_len, s)
+ if (
+ # -- no available split with this separator --
+ not fragment
+ # -- split did not progress, consuming part of the string --
+ or len(remainder) >= len(s)
+ ):
+ continue
+ return fragment.rstrip(), remainder.lstrip()
- There are several limits:
- - A `Table` element will never fit with any other element. It will only fit in an empty
- pre-chunk.
- - No element will fit in a pre-chunk that already contains a `Table` element.
- - A text-element will not fit in a pre-chunk that already exceeds the soft-max
- (aka. new_after_n_chars).
- - A text-element will not fit when together with the elements already present it would
- exceed the hard-max (aka. max_characters).
+ # -- the terminal "" pattern is not actually executed via regex since its implementation is
+ # -- trivial and provides a hard back-stop here in this method. No separator is used between
+ # -- tail and remainder on arb-char split.
+ return s[:maxlen].rstrip(), s[maxlen - self._opts.overlap :].lstrip()
+
+ @lazyproperty
+ def _patterns(self) -> tuple[tuple[regex.Pattern[str], int], ...]:
+ """Sequence of (pattern, len) pairs to match against.
+
+ Patterns appear in order of preference, those following are "fall-back" patterns to be used
+ if no match of a prior pattern is found.
+
+ NOTE these regexes search *from the end of the string*, which is what the "(?r)" bit
+ specifies. This is much more efficient than starting at the beginning of the string which
+ could result in hundreds of matches before the desired one.
"""
- # -- an empty pre-chunk will accept any element (including an oversized-element) --
- if len(self._elements) == 0:
- return True
- # -- a `Table` will not fit in a non-empty pre-chunk --
- if isinstance(element, Table):
- return False
- # -- no element will fit in a pre-chunk that already contains a `Table` element --
- if isinstance(self._elements[0], Table):
- return False
- # -- a pre-chunk that already exceeds the soft-max is considered "full" --
- if self._text_length > self._opts.soft_max:
- return False
- # -- don't add an element if it would increase total size beyond the hard-max --
- return not self._remaining_space < len(element.text)
+ separators = self._opts.text_splitting_separators
+ return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
- @property
- def _remaining_space(self) -> int:
- """Maximum text-length of an element that can be added without exceeding maxlen."""
- # -- include length of trailing separator that will go before next element text --
- separators_len = self._separator_len * len(self._text_segments)
- return self._opts.hard_max - self._text_len - separators_len
+ def _split_from_maxlen(
+ self, pattern: regex.Pattern[str], sep_len: int, s: str
+ ) -> tuple[str, str]:
+ """Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
- def _reset_state(self, overlap_prefix: str) -> None:
- """Set working-state values back to "empty", ready to accumulate next pre-chunk."""
- self._overlap_prefix = overlap_prefix
- self._elements.clear()
- self._text_segments = [overlap_prefix] if overlap_prefix else []
- self._text_len = len(overlap_prefix)
+ Returns `"", s` if no suitable match was found. Also returns `"", s` if splitting on this
+ separator produces a split shorter than the required overlap (which would produce an
+ infinite loop).
- @property
- def _text_length(self) -> int:
- """Length of the text in this pre-chunk.
+ `split` will never be longer than `maxlen` and there is no longer split available using
+ `pattern`.
- This value represents the chunk-size that would result if this pre-chunk was flushed in its
- current state. In particular, it does not include the length of a trailing separator (since
- that would only appear if an additional element was added).
-
- Not suitable for judging remaining space, use `.remaining_space` for that value.
+ The separator is removed and does not appear in either the split or remainder.
"""
- # -- number of text separators present in joined text of elements. This includes only
- # -- separators *between* text segments, not one at the end. Note there are zero separators
- # -- for both 0 and 1 text-segments.
- n = len(self._text_segments)
- separator_count = n - 1 if n else 0
- return self._text_len + (separator_count * self._separator_len)
+ maxlen, overlap = self._opts.hard_max, self._opts.overlap
+
+ # -- A split not longer than overlap will not progress (infinite loop). On the right side,
+ # -- need to extend search range to include a separator located exactly at maxlen.
+ match = pattern.search(s, pos=overlap + 1, endpos=maxlen + sep_len)
+ if match is None:
+ return "", s
+
+ # -- characterize match location
+ match_start, match_end = match.span()
+ # -- matched separator is replaced by single-space in overlap string --
+ separator = " "
+
+ # -- in multi-space situation, fragment may have trailing whitespace because match is from
+ # -- right to left
+ fragment = s[:match_start].rstrip()
+ # -- remainder can have leading space when match is on "\n" followed by spaces --
+ raw_remainder = s[match_end:].lstrip()
+
+ if overlap <= len(separator):
+ return fragment, raw_remainder
+
+ # -- compute overlap --
+ tail_len = overlap - len(separator)
+ tail = fragment[-tail_len:].lstrip()
+ overlapped_remainder = tail + separator + raw_remainder
+ return fragment, overlapped_remainder
+
+
+# ================================================================================================
+# PRE-CHUNK COMBINER
+# ================================================================================================
class PreChunkCombiner:
@@ -966,7 +967,7 @@ class TextPreChunkAccumulator:
def __init__(self, opts: ChunkingOptions) -> None:
self._opts = opts
- self._pre_chunk: Optional[TextPreChunk] = None
+ self._pre_chunk: TextPreChunk | None = None
def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
"""Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
diff --git a/unstructured/common/__init__.py b/unstructured/common/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
new file mode 100644
index 000000000..e6f4a30ba
--- /dev/null
+++ b/unstructured/common/html_table.py
@@ -0,0 +1,39 @@
+"""Provides operations related to the HTML table stored in `.metadata.text_as_html`.
+
+Used during partitioning as well as chunking.
+"""
+
+from __future__ import annotations
+
+import html
+from typing import Iterator, Sequence
+
+
+def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
+ """Form an HTML table from "rows" and "columns" of `matrix`.
+
+ Character overhead is minimized:
+ - No whitespace padding is added for human readability
+ - No newlines ("\n") are added
+ - No `<thead>`, `<tfoot>`, or `<tbody>` elements are used; we can't tell where those might be
+ semantically appropriate anyway so at best they would consume unnecessary space and at worst
+ would be misleading.
+ """
+
+ def iter_trs(rows_of_cell_strs: Sequence[Sequence[str]]) -> Iterator[str]:
+ for row_cell_strs in rows_of_cell_strs:
+ # -- suppress emission of rows with no cells --
+ if not row_cell_strs:
+ continue
+ yield f"<tr>{''.join(iter_tds(row_cell_strs))}</tr>"
+
+ def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
+ for s in row_cell_strs:
+ # -- take care of things like '<' and '>' in the text --
+ s = html.escape(s)
+ # -- substitute <br/> elements for line-feeds in the text --
+ s = "<br/>".join(s.split("\n"))
+ # -- strip leading and trailing whitespace, wrap it up and go --
+ yield f"<td>{s.strip()}</td>"
+
+ return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py
index a79803969..c54fbb895 100644
--- a/unstructured/partition/html/parser.py
+++ b/unstructured/partition/html/parser.py
@@ -83,6 +83,7 @@ from lxml import etree
from typing_extensions import TypeAlias
from unstructured.cleaners.core import clean_bullets
+from unstructured.common.html_table import htmlify_matrix_of_cell_texts
from unstructured.documents.elements import (
Address,
Element,
@@ -101,7 +102,7 @@ from unstructured.partition.text_type import (
is_possible_title,
is_us_city_state_zip,
)
-from unstructured.utils import htmlify_matrix_of_cell_texts, lazyproperty
+from unstructured.utils import lazyproperty
# ------------------------------------------------------------------------------------------------
# DOMAIN MODEL
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index af52d12df..27b977321 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -4,12 +4,12 @@ from __future__ import annotations
import io
from tempfile import SpooledTemporaryFile
-from typing import IO, Any, Iterator, Optional, cast
+from typing import IO, Any, Iterator, Optional
import networkx as nx
import numpy as np
import pandas as pd
-from lxml.html.soupparser import fromstring as soupparser_fromstring # pyright: ignore
+from lxml.html.soupparser import fromstring as soupparser_fromstring
from typing_extensions import Self, TypeAlias
from unstructured.chunking import add_chunking_strategy
@@ -110,19 +110,12 @@ def partition_xlsx(
):
if not opts.find_subtable:
html_text = (
- sheet.to_html( # pyright: ignore[reportUnknownMemberType]
- index=False, header=opts.include_header, na_rep=""
- )
+ sheet.to_html(index=False, header=opts.include_header, na_rep="")
if opts.infer_table_structure
else None
)
# XXX: `html_text` can be `None`. What happens on this call in that case?
- text = cast(
- str,
- soupparser_fromstring( # pyright: ignore[reportUnknownMemberType]
- html_text
- ).text_content(),
- )
+ text = soupparser_fromstring(html_text).text_content()
if opts.include_metadata:
metadata = ElementMetadata(
@@ -151,15 +144,10 @@ def partition_xlsx(
# -- emit core-table (if it exists) as a `Table` element --
core_table = subtable_parser.core_table
if core_table is not None:
- html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType]
+ html_text = core_table.to_html(
index=False, header=opts.include_header, na_rep=""
)
- text = cast(
- str,
- soupparser_fromstring( # pyright: ignore[reportUnknownMemberType]
- html_text
- ).text_content(),
- )
+ text = soupparser_fromstring(html_text).text_content()
element = Table(text=text)
element.metadata = _get_metadata(sheet_name, page_number, opts)
element.metadata.text_as_html = (
@@ -285,17 +273,13 @@ class _XlsxPartitionerOptions:
def sheets(self) -> dict[str, pd.DataFrame]:
"""The spreadsheet worksheets, each as a data-frame mapped by sheet-name."""
if file_path := self._file_path:
- return pd.read_excel( # pyright: ignore[reportUnknownMemberType]
- file_path, sheet_name=None, header=self.header_row_idx
- )
+ return pd.read_excel(file_path, sheet_name=None, header=self.header_row_idx)
if f := self._file:
if isinstance(f, SpooledTemporaryFile):
f.seek(0)
f = io.BytesIO(f.read())
- return pd.read_excel( # pyright: ignore[reportUnknownMemberType]
- f, sheet_name=None, header=self.header_row_idx
- )
+ return pd.read_excel(f, sheet_name=None, header=self.header_row_idx)
raise ValueError("Either 'filename' or 'file' argument must be specified.")
@@ -383,7 +367,7 @@ class _ConnectedComponents:
max_row, max_col = self._worksheet_df.shape
node_array = np.indices((max_row, max_col)).T
empty_cells = self._worksheet_df.isna().T
- nodes_to_remove = [tuple(pair) for pair in node_array[empty_cells]]
+ nodes_to_remove = [tuple(pair) for pair in node_array[empty_cells]] # pyright: ignore
graph: nx.Graph = nx.grid_2d_graph(max_row, max_col) # pyright: ignore
graph.remove_nodes_from(nodes_to_remove) # pyright: ignore
@@ -499,7 +483,7 @@ class _SubtableParser:
"""Index of each single-cell row in subtable, in top-down order."""
def iter_single_cell_row_idxs() -> Iterator[int]:
- for idx, (_, row) in enumerate(self._subtable.iterrows()): # pyright: ignore
+ for idx, (_, row) in enumerate(self._subtable.iterrows()):
if row.count() != 1:
continue
yield idx
diff --git a/unstructured/utils.py b/unstructured/utils.py
index 3152d02e7..03632e37a 100644
--- a/unstructured/utils.py
+++ b/unstructured/utils.py
@@ -2,7 +2,6 @@ from __future__ import annotations
import asyncio
import functools
-import html
import importlib
import inspect
import json
@@ -23,7 +22,6 @@ from typing import (
Iterator,
List,
Optional,
- Sequence,
Tuple,
TypeVar,
cast,
@@ -62,36 +60,6 @@ def get_call_args_applying_defaults(
return call_args
-def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
- """Form an HTML table from "rows" and "columns" of `matrix`.
-
- Character overhead is minimized:
- - No whitespace padding is added for human readability
- - No newlines ("\n") are added
- - No `<thead>`, `<tfoot>`, or `<tbody>` elements are used; we can't tell where those might be
- semantically appropriate anyway so at best they would consume unnecessary space and at worst
- would be misleading.
- """
-
- def iter_trs(rows_of_cell_strs: Sequence[Sequence[str]]) -> Iterator[str]:
- for row_cell_strs in rows_of_cell_strs:
- # -- suppress emission of rows with no cells --
- if not row_cell_strs:
- continue
- yield f"<tr>{''.join(iter_tds(row_cell_strs))}</tr>"
-
- def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
- for s in row_cell_strs:
- # -- take care of things like '<' and '>' in the text --
- s = html.escape(s)
- # -- substitute <br/> elements for line-feeds in the text --
- s = "<br/>".join(s.split("\n"))
- # -- strip leading and trailing whitespace, wrap it up and go --
- yield f"<td>{s.strip()}</td>"
-
- return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
-
-
def is_temp_file_path(file_path: str) -> bool:
"""True when file_path is in the Python-defined tempdir.