feat(chunk): split tables on even row boundaries (#3504)

**Summary**
Use more sophisticated algorithm for splitting oversized `Table`
elements into `TableChunk` elements during chunking to ensure element
text and HTML are "synchronized" and HTML is always parseable.

**Additional Context**
Table splitting now has the following characteristics:
- `TableChunk.metadata.text_as_html` is always a parseable HTML
`<table>` subtree.
- `TableChunk.text` is always the text in the HTML version of the table
fragment in `.metadata.text_as_html`. Text and HTML are "synchronized".
- The table is divided at a whole-row boundary whenever possible.
- A row is broken at an even-cell boundary when a single row is larger
than the chunking window.
- A cell is broken at an even-word boundary when a single cell is larger
than the chunking window.
- `.text_as_html` is "minified", removing all extraneous whitespace and
unneeded elements or attributes. This maximizes the semantic "density"
of each chunk.
This commit is contained in:
Steve Canny 2024-08-19 11:56:53 -07:00 committed by GitHub
parent 99f72d65ba
commit a861ed8fe7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 1003 additions and 140 deletions

View File

@ -1,4 +1,4 @@
## 0.15.6-dev0
## 0.15.6-dev1
### Enhancements
@ -7,6 +7,7 @@
### Fixes
* **Update CI for `ingest-test-fixture-update-pr` to resolve NLTK model download errors.**
* **Synchronized text and html on `TableChunk` splits.** When a `Table` element is divided during chunking to fit the chunking window, `TableChunk.text` corresponds exactly with the table text in `TableChunk.metadata.text_as_html`, `.text_as_html` is always parseable HTML, and the table is split on even row boundaries whenever possible.
## 0.15.5

View File

@ -7,6 +7,7 @@ from __future__ import annotations
from typing import Any, Sequence
import pytest
from lxml.html import fragment_fromstring
from unstructured.chunking.base import (
ChunkingOptions,
@ -16,10 +17,14 @@ from unstructured.chunking.base import (
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
_CellAccumulator,
_RowAccumulator,
_TableSplitter,
_TextSplitter,
is_on_next_page,
is_title,
)
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
from unstructured.documents.elements import (
CheckBox,
CompositeElement,
@ -341,16 +346,21 @@ class DescribePreChunkBuilder:
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
pre_chunk = list(builder.flush())[0]
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit."
builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat."))
pre_chunk = list(builder.flush())[0]
assert pre_chunk._text == "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
assert isinstance(pre_chunk, TablePreChunk)
assert pre_chunk._text_with_overlap == (
"dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
)
builder.add_element(Text("Donec semper facilisis metus finibus."))
pre_chunk = list(builder.flush())[0]
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
@ -386,7 +396,7 @@ class DescribeTablePreChunk:
"</tbody>\n"
"</table>"
)
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="ctus porta volutpat.",
@ -401,18 +411,27 @@ class DescribeTablePreChunk:
"ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing"
)
assert chunk.metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
"</tbody>\n"
"<table>"
"<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
"<tr><td>Lorem ipsum</td><td>adipiscing</td></tr>"
"</table>"
)
with pytest.raises(StopIteration):
next(chunk_iter)
def but_not_when_the_table_is_empty_or_contains_only_whitespace(self):
html_table = "<table><tr><td/><td> \t \n </td></tr></table>"
pre_chunk = TablePreChunk(
Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="volutpat.",
opts=ChunkingOptions(max_characters=175),
)
chunk_iter = pre_chunk.iter_chunks()
with pytest.raises(StopIteration):
next(chunk_iter)
def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
opts = ChunkingOptions(include_orig_elements=True)
@ -437,21 +456,18 @@ class DescribeTablePreChunk:
assert chunk.metadata.orig_elements is None
def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
# fixed-overhead = 8+8+9+8+9+8 = 50
# per-row overhead = 27
html_table = (
"<table>\n" # 8
"<thead>\n" # 8
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n" # 9
"<tbody>\n" # 8
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
"</tbody>\n" # 9
"</table>" # 8
)
html_table = """\
<table>
<thead>
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
</thead>
<tbody>
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
<tr><td>Consectetur </td><td>adipiscing elit</td></tr>
<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>
</tbody>
</table>
"""
text_table = (
"Header Col 1 Header Col 2\n"
"Lorem ipsum dolor sit amet\n"
@ -469,48 +485,33 @@ class DescribeTablePreChunk:
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == (
"Header Col 1 Header Col 2\n"
"Lorem ipsum dolor sit amet\n"
"Consectetur adipiscing elit"
)
assert chunk.text == "Header Col 1 Header Col 2"
assert chunk.metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lo"
"<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr></table>"
)
assert not chunk.metadata.is_continuation
assert chunk.metadata.is_continuation is None
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == (
"Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
)
assert chunk.text == "Lorem ipsum A Link example"
assert chunk.metadata.text_as_html == (
"rem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
)
assert chunk.metadata.is_continuation
# -- note that text runs out but HTML continues because it's significantly longer. So two
# -- of these chunks have HTML but no text.
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == ""
assert chunk.metadata.text_as_html == (
"/tr>\n"
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"<tr><td>Vivamus quis </td><td>"
"<table><tr><td>Lorem ipsum</td><td>A Link example</td></tr></table>"
)
assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == ""
assert chunk.text == "Consectetur adipiscing elit"
assert chunk.metadata.text_as_html == (
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
"<table><tr><td>Consectetur</td><td>adipiscing elit</td></tr></table>"
)
assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == "Nunc aliquam id enim nec molestie"
assert chunk.metadata.text_as_html == (
"<table><tr><td>Nunc aliquam</td><td>id enim nec molestie</td></tr></table>"
)
assert chunk.metadata.is_continuation
# --
@ -545,8 +546,8 @@ class DescribeTablePreChunk:
[
# -- normally it splits exactly on overlap size |------- 20 -------|
("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."),
# -- but it strips leading and trailing whitespace when the tail includes it --
("In rhoncus ipsum sed lectus porta volutpat. ", "porta volutpat."),
# -- but it strips leading whitespace when the tail includes it --
("In rhoncus ipsum sed lectus porta volutpat.", "porta volutpat."),
],
)
def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap(
@ -578,7 +579,7 @@ class DescribeTablePreChunk:
pre_chunk = TablePreChunk(
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
)
assert pre_chunk._text == expected_value
assert pre_chunk._text_with_overlap == expected_value
def it_computes_metadata_for_each_chunk_to_help(self):
table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
@ -659,6 +660,10 @@ class DescribeTextPreChunk:
assert (pre_chunk == other_pre_chunk) is expected_value
def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self):
pre_chunk = TextPreChunk([], overlap_prefix="", opts=ChunkingOptions())
assert pre_chunk != 42
@pytest.mark.parametrize(
("max_characters", "combine_text_under_n_chars", "expected_value"),
[
@ -833,6 +838,19 @@ class DescribeTextPreChunk:
assert [c.metadata.is_continuation for c in chunk_iter] == [None, True, True]
def but_it_generates_no_chunks_when_the_pre_chunk_contains_no_text(self):
metadata = ElementMetadata()
pre_chunk = TextPreChunk(
[PageBreak("", metadata=metadata)],
overlap_prefix="",
opts=ChunkingOptions(),
)
chunk_iter = pre_chunk.iter_chunks()
with pytest.raises(StopIteration):
next(chunk_iter)
@pytest.mark.parametrize(
("text", "expected_value"),
[
@ -1098,6 +1116,168 @@ class DescribeTextPreChunk:
# ================================================================================================
class Describe_TableSplitter:
"""Unit-test suite for `unstructured.chunking.base._TableSplitter`."""
def it_splits_an_HTML_table_on_even_rows_when_possible(self):
opts = ChunkingOptions(max_characters=(150))
html_table = HtmlTable.from_html_text(
"""
<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley
Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>
"""
)
assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
(
"Stanley Cups Team Location Stanley Cups",
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"</table>",
),
(
"Blues STL 1 Flyers PHI 2",
"<table>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"</table>",
),
(
"Maple Leafs TOR 13",
"<table>" "<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>" "</table>",
),
]
def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self):
opts = ChunkingOptions(max_characters=(100))
html_table = HtmlTable.from_html_text(
"""
<html><body><table>
<tr>
<td>Lorem ipsum dolor sit amet.</td>
<td> Consectetur adipiscing elit. </td>
<td>
Laboris nisi ut
aliquip ex ea commodo.
</td>
</tr>
<tr>
<td>Duis</td>
<td>Dolor</td>
</tr>
<tr>
<td>Duis</td>
<td>Cillum</td>
</tr>
</table></body></html>
"""
)
assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
(
"Lorem ipsum dolor sit amet. Consectetur adipiscing elit.",
"<table><tr>"
"<td>Lorem ipsum dolor sit amet.</td>"
"<td>Consectetur adipiscing elit.</td>"
"</tr></table>",
),
(
"Laboris nisi ut aliquip ex ea commodo.",
"<table><tr><td>Laboris nisi ut aliquip ex ea commodo.</td></tr></table>",
),
(
"Duis Dolor Duis Cillum",
"<table>"
"<tr><td>Duis</td><td>Dolor</td></tr>"
"<tr><td>Duis</td><td>Cillum</td></tr>"
"</table>",
),
]
def and_it_splits_an_oversized_cell_on_an_even_word_boundary(self):
opts = ChunkingOptions(max_characters=(100))
html_table = HtmlTable.from_html_text(
"""
<table>
<thead>
<tr>
<td>
Lorem ipsum dolor sit amet,
consectetur adipiscing elit.
Sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua.
</td>
<td> Ut enim ad minim veniam. </td>
<td> Quis nostrud exercitation ullamco. </td>
</tr>
</thead>
<tbody>
<tr><td>Duis aute irure dolor</td></tr>
<tr><td>In reprehenderit voluptate.</td></tr>
</tbody>
</table>
"""
)
assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do",
"<table>"
"<tr><td>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do</td></tr>"
"</table>",
),
(
"eiusmod tempor incididunt ut labore et dolore magna aliqua.",
"<table>"
"<tr><td>eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr>"
"</table>",
),
(
"Ut enim ad minim veniam. Quis nostrud exercitation ullamco.",
"<table><tr>"
"<td>Ut enim ad minim veniam.</td>"
"<td>Quis nostrud exercitation ullamco.</td>"
"</tr></table>",
),
(
"Duis aute irure dolor In reprehenderit voluptate.",
"<table>"
"<tr><td>Duis aute irure dolor</td></tr>"
"<tr><td>In reprehenderit voluptate.</td></tr>"
"</table>",
),
]
class Describe_TextSplitter:
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
@ -1199,6 +1379,192 @@ class Describe_TextSplitter:
assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
class Describe_CellAccumulator:
"""Unit-test suite for `unstructured.chunking.base._CellAccumulator`."""
def it_is_empty_on_construction(self):
accum = _CellAccumulator(maxlen=100)
assert accum._cells == []
def it_accumulates_elements_added_to_it(self):
td = fragment_fromstring("<td>foobar</td>")
cell = HtmlCell(td)
accum = _CellAccumulator(maxlen=100)
accum.add_cell(cell)
assert accum._cells == [cell]
@pytest.mark.parametrize(
("cell_html", "expected_value"),
[
("<td/>", True),
("<td>Lorem Ipsum.</td>", True),
("<td>Lorem Ipsum dolor sit.</td>", True),
("<td>Lorem Ipsum dolor sit amet.</td>", False),
],
)
def it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty(
self, cell_html: str, expected_value: bool
):
"""Cell text must be 22-chars or shorter to fit in 55-char window.
`<table><tr><td>...</td></tr></table>` overhead is 33 characters.
"""
accum = _CellAccumulator(maxlen=55)
cell = HtmlCell(fragment_fromstring(cell_html))
assert accum.will_fit(cell) is expected_value
@pytest.mark.parametrize(
("cell_html", "expected_value"),
[
("<td/>", True), # -- 0 --
("<td>Lorem Ipsum.</td>", True), # -- 12 --
("<td>Lorem Ipsum amet.</td>", True), # -- 17 --
("<td>Lorem Ipsum dolor.</td>", False), # -- 18 --
("<td>Lorem Ipsum dolor sit amet.</td>", False), # -- 27 --
],
)
def and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_minus_9_when_not_empty(
self, cell_html: str, expected_value: bool
):
"""Cell text must be 9-chars shorter than remaining space to fit with accumulated cells.
`<td>...</td>` overhead is 9 characters.
"""
accum = _CellAccumulator(maxlen=85)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcdefghijklmnopqrstuvwxyz</td>")))
# -- remaining space is 85 - 26 - 33 = 26; max new cell text len is 17 --
cell = HtmlCell(fragment_fromstring(cell_html))
assert accum.will_fit(cell) is expected_value
def it_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushed(self):
accum = _CellAccumulator(maxlen=100)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcde fghij klmno</td>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno"
assert html == "<table><tr><td>abcde fghij klmno</td></tr></table>"
assert accum._cells == []
def and_the_HTML_contains_as_many_cells_as_were_accumulated(self):
accum = _CellAccumulator(maxlen=100)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcde fghij klmno</td>")))
accum.add_cell(HtmlCell(fragment_fromstring("<td>pqrst uvwxy z</td>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno pqrst uvwxy z"
assert html == "<table><tr><td>abcde fghij klmno</td><td>pqrst uvwxy z</td></tr></table>"
assert accum._cells == []
def but_it_does_not_generate_a_TextAndHtml_pair_when_empty(self):
accum = _CellAccumulator(maxlen=100)
with pytest.raises(StopIteration):
next(accum.flush())
class Describe_RowAccumulator:
"""Unit-test suite for `unstructured.chunking.base._RowAccumulator`."""
def it_is_empty_on_construction(self):
accum = _RowAccumulator(maxlen=100)
assert accum._rows == []
def it_accumulates_rows_added_to_it(self):
accum = _RowAccumulator(maxlen=100)
row = HtmlRow(fragment_fromstring("<tr><td>foo</td><td>bar</td></tr>"))
accum.add_row(row)
assert accum._rows == [row]
@pytest.mark.parametrize(
("row_html", "expected_value"),
[
("<tr/>", True), # -- 5 --
("<tr><td/></tr>", True), # -- 14 --
("<tr><td>Lorem Ipsum.</td></tr>", True), # -- 30 --
("<tr><td>Lorem Ipsum dolor sit.</td></tr>", True), # -- 40 --
("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 40 --
("<tr><td>Lorem Ipsum dolor sit amet.</td></tr>", False), # -- 45 --
("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 48 --
],
)
def it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty(
self, row_html: str, expected_value: bool
):
"""Row HTML must be 40-chars or shorter to fit in 55-char chunking window.
`<table>...</table>` overhead is 15 characters.
"""
accum = _RowAccumulator(maxlen=55)
row = HtmlRow(fragment_fromstring(row_html))
assert accum.will_fit(row) is expected_value
@pytest.mark.parametrize(
("row_html", "expected_value"),
[
("<tr/>", True), # -- 5 --
("<tr><td/></tr>", True), # -- 14 --
("<tr><td>Lorem Ipsum dolor sit</td></tr>", True), # -- 39 --
("<tr><td>Lorem Ipsum dolor sit.</td></tr>", True), # -- 40 --
("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 40 --
("<tr><td>Lorem</td><td>Sit amet.</td></tr>", False), # -- 41 --
("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 48 --
],
)
def and_it_will_fit_a_row_with_HTML_shorter_than_remaining_space_when_not_empty(
self, row_html: str, expected_value: bool
):
"""There is no overhead beyond row HTML for additional rows."""
accum = _RowAccumulator(maxlen=99)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcdefghijklmnopqrstuvwxyz</td></tr>")))
# -- remaining space is 99 - 15 - 44 = 40; max new row HTML len is 40 --
row = HtmlRow(fragment_fromstring(row_html))
assert accum.will_fit(row) is expected_value
def it_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushed(self):
accum = _RowAccumulator(maxlen=100)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcde fghij klmno</td></tr>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno"
assert html == "<table><tr><td>abcde fghij klmno</td></tr></table>"
assert accum._rows == []
def and_the_HTML_contains_as_many_rows_as_were_accumulated(self):
accum = _RowAccumulator(maxlen=100)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcde fghij klmno</td></tr>")))
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>pqrst uvwxy z</td></tr>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno pqrst uvwxy z"
assert html == (
"<table>"
"<tr><td>abcde fghij klmno</td></tr>"
"<tr><td>pqrst uvwxy z</td></tr>"
"</table>"
)
assert accum._rows == []
def but_it_does_not_generate_a_TextAndHtml_pair_when_empty(self):
accum = _RowAccumulator(maxlen=100)
with pytest.raises(StopIteration):
next(accum.flush())
# ================================================================================================
# PRE-CHUNK COMBINER
# ================================================================================================

View File

@ -1,8 +1,18 @@
# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.common.html_table` module."""
from __future__ import annotations
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
import pytest
from lxml.html import fragment_fromstring
from unstructured.common.html_table import (
HtmlCell,
HtmlRow,
HtmlTable,
htmlify_matrix_of_cell_texts,
)
class Describe_htmlify_matrix_of_cell_texts:
@ -11,8 +21,8 @@ class Describe_htmlify_matrix_of_cell_texts:
def test_htmlify_matrix_handles_empty_cells(self):
assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
"<table>"
"<tr><td>cell1</td><td></td><td>cell3</td></tr>"
"<tr><td></td><td>cell5</td><td></td></tr>"
"<tr><td>cell1</td><td/><td>cell3</td></tr>"
"<tr><td/><td>cell5</td><td/></tr>"
"</table>"
)
@ -31,3 +41,163 @@ class Describe_htmlify_matrix_of_cell_texts:
def test_htmlify_matrix_handles_empty_matrix(self):
assert htmlify_matrix_of_cell_texts([]) == ""
class DescribeHtmlTable:
"""Unit-test suite for `unstructured.common.html_table.HtmlTable`."""
def it_can_construct_from_html_text(self):
html_table = HtmlTable.from_html_text("<table><tr><td>foobar</td></tr></table>")
assert isinstance(html_table, HtmlTable)
assert html_table._table.tag == "table"
@pytest.mark.parametrize(
"html_text",
[
"<table><tr><td>foobar</td></tr></table>",
"<body><table><tr><td>foobar</td></tr></table></body>",
"<html><body><table><tr><td>foobar</td></tr></table></body></html>",
],
)
def it_can_find_a_table_wrapped_in_an_html_or_body_element(self, html_text: str):
html_table = HtmlTable.from_html_text(html_text)
assert isinstance(html_table, HtmlTable)
assert html_table._table.tag == "table"
def but_it_raises_when_no_table_element_is_present_in_the_html(self):
with pytest.raises(ValueError, match="`html_text` contains no `<table>` element"):
HtmlTable.from_html_text("<html><body><tr><td>foobar</td></tr></body></html>")
def it_removes_any_attributes_present_on_the_table_element(self):
html_table = HtmlTable.from_html_text(
'<table border="1" class="foobar"><tr><td>foobar</td></tr></table>',
)
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
@pytest.mark.parametrize(
"html_text",
[
"<table><thead><tr><td>foobar</td></tr></thead></table>",
"<table><thead><tr><td>foobar</td></tr></thead><tbody></tbody></table>",
"<table><tbody><tr><td>foobar</td></tr></tbody><tfoot></tfoot></table>",
],
)
def it_removes_any_thead_tbody_or_tfoot_elements_present_within_the_table_element(
self, html_text: str
):
html_table = HtmlTable.from_html_text(html_text)
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
def it_changes_any_th_elements_to_td_elements_for_cell_element_uniformity(self):
html_table = HtmlTable.from_html_text(
"<table>"
" <tr><th>a</th><th/><th>b</th></tr>"
" <tr><td/><td>c</td><td/></tr>"
"</table>"
)
assert html_table.html == (
"<table><tr><td>a</td><td/><td>b</td></tr><tr><td/><td>c</td><td/></tr></table>"
)
def it_removes_any_extra_whitespace_between_elements_and_normalizes_whitespace_in_text(self):
html_table = HtmlTable.from_html_text(
"\n <table>\n <tr>\n <td>\tabc def\nghi </td>\n </tr>\n</table>\n ",
)
assert html_table.html == "<table><tr><td>abc def ghi</td></tr></table>"
def it_can_serialize_the_table_element_to_str_html_text(self):
table = fragment_fromstring("<table><tr><td>foobar</td></tr></table>")
html_table = HtmlTable(table)
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
def it_can_iterate_the_rows_in_the_table(self):
html_table = HtmlTable.from_html_text(
"<table>"
" <tr><td>abc</td><td>def</td><td>ghi</td></tr>"
" <tr><td>jkl</td><td>mno</td><td>pqr</td></tr>"
" <tr><td>stu</td><td>vwx</td><td>yz</td></tr>"
"</table>"
)
row_iter = html_table.iter_rows()
row = next(row_iter)
assert isinstance(row, HtmlRow)
assert row.html == "<tr><td>abc</td><td>def</td><td>ghi</td></tr>"
# --
row = next(row_iter)
assert isinstance(row, HtmlRow)
assert row.html == "<tr><td>jkl</td><td>mno</td><td>pqr</td></tr>"
# --
row = next(row_iter)
assert isinstance(row, HtmlRow)
assert row.html == "<tr><td>stu</td><td>vwx</td><td>yz</td></tr>"
# --
with pytest.raises(StopIteration):
next(row_iter)
def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
html_table = HtmlTable.from_html_text(
"<table>"
" <tr><th> a\n b c </th><th/><th>def</th></tr>"
" <tr><td>gh \ti</td><td/><td>\n jk l </td></tr>"
" <tr><td/><td> m n op\n</td><td/></tr>"
"</table>"
)
assert html_table.text == "a b c def gh i jk l m n op"
class DescribeHtmlRow:
"""Unit-test suite for `unstructured.common.html_table.HtmlRow`."""
def it_can_serialize_the_row_to_html(self):
assert HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>")).html == (
"<tr><td>a</td><td>b</td><td/></tr>"
)
def it_can_iterate_the_cells_in_the_row(self):
row = HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>"))
cell_iter = row.iter_cells()
cell = next(cell_iter)
assert isinstance(cell, HtmlCell)
assert cell.html == "<td>a</td>"
# --
cell = next(cell_iter)
assert isinstance(cell, HtmlCell)
assert cell.html == "<td>b</td>"
# --
cell = next(cell_iter)
assert isinstance(cell, HtmlCell)
assert cell.html == "<td/>"
# --
with pytest.raises(StopIteration):
next(cell_iter)
def it_can_iterate_the_texts_of_the_cells_in_the_row(self):
row = HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>"))
text_iter = row.iter_cell_texts()
assert next(text_iter) == "a"
assert next(text_iter) == "b"
with pytest.raises(StopIteration):
next(text_iter)
class DescribeHtmlCell:
"""Unit-test suite for `unstructured.common.html_table.HtmlCell`."""
def it_can_serialize_the_cell_to_html(self):
assert HtmlCell(fragment_fromstring("<td>a b c</td>")).html == "<td>a b c</td>"
@pytest.mark.parametrize(
("cell_html", "expected_value"),
[("<td> Lorem ipsum </td>", "Lorem ipsum"), ("<td/>", "")],
)
def it_knows_the_text_in_the_cell(self, cell_html: str, expected_value: str):
assert HtmlCell(fragment_fromstring(cell_html)).text == expected_value

View File

@ -504,7 +504,7 @@ def test_partition_html_accommodates_tds_with_child_elements():
)
assert element.metadata.text_as_html == (
"<table>"
"<tr><td></td><td></td></tr>"
"<tr><td/><td/></tr>"
"<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
" EXCHANGE ACT OF 1934</td></tr>"
"</table>"

View File

@ -110,7 +110,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
# to give it that time process the writes. Will timeout after checking for a minute.
num_of_vectors_remote=0
attempt=1
sleep_amount=8
sleep_amount=30
while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do
echo "attempt $attempt: sleeping $sleep_amount seconds to let index finish catching up after writes"
sleep $sleep_amount

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -16,7 +16,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Driver</td><td></td></tr><tr><td>Approver</td><td></td></tr><tr><td>Contributors</td><td></td></tr><tr><td>Informed</td><td></td></tr><tr><td>Objective</td><td></td></tr><tr><td>Due date</td><td></td></tr><tr><td>Key outcomes</td><td></td></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
"text_as_html": "<table><tr><td>Driver</td><td/></tr><tr><td>Approver</td><td/></tr><tr><td>Contributors</td><td/></tr><tr><td>Informed</td><td/></tr><tr><td>Objective</td><td/></tr><tr><td>Due date</td><td/></tr><tr><td>Key outcomes</td><td/></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
},
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
"type": "Table"
@ -80,7 +80,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Must have:</td><td></td></tr><tr><td>Nice to have:</td><td></td></tr><tr><td>Not in scope:</td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Must have:</td><td/></tr><tr><td>Nice to have:</td><td/></tr><tr><td>Not in scope:</td><td/></tr></table>"
},
"text": "Must have: Nice to have: Not in scope:",
"type": "Table"
@ -312,7 +312,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr></table>"
},
"text": "Milestone Owner Deadline Status",
"type": "Table"

View File

@ -100,7 +100,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr></table>"
},
"text": "Time Item Presenter Notes",
"type": "Table"

View File

@ -118,7 +118,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Notes</td><td></td></tr><tr><td>Important Links</td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Notes</td><td/></tr><tr><td>Important Links</td><td/></tr></table>"
},
"text": "Notes Important Links",
"type": "Table"

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -1,3 +0,0 @@
metric average sample_sd population_sd count
cct-accuracy 0.811 0.239 0.232 17
cct-%missing 0.024 0.032 0.031 17
1 metric average sample_sd population_sd count
2 cct-accuracy 0.811 0.239 0.232 17
3 cct-%missing 0.024 0.032 0.031 17

View File

@ -1,18 +0,0 @@
filename doctype connector cct-accuracy cct-%missing
fake-text.txt txt Sharepoint 1.0 0.0
ideas-page.html html Sharepoint 0.93 0.033
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
IRS-form-1987.pdf pdf azure 0.794 0.135
spring-weather.html html azure 0.0 0.018
example-10k.html html local 0.754 0.027
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.93 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.95 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.84 0.044
page-with-formula.pdf pdf s3 0.971 0.021
recalibrating-risk-report.pdf pdf s3 0.968 0.008
1 filename doctype connector cct-accuracy cct-%missing
2 fake-text.txt txt Sharepoint 1.0 0.0
3 ideas-page.html html Sharepoint 0.93 0.033
4 stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
5 Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
6 IRS-form-1987.pdf pdf azure 0.794 0.135
7 spring-weather.html html azure 0.0 0.018
8 example-10k.html html local 0.754 0.027
9 fake-html-cp1252.html html local 0.659 0.0
10 ideas-page.html html local 0.93 0.033
11 UDHR_first_article_all.txt txt local-single-file 0.995 0.0
12 handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029
13 fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
14 layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
15 layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.95 0.029
16 2023-Jan-economic-outlook.pdf pdf s3 0.84 0.044
17 page-with-formula.pdf pdf s3 0.971 0.021
18 recalibrating-risk-report.pdf pdf s3 0.968 0.008

View File

@ -1 +1 @@
__version__ = "0.15.6-dev0" # pragma: no cover
__version__ = "0.15.6-dev1" # pragma: no cover

View File

@ -9,6 +9,7 @@ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
import regex
from typing_extensions import Self, TypeAlias
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
from unstructured.documents.elements import (
CompositeElement,
ConsolidationStrategy,
@ -46,6 +47,8 @@ BoundaryPredicate: TypeAlias = Callable[[Element], bool]
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
"""The kind of object produced by a pre-chunker."""
TextAndHtml: TypeAlias = tuple[str, str]
# ================================================================================================
# CHUNKING OPTIONS
@ -441,37 +444,31 @@ class TablePreChunk:
def iter_chunks(self) -> Iterator[Table | TableChunk]:
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
maxlen = self._opts.hard_max
text_remainder = self._text
html_remainder = self._table.metadata.text_as_html or ""
# -- only text-split a table when it's longer than the chunking window --
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
# -- but the overlap-prefix must be added to its text --
yield Table(text=text_remainder, metadata=self._metadata)
# -- A table with no non-whitespace text produces no chunks --
if not self._table_text:
return
split = self._opts.split
is_continuation = False
while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder)
# -- only text-split a table when it's longer than the chunking window --
maxlen = self._opts.hard_max
if len(self._text_with_overlap) <= maxlen and len(self._html) <= maxlen:
# -- use the compactified html for .text_as_html, even though we're not splitting --
metadata = self._metadata
metadata.text_as_html = self._html or None
# -- note the overlap-prefix is prepended to its text --
yield Table(text=self._text_with_overlap, metadata=metadata)
return
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment.
if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
metadata.text_as_html = chunk_html
# -- When there's no HTML, split it like a normal element. Also fall back to text-only
# -- chunks when `max_characters` is less than 50. `.text_as_html` metadata is impractical
# -- for a chunking window that small because the 33 characters of HTML overhead for each
# -- chunk (`<table><tr><td>...</td></tr></table>`) would produce a very large number of
# -- very small chunks.
if not self._html or self._opts.hard_max < 50:
yield from self._iter_text_only_table_chunks()
return
# -- mark second and later chunks as a continuation --
if is_continuation:
metadata.is_continuation = True
yield TableChunk(text=chunk_text, metadata=metadata)
is_continuation = True
# -- otherwise, form splits with "synchronized" text and html --
yield from self._iter_text_and_html_table_chunks()
@lazyproperty
def overlap_tail(self) -> str:
@ -482,18 +479,80 @@ class TablePreChunk:
trailing whitespace.
"""
overlap = self._opts.inter_chunk_overlap
return self._text[-overlap:].strip() if overlap else ""
return self._text_with_overlap[-overlap:].strip() if overlap else ""
@lazyproperty
def _html(self) -> str:
    """The minified text-as-HTML for this table, or "" when none was captured.

    Table structure is missing when `infer_table_structure` was set `False` in the
    partitioning call, in which case this value is the empty string.
    """
    html_table = self._html_table
    return html_table.html if html_table is not None else ""
@lazyproperty
def _html_table(self) -> HtmlTable | None:
    """Parsed `lxml` proxy for this table's HTML.

    `None` when the `Table` element has no usable `.metadata.text_as_html`.
    """
    raw_html = self._table.metadata.text_as_html
    if raw_html is None:
        return None
    stripped = raw_html.strip()
    if not stripped:  # pragma: no cover
        return None
    return HtmlTable.from_html_text(stripped)
def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
    """Split this table into chunks whose HTML corresponds exactly to their text.

    Each chunk's `.metadata.text_as_html` is an independently-parseable `<table>` HTML
    fragment.
    """
    html_table = self._html_table
    if html_table is None:  # pragma: no cover
        raise ValueError("this method is undefined for a table having no .text_as_html")
    for idx, (text, html) in enumerate(_TableSplitter.iter_subtables(html_table, self._opts)):
        metadata = self._metadata
        metadata.text_as_html = html
        # -- only second-and-later chunks are marked as continuations --
        metadata.is_continuation = True if idx else None
        yield TableChunk(text=text, metadata=metadata)
def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
    """Split an oversized table that has no text-as-html into text-only chunks."""
    split = self._opts.split
    remainder = self._text_with_overlap
    chunk_count = 0
    while remainder:
        # -- peel off the next window-sized text fragment --
        fragment, remainder = split(remainder)
        metadata = self._metadata
        # -- only second-and-later chunks are marked as continuations --
        metadata.is_continuation = True if chunk_count else None
        chunk_count += 1
        yield TableChunk(text=fragment, metadata=metadata)
@property
def _metadata(self) -> ElementMetadata:
"""The base `.metadata` value for chunks formed from this pre-chunk.
The term "base" here means that other metadata fields will be added, depending on the chunk.
In particular, `.metadata.text_as_html` will be different for each text-split chunk and
`.metadata.is_continuation` must be added for second-and-later text-split chunks.
The term "base" here means that other metadata fields will be added, depending on the
chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk
and `.metadata.is_continuation` must be added for second-and-later text-split chunks.
Note this is a fresh copy of the metadata on each call since it will need to be mutated
differently for each chunk formed from from this pre-chunk.
differently for each chunk formed from this pre-chunk.
"""
CS = ConsolidationStrategy
metadata = copy.deepcopy(self._table.metadata)
@ -528,10 +587,15 @@ class TablePreChunk:
return [orig_table]
@lazyproperty
def _text(self) -> str:
def _table_text(self) -> str:
"""The text in this table, not including any overlap-prefix or extra whitespace."""
return " ".join(self._table.text.split())
@lazyproperty
def _text_with_overlap(self) -> str:
"""The text for this chunk, including the overlap-prefix when present."""
overlap_prefix = self._overlap_prefix
table_text = self._table.text
table_text = self._table.text.strip()
# -- use row-separator between overlap and table-text --
return overlap_prefix + "\n" + table_text if overlap_prefix else table_text
@ -795,6 +859,82 @@ class TextPreChunk:
# ================================================================================================
class _TableSplitter:
    """Generates (text, html) pairs, one per chunk, for an oversized `<table>` element.

    Splits fall on whole-row boundaries whenever possible. A row that is by itself too big
    for the chunking window is split on an even cell boundary, and a cell that is by itself
    too big is divided by text-splitting on an even word boundary.

    Every `html` value produced is an independently-parseable `<table>` HTML subtree.
    """

    def __init__(self, table_element: HtmlTable, opts: ChunkingOptions):
        self._table_element = table_element
        self._opts = opts

    @classmethod
    def iter_subtables(
        cls, table_element: HtmlTable, opts: ChunkingOptions
    ) -> Iterator[TextAndHtml]:
        """Generate a (text, html) pair for each chunk-sized split of `table_element`.

        Each split falls on an even row boundary when possible, degrading to even-cell and
        even-word boundaries for an oversized row or cell respectively.
        """
        return cls(table_element, opts)._iter_subtables()

    def _iter_subtables(self) -> Iterator[TextAndHtml]:
        """Generate (text, html) pairs each holding as many whole rows as fit the window."""
        accum = _RowAccumulator(maxlen=self._opts.hard_max)
        for row in self._table_element.iter_rows():
            # -- common case: this row fits alongside those already accumulated --
            if accum.will_fit(row):
                accum.add_row(row)
                continue
            # -- otherwise close out any in-progress chunk to make room --
            yield from accum.flush()
            if accum.will_fit(row):
                accum.add_row(row)
            else:
                # -- this row alone exceeds the window; split it cell-wise --
                yield from self._iter_row_splits(row)
        yield from accum.flush()

    def _iter_row_splits(self, row: HtmlRow) -> Iterator[TextAndHtml]:
        """Split an oversized `row` into (text, html) pairs of as many cells as fit."""
        accum = _CellAccumulator(maxlen=self._opts.hard_max)
        for cell in row.iter_cells():
            # -- common case: this cell fits alongside those already accumulated --
            if accum.will_fit(cell):
                accum.add_cell(cell)
                continue
            # -- otherwise close out any in-progress chunk to make room --
            yield from accum.flush()
            if accum.will_fit(cell):
                accum.add_cell(cell)
            else:
                # -- this cell alone exceeds the window; fall back to text-splitting --
                yield from self._iter_cell_splits(cell)
        yield from accum.flush()

    def _iter_cell_splits(self, cell: HtmlCell) -> Iterator[TextAndHtml]:
        """Text-split a single oversized `cell` into single-cell `<table>` fragments."""
        # -- 33 == len("<table><tr><td></td></tr></table>"), HTML overhead beyond the text --
        split = _TextSplitter(ChunkingOptions(max_characters=(self._opts.hard_max - 33)))
        fragment, remainder = split(cell.text)
        while True:
            yield fragment, f"<table><tr><td>{fragment}</td></tr></table>"
            # -- an oversized cell leaves a remainder; keep splitting until exhausted --
            if not remainder:
                break
            fragment, remainder = split(remainder)
class _TextSplitter:
"""Provides a text-splitting function configured on construction.
@ -911,6 +1051,97 @@ class _TextSplitter:
return fragment, overlapped_remainder
class _CellAccumulator:
    """Collects `<td>` cells for a sub-row chunk until the chunking window is filled.

    Calling `.flush()` emits whatever has accumulated as a (text, html) pair in which
    `html` is a single-row `<table>` fragment containing those cells.
    """

    def __init__(self, maxlen: int):
        self._maxlen = maxlen
        self._cells: list[HtmlCell] = []

    def add_cell(self, cell: HtmlCell) -> None:
        """Accumulate `cell`; caller must first verify it fits via `.will_fit()`."""
        self._cells.append(cell)

    def flush(self) -> Iterator[TextAndHtml]:
        """Emit zero-or-one (text, html) pairs for the cells accumulated so far."""
        if not self._cells:
            return
        text = " ".join(self._iter_cell_texts())
        cells_html = "".join(cell.html for cell in self._cells)
        self._cells.clear()
        yield text, f"<table><tr>{cells_html}</tr></table>"

    def will_fit(self, cell: HtmlCell) -> bool:
        """True when `cell` fits in the space the accumulated cells leave open."""
        return len(cell.html) <= self._remaining_space

    def _iter_cell_texts(self) -> Iterator[str]:
        """Text of each accumulated cell, skipping empty or whitespace-only cells."""
        for cell in self._cells:
            text = cell.text
            if text:
                yield text

    @property
    def _remaining_space(self) -> int:
        """Character budget left once accumulated cells are wrapped in table HTML."""
        # -- 24 == len("<table><tr></tr></table>"), overhead beyond the `<td>` fragments --
        used = sum(len(cell.html) for cell in self._cells)
        return self._maxlen - 24 - used
class _RowAccumulator:
    """Collects whole `<tr>` rows for a sub-table chunk until the chunking window is filled.

    Calling `.flush()` emits whatever has accumulated as a (text, html) pair in which
    `html` is a parseable `<table>` subtree containing those rows.
    """

    def __init__(self, maxlen: int):
        self._maxlen = maxlen
        self._rows: list[HtmlRow] = []

    def add_row(self, row: HtmlRow) -> None:
        """Accumulate `row`; caller must first verify it fits via `.will_fit()`."""
        self._rows.append(row)

    def flush(self) -> Iterator[TextAndHtml]:
        """Emit zero-or-one (text, html) pairs for the rows accumulated so far."""
        if not self._rows:
            return
        text = " ".join(self._iter_cell_texts())
        rows_html = "".join(row.html for row in self._rows)
        self._rows.clear()
        yield text, f"<table>{rows_html}</table>"

    def will_fit(self, row: HtmlRow) -> bool:
        """True when `row` fits in the space the accumulated rows leave open."""
        return len(row.html) <= self._remaining_space

    def _iter_cell_texts(self) -> Iterator[str]:
        """Text of each cell in the accumulated rows, skipping empty cells."""
        for row in self._rows:
            yield from row.iter_cell_texts()

    @property
    def _remaining_space(self) -> int:
        """Character budget left once accumulated rows are wrapped in table HTML."""
        # -- 15 == len("<table></table>"), overhead beyond the `<tr>` fragments --
        used = sum(len(row.html) for row in self._rows)
        return self._maxlen - 15 - used
# ================================================================================================
# PRE-CHUNK COMBINER
# ================================================================================================

View File

@ -6,7 +6,15 @@ Used during partitioning as well as chunking.
from __future__ import annotations
import html
from typing import Iterator, Sequence
from typing import TYPE_CHECKING, Iterator, Sequence, cast
from lxml import etree
from lxml.html import fragment_fromstring
from unstructured.utils import lazyproperty
if TYPE_CHECKING:
from lxml.html import HtmlElement
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
@ -33,7 +41,121 @@ def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
s = html.escape(s)
# -- substitute <br/> elements for line-feeds in the text --
s = "<br/>".join(s.split("\n"))
# -- strip leading and trailing whitespace, wrap it up and go --
yield f"<td>{s.strip()}</td>"
# -- normalize whitespace in cell --
cell_text = " ".join(s.split())
# -- emit void `<td/>` when cell text is empty string --
yield f"<td>{cell_text}</td>" if cell_text else "<td/>"
return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
class HtmlTable:
    """A `<table>` element."""

    def __init__(self, table: HtmlElement):
        self._table = table

    @classmethod
    def from_html_text(cls, html_text: str) -> HtmlTable:
        """Construct from an HTML fragment containing a `<table>`, normalized and compacted.

        Raises `ValueError` when `html_text` contains no `<table>` element.
        """
        # -- root is always a `<table>` element so far but let's be robust --
        root = fragment_fromstring(html_text)
        matches = root.xpath("//table")
        if not matches:
            raise ValueError("`html_text` contains no `<table>` element")
        table = matches[0]

        # -- unwrap `<thead>`, `<tbody>`, and `<tfoot>` noise elements, promoting their
        # -- children in place when any are present
        for noise in table.xpath(".//thead | .//tbody | .//tfoot"):
            noise.drop_tag()

        # -- normalize and compactify the HTML --
        for elm in table.iter():
            # -- strip all attributes, like border="1", class="dataframe" added by
            # -- pandas.DataFrame.to_html(), style="text-align: right;", etc.
            elm.attrib.clear()
            # -- rewrite `<th>` as `<td>` so all cells share one tag --
            if elm.tag == "th":
                elm.tag = "td"
            # -- collapse runs of whitespace in element text; removes indent whitespace
            # -- before nested elements and squeezes inter-word whitespace to one space
            if elm.text:
                elm.text = " ".join(elm.text.split())
            # -- drop tails entirely, those are newline + indent if anything --
            if elm.tail:
                elm.tail = None

        return cls(table)

    @lazyproperty
    def html(self) -> str:
        """The HTML-fragment for this `<table>` element, all on one line.

        Like: `<table><tr><td>foo</td></tr><tr><td>bar</td></tr></table>`

        The HTML contains no human-readability whitespace, attributes, or `<thead>`,
        `<tbody>`, or `<tfoot>` tags. It is made as compact as possible to maximize the
        semantic content in a given space. This is particularly important for chunking.
        """
        return etree.tostring(self._table, encoding=str)

    def iter_rows(self) -> Iterator[HtmlRow]:
        """Generate an `HtmlRow` proxy for each direct `<tr>` child of this table."""
        for tr in cast("list[HtmlElement]", self._table.xpath("./tr")):
            yield HtmlRow(tr)

    @lazyproperty
    def text(self) -> str:
        """The clean, concatenated, text for this table."""
        # -- blank cells introduce extra whitespace, so normalize after accumulating --
        raw = " ".join(self._table.itertext())
        return " ".join(raw.split())
class HtmlRow:
    """A `<tr>` element."""

    def __init__(self, tr: HtmlElement):
        self._tr = tr

    @lazyproperty
    def html(self) -> str:
        """Like "<tr><td>foo</td><td>bar</td></tr>"."""
        return etree.tostring(self._tr, encoding=str)

    def iter_cells(self) -> Iterator[HtmlCell]:
        """Generate an `HtmlCell` proxy for each child element of this row."""
        yield from (HtmlCell(td) for td in self._tr)

    def iter_cell_texts(self) -> Iterator[str]:
        """Generate contents of each cell of this row as a separate string.

        Only the direct leading text of each `<td>` is considered. A cell that is empty
        or contains only whitespace does not generate a string.
        """
        for td in self._tr:
            text = td.text
            if text is None:
                continue
            stripped = text.strip()
            if stripped:
                yield stripped
class HtmlCell:
    """A `<td>` element."""

    def __init__(self, td: HtmlElement):
        self._td = td

    @lazyproperty
    def html(self) -> str:
        """Like "<td>foo bar baz</td>"."""
        # -- emit a void `<td/>` when the cell has no text --
        return "<td/>" if not self.text else etree.tostring(self._td, encoding=str)

    @lazyproperty
    def text(self) -> str:
        """Text inside `<td>` element, empty string when no text."""
        raw = self._td.text
        return "" if raw is None else raw.strip()

View File

@ -4,13 +4,7 @@ from __future__ import annotations
import os
import re
import sys
from typing import List, Optional
if sys.version_info < (3, 8):
from typing_extensions import Final # pragma: nocover
else:
from typing import Final
from typing import Final, List, Optional
from unstructured.cleaners.core import remove_punctuation
from unstructured.logger import trace_logger