mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-14 17:43:20 +00:00
fix(html): unequal row lengths in HTMLTable.text_as_html (#2345)
Fixes #2339. Fixes to HTML partitioning introduced with v0.11.0 removed the use of `tabulate` for forming the HTML placed in `HTMLTable.text_as_html`. This had several benefits, but part of `tabulate`'s behavior was to make row-length (cell-count) uniform across the rows of the table. Lacking this prior uniformity produced a downstream problem reported in #2339. On closer inspection, the method used to "harvest" cell-text was producing more text-nodes than there were cells and was sensitive to where whitespace was used to format the HTML. It also "moved" text to different columns in certain rows. Refine the cell-text gathering mechanism to get exactly one text string for each row cell, eliminating whitespace formatting nodes and producing strict correspondence between the number of cells in the original HTML table row and that placed in `HTMLTable.text_as_html`. HTML tables that are uniform (every row has the same number of cells) will produce a uniform table in `.text_as_html`. Merged cells may still produce a non-uniform table in `.text_as_html` (because the source table is non-uniform).
This commit is contained in:
parent
950e5d68f9
commit
22cbdce7ca
@ -1,4 +1,4 @@
|
|||||||
## 0.11.9-dev1
|
## 0.11.9-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -10,6 +10,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* **Fix unequal row-length in HTMLTable.text_as_html.** Fixes to other aspects of partition_html() in v0.11 allowed unequal cell-counts in table rows. Make the cells in each row correspond 1:1 with cells in the original table row. This fix also removes "noise" cells resulting from HTML-formatting whitespace and eliminates the "column-shifting" of cells that previously resulted from noise-cells.
|
||||||
|
|
||||||
## 0.11.8
|
## 0.11.8
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -6,6 +6,7 @@ from typing import Dict, List
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from lxml import html as lxml_html
|
||||||
|
|
||||||
from unstructured.documents import html
|
from unstructured.documents import html
|
||||||
from unstructured.documents.base import Page
|
from unstructured.documents.base import Page
|
||||||
@ -28,6 +29,7 @@ from unstructured.documents.html import (
|
|||||||
HTMLTable,
|
HTMLTable,
|
||||||
HTMLTitle,
|
HTMLTitle,
|
||||||
TagsMixin,
|
TagsMixin,
|
||||||
|
_parse_HTMLTable_from_table_elem,
|
||||||
)
|
)
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
@ -883,6 +885,108 @@ def test_line_break_in_text_tag(tag):
|
|||||||
assert doc.elements[1].text == "World"
|
assert doc.elements[1].text == "World"
|
||||||
|
|
||||||
|
|
||||||
|
# -- unit-level tests ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_parse_HTMLTable_from_table_elem:
|
||||||
|
"""Unit-test suite for `unstructured.documents.html._parse_HTMLTable_from_table_elem`."""
|
||||||
|
|
||||||
|
def it_produces_one_cell_for_each_original_table_cell(self):
|
||||||
|
table_html = (
|
||||||
|
# -- include formatting whitespace to make sure it is removed --
|
||||||
|
"<table>\n"
|
||||||
|
" <tr>\n"
|
||||||
|
" <td>foo</td>\n"
|
||||||
|
" <td>bar</td>\n"
|
||||||
|
" </tr>\n"
|
||||||
|
"</table>"
|
||||||
|
)
|
||||||
|
table_elem = lxml_html.fromstring(table_html) # pyright: ignore[reportUnknownMemberType]
|
||||||
|
|
||||||
|
html_table = _parse_HTMLTable_from_table_elem(table_elem)
|
||||||
|
|
||||||
|
assert isinstance(html_table, HTMLTable)
|
||||||
|
assert html_table.text == "foo bar"
|
||||||
|
assert html_table.text_as_html == "<table><tr><td>foo</td><td>bar</td></tr></table>"
|
||||||
|
|
||||||
|
def it_accommodates_tds_with_child_elements(self):
|
||||||
|
"""Like this example from an SEC 10k filing."""
|
||||||
|
table_html = (
|
||||||
|
"<table>\n"
|
||||||
|
" <tr>\n"
|
||||||
|
" <td></td>\n"
|
||||||
|
" <td></td>\n"
|
||||||
|
" </tr>\n"
|
||||||
|
" <tr>\n"
|
||||||
|
" <td>\n"
|
||||||
|
" <p>\n"
|
||||||
|
" <span>\n"
|
||||||
|
' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
|
||||||
|
' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
|
||||||
|
' format="ixt-sec:boolballotbox">\n'
|
||||||
|
" <span>☒</span>\n"
|
||||||
|
" </ix:nonNumeric>\n"
|
||||||
|
" </span>\n"
|
||||||
|
" </p>\n"
|
||||||
|
" </td>\n"
|
||||||
|
" <td>\n"
|
||||||
|
" <p>\n"
|
||||||
|
" <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
|
||||||
|
" ACT OF 1934</span>\n"
|
||||||
|
" </p>\n"
|
||||||
|
" </td>\n"
|
||||||
|
" </tr>\n"
|
||||||
|
"</table>\n"
|
||||||
|
)
|
||||||
|
table_elem = lxml_html.fromstring(table_html) # pyright: ignore[reportUnknownMemberType]
|
||||||
|
|
||||||
|
html_table = _parse_HTMLTable_from_table_elem(table_elem)
|
||||||
|
|
||||||
|
assert isinstance(html_table, HTMLTable)
|
||||||
|
assert html_table.text == (
|
||||||
|
"☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
|
||||||
|
)
|
||||||
|
print(f"{html_table.text_as_html=}")
|
||||||
|
assert html_table.text_as_html == (
|
||||||
|
"<table>"
|
||||||
|
"<tr><td></td><td></td></tr>"
|
||||||
|
"<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
|
||||||
|
" EXCHANGE ACT OF 1934</td></tr>"
|
||||||
|
"</table>"
|
||||||
|
)
|
||||||
|
|
||||||
|
def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(self):
|
||||||
|
"""Recursively ..."""
|
||||||
|
nested_table_html = (
|
||||||
|
"<table>\n"
|
||||||
|
" <tr>\n"
|
||||||
|
" <td>\n"
|
||||||
|
" <table>\n"
|
||||||
|
" <tr><td>foo</td><td>bar</td></tr>\n"
|
||||||
|
" <tr><td>baz</td><td>bng</td></tr>\n"
|
||||||
|
" </table>\n"
|
||||||
|
" </td>\n"
|
||||||
|
" <td>\n"
|
||||||
|
" <table>\n"
|
||||||
|
" <tr><td>fizz</td><td>bang</td></tr>\n"
|
||||||
|
" </table>\n"
|
||||||
|
" </td>\n"
|
||||||
|
" </tr>\n"
|
||||||
|
"</table>"
|
||||||
|
)
|
||||||
|
table_elem = lxml_html.fromstring( # pyright: ignore[reportUnknownMemberType]
|
||||||
|
nested_table_html
|
||||||
|
)
|
||||||
|
|
||||||
|
html_table = _parse_HTMLTable_from_table_elem(table_elem)
|
||||||
|
|
||||||
|
assert isinstance(html_table, HTMLTable)
|
||||||
|
assert html_table.text == "foo bar baz bng fizz bang"
|
||||||
|
assert html_table.text_as_html == (
|
||||||
|
"<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# -- module-level fixtures -----------------------------------------------------------------------
|
# -- module-level fixtures -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,10 +29,10 @@ def test_partition_epub_from_filename_returns_table_in_elements():
|
|||||||
assert (
|
assert (
|
||||||
elements[14].text.replace("\n", " ")
|
elements[14].text.replace("\n", " ")
|
||||||
== Table(
|
== Table(
|
||||||
text="Contents. List of Illustrations "
|
text="Contents. List of Illustrations "
|
||||||
"(In certain versions of this etext [in certain browsers] "
|
"(In certain versions of this etext [in certain browsers] "
|
||||||
"clicking on the image will bring up a larger version.) "
|
"clicking on the image will bring up a larger version.) "
|
||||||
" (etext transcriber's note)",
|
"(etext transcriber's note)",
|
||||||
).text
|
).text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -15,8 +15,7 @@ def test_partition_rtf_from_filename():
|
|||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
assert elements[0] == Title("My First Heading")
|
assert elements[0] == Title("My First Heading")
|
||||||
assert elements[-1] == Table(
|
assert elements[-1] == Table(
|
||||||
text="Column 1 \n Column 2 \n Row 1, Cell 1 \n Row 1, "
|
text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2"
|
||||||
"Cell 2 \n Row 2, Cell 1 \n Row 2, Cell 2",
|
|
||||||
)
|
)
|
||||||
for element in elements:
|
for element in elements:
|
||||||
assert element.metadata.filename == "fake-doc.rtf"
|
assert element.metadata.filename == "fake-doc.rtf"
|
||||||
|
@ -1145,9 +1145,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
|
|||||||
assert len(partitioned_table_elements_5_chars) != len(table_elements)
|
assert len(partitioned_table_elements_5_chars) != len(table_elements)
|
||||||
assert len(partitioned_table_elements_200_chars) != len(table_elements)
|
assert len(partitioned_table_elements_200_chars) != len(table_elements)
|
||||||
|
|
||||||
# trailing whitespace is stripped from the first chunk, leaving only a checkbox character
|
assert len(partitioned_table_elements_5_chars[0].text) == 5
|
||||||
assert len(partitioned_table_elements_5_chars[0].text) == 1
|
|
||||||
# but the second chunk is the full 5 characters
|
|
||||||
assert len(partitioned_table_elements_5_chars[1].text) == 5
|
assert len(partitioned_table_elements_5_chars[1].text) == 5
|
||||||
assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
|
assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
|
||||||
|
|
||||||
|
@ -276,7 +276,7 @@ def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
|
|||||||
elements = partition_html(filename=filename)
|
elements = partition_html(filename=filename)
|
||||||
assert len(elements) == 1
|
assert len(elements) == 1
|
||||||
assert elements[0] == Table(
|
assert elements[0] == Table(
|
||||||
text="January 2023 ( Someone fed my essays into GPT to make something "
|
text="January 2023 ( Someone fed my essays into GPT to make something "
|
||||||
"that could answer\nquestions based on them, then asked it where good "
|
"that could answer\nquestions based on them, then asked it where good "
|
||||||
"ideas come from. The\nanswer was ok, but not what I would have said. "
|
"ideas come from. The\nanswer was ok, but not what I would have said. "
|
||||||
"This is what I would have said.) The way to get new ideas is to notice "
|
"This is what I would have said.) The way to get new ideas is to notice "
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
"element_id": "8088fbcca4eb780b8a4b8efe4b018860",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"date_created": "2023-06-16T05:04:47",
|
"date_created": "2023-06-16T05:04:47",
|
||||||
@ -106,9 +106,9 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1,6 +1,6 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
"element_id": "8088fbcca4eb780b8a4b8efe4b018860",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"date_created": "2023-06-16T05:04:47",
|
"date_created": "2023-06-16T05:04:47",
|
||||||
@ -18,9 +18,9 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1,6 +1,6 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9",
|
"element_id": "597883fce258148ee227842378ce55c3",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"date_created": "2023-07-09T12:55:50.911000",
|
"date_created": "2023-07-09T12:55:50.911000",
|
||||||
@ -17,9 +17,9 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>Driver</td></tr><tr><td>Approver</td></tr><tr><td>Contributors</td></tr><tr><td>Informed</td></tr><tr><td>Objective</td></tr><tr><td>Due date</td></tr><tr><td>Key outcomes</td></tr><tr><td>Status</td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr></table>"
|
"text_as_html": "<table><tr><td>Driver</td><td></td></tr><tr><td>Approver</td><td></td></tr><tr><td>Contributors</td><td></td></tr><tr><td>Informed</td><td></td></tr><tr><td>Objective</td><td></td></tr><tr><td>Due date</td><td></td></tr><tr><td>Key outcomes</td><td></td></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
|
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -84,7 +84,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>Must have:</td></tr><tr><td>Nice to have:</td></tr><tr><td>Not in scope:</td></tr></table>"
|
"text_as_html": "<table><tr><td>Must have:</td><td></td></tr><tr><td>Nice to have:</td><td></td></tr><tr><td>Not in scope:</td><td></td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "Must have: Nice to have: Not in scope:",
|
"text": "Must have: Nice to have: Not in scope:",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
@ -327,7 +327,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr></table>"
|
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "Milestone Owner Deadline Status",
|
"text": "Milestone Owner Deadline Status",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
|
@ -171,7 +171,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr></table>"
|
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "Time Item Presenter Notes",
|
"text": "Time Item Presenter Notes",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
|
@ -299,7 +299,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>Notes</td></tr><tr><td>Important Links</td></tr></table>"
|
"text_as_html": "<table><tr><td>Notes</td><td></td></tr><tr><td>Important Links</td><td></td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "Notes Important Links",
|
"text": "Notes Important Links",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
"element_id": "8088fbcca4eb780b8a4b8efe4b018860",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"date_created": "2023-06-20T23:49:31.038000+00:00",
|
"date_created": "2023-06-20T23:49:31.038000+00:00",
|
||||||
@ -17,9 +17,9 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1,6 +1,6 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
"element_id": "8088fbcca4eb780b8a4b8efe4b018860",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"date_created": "2023-06-20T23:48:13.750000+00:00",
|
"date_created": "2023-06-20T23:48:13.750000+00:00",
|
||||||
@ -17,9 +17,9 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1,6 +1,6 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
"element_id": "8088fbcca4eb780b8a4b8efe4b018860",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {
|
"data_source": {
|
||||||
"date_created": "2023-06-20T23:48:24.586000+00:00",
|
"date_created": "2023-06-20T23:48:24.586000+00:00",
|
||||||
@ -17,9 +17,9 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||||
},
|
},
|
||||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||||
"type": "Table"
|
"type": "Table"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1 +1 @@
|
|||||||
__version__ = "0.11.9-dev1" # pragma: no cover
|
__version__ = "0.11.9-dev2" # pragma: no cover
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast
|
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
|
||||||
|
|
||||||
if sys.version_info < (3, 8):
|
if sys.version_info < (3, 8):
|
||||||
from typing_extensions import Final
|
from typing_extensions import Final
|
||||||
@ -343,9 +343,20 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
|
|||||||
if not trs:
|
if not trs:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
table_data = [[str(text) for text in tr.itertext()] for tr in trs]
|
def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
|
||||||
|
"""Generate the text of each cell in `tr`."""
|
||||||
|
# -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
|
||||||
|
tds = cast(List[etree._Element], tr.xpath("./td | ./th"))
|
||||||
|
for td in tds:
|
||||||
|
# -- a cell can contain other elements like spans etc. so we can't count on the text
|
||||||
|
# -- being directly below the `<td>` element. `.itertext()` gets all of it recursively.
|
||||||
|
# -- Filter out whitespace text nodes that result from HTML formatting.
|
||||||
|
stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext()))
|
||||||
|
yield " ".join(t for t in stripped_text_nodes if t)
|
||||||
|
|
||||||
|
table_data = [list(iter_cell_texts(tr)) for tr in trs]
|
||||||
html_table = htmlify_matrix_of_cell_texts(table_data)
|
html_table = htmlify_matrix_of_cell_texts(table_data)
|
||||||
table_text = " ".join(" ".join(row) for row in table_data).strip()
|
table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()
|
||||||
|
|
||||||
if table_text == "":
|
if table_text == "":
|
||||||
return None
|
return None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user