diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13dfd363c..f38012bfe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.9-dev1
+## 0.11.9-dev2
### Enhancements
@@ -10,6 +10,8 @@
### Fixes
+* **Fix unequal row-length in HTMLTable.text_as_html.** Fixes to other aspects of partition_html() in v0.11 allowed unequal cell-counts in table rows. Make the cells in each row correspond 1:1 with cells in the original table row. This fix also removes "noise" cells resulting from HTML-formatting whitespace and eliminates the "column-shifting" of cells that previously resulted from noise-cells.
+
## 0.11.8
### Enhancements
diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
index 01c120155..48777cf13 100644
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -6,6 +6,7 @@ from typing import Dict, List
import pytest
from lxml import etree
+from lxml import html as lxml_html
from unstructured.documents import html
from unstructured.documents.base import Page
@@ -28,6 +29,7 @@ from unstructured.documents.html import (
HTMLTable,
HTMLTitle,
TagsMixin,
+ _parse_HTMLTable_from_table_elem,
)
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -883,6 +885,108 @@ def test_line_break_in_text_tag(tag):
assert doc.elements[1].text == "World"
+# -- unit-level tests ----------------------------------------------------------------------------
+
+
+class Describe_parse_HTMLTable_from_table_elem:
+ """Unit-test suite for `unstructured.documents.html._parse_HTMLTable_from_table_elem`."""
+
+ def it_produces_one_cell_for_each_original_table_cell(self):
+ table_html = (
+ # -- include formatting whitespace to make sure it is removed --
+ "
\n"
+ " \n"
+ " foo | \n"
+ " bar | \n"
+ "
\n"
+ "
"
+ )
+ table_elem = lxml_html.fromstring(table_html) # pyright: ignore[reportUnknownMemberType]
+
+ html_table = _parse_HTMLTable_from_table_elem(table_elem)
+
+ assert isinstance(html_table, HTMLTable)
+ assert html_table.text == "foo bar"
+ assert html_table.text_as_html == ""
+
+ def it_accommodates_tds_with_child_elements(self):
+ """Like this example from an SEC 10k filing."""
+ table_html = (
+ "\n"
+ " \n"
+ " | \n"
+ " | \n"
+ "
\n"
+ " \n"
+ " \n"
+ " \n"
+ " \n"
+ ' \n'
+ " ☒\n"
+ " \n"
+ " \n"
+ " \n"
+ " | \n"
+ " \n"
+ " \n"
+ " ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
+ " ACT OF 1934\n"
+ " \n"
+ " | \n"
+ "
\n"
+ "
\n"
+ )
+ table_elem = lxml_html.fromstring(table_html) # pyright: ignore[reportUnknownMemberType]
+
+ html_table = _parse_HTMLTable_from_table_elem(table_elem)
+
+ assert isinstance(html_table, HTMLTable)
+ assert html_table.text == (
+ "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
+ )
+ print(f"{html_table.text_as_html=}")
+ assert html_table.text_as_html == (
+ ""
+ " | |
"
+ "☒ | ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
+ " EXCHANGE ACT OF 1934 |
"
+ "
"
+ )
+
+ def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(self):
+ """Recursively ..."""
+ nested_table_html = (
+ "\n"
+ " \n"
+ " \n"
+ " \n"
+ " foo | bar | \n"
+ " baz | bng | \n"
+ " \n"
+ " | \n"
+ " \n"
+ " \n"
+ " | \n"
+ "
\n"
+ "
"
+ )
+ table_elem = lxml_html.fromstring( # pyright: ignore[reportUnknownMemberType]
+ nested_table_html
+ )
+
+ html_table = _parse_HTMLTable_from_table_elem(table_elem)
+
+ assert isinstance(html_table, HTMLTable)
+ assert html_table.text == "foo bar baz bng fizz bang"
+ assert html_table.text_as_html == (
+ ""
+ )
+
+
# -- module-level fixtures -----------------------------------------------------------------------
diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py
index f7d32eba7..ab0843e39 100644
--- a/test_unstructured/partition/epub/test_epub.py
+++ b/test_unstructured/partition/epub/test_epub.py
@@ -29,10 +29,10 @@ def test_partition_epub_from_filename_returns_table_in_elements():
assert (
elements[14].text.replace("\n", " ")
== Table(
- text="Contents. List of Illustrations "
+ text="Contents. List of Illustrations "
"(In certain versions of this etext [in certain browsers] "
"clicking on the image will bring up a larger version.) "
- " (etext transcriber's note)",
+ "(etext transcriber's note)",
).text
)
diff --git a/test_unstructured/partition/pypandoc/test_rtf.py b/test_unstructured/partition/pypandoc/test_rtf.py
index 466334382..6e41afa9b 100644
--- a/test_unstructured/partition/pypandoc/test_rtf.py
+++ b/test_unstructured/partition/pypandoc/test_rtf.py
@@ -15,8 +15,7 @@ def test_partition_rtf_from_filename():
assert len(elements) > 0
assert elements[0] == Title("My First Heading")
assert elements[-1] == Table(
- text="Column 1 \n Column 2 \n Row 1, Cell 1 \n Row 1, "
- "Cell 2 \n Row 2, Cell 1 \n Row 2, Cell 2",
+ text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2"
)
for element in elements:
assert element.metadata.filename == "fake-doc.rtf"
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index bd2b3596f..4103cd566 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -1145,9 +1145,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
assert len(partitioned_table_elements_5_chars) != len(table_elements)
assert len(partitioned_table_elements_200_chars) != len(table_elements)
- # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
- assert len(partitioned_table_elements_5_chars[0].text) == 1
- # but the second chunk is the full 5 characters
+ assert len(partitioned_table_elements_5_chars[0].text) == 5
assert len(partitioned_table_elements_5_chars[1].text) == 5
assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py
index 8acb6ca8e..d2d2485b0 100644
--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@@ -276,7 +276,7 @@ def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
elements = partition_html(filename=filename)
assert len(elements) == 1
assert elements[0] == Table(
- text="January 2023 ( Someone fed my essays into GPT to make something "
+ text="January 2023 ( Someone fed my essays into GPT to make something "
"that could answer\nquestions based on them, then asked it where good "
"ideas come from. The\nanswer was ok, but not what I would have said. "
"This is what I would have said.) The way to get new ideas is to notice "
diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json
index d1f15ca6e..b2f283363 100644
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json
@@ -1,6 +1,6 @@
[
{
- "element_id": "e83a347af95db7ba47b5351f411e00c7",
+ "element_id": "8088fbcca4eb780b8a4b8efe4b018860",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:04:47",
@@ -106,9 +106,9 @@
"eng"
],
"page_number": 1,
- "text_as_html": "January 2023 | ( | Someone | fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) | The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. | Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
+ "text_as_html": " | | January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
},
- "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
+ "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"
}
]
\ No newline at end of file
diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json
index 2bcd8de7d..d4d9d9027 100644
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json
@@ -1,6 +1,6 @@
[
{
- "element_id": "e83a347af95db7ba47b5351f411e00c7",
+ "element_id": "8088fbcca4eb780b8a4b8efe4b018860",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:04:47",
@@ -18,9 +18,9 @@
"eng"
],
"page_number": 1,
- "text_as_html": "January 2023 | ( | Someone | fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) | The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. | Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
+ "text_as_html": " | | January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
},
- "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
+ "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"
}
]
\ No newline at end of file
diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json
index 65069fd06..7886f2741 100644
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json
@@ -1,6 +1,6 @@
[
{
- "element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9",
+ "element_id": "597883fce258148ee227842378ce55c3",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
@@ -17,9 +17,9 @@
"eng"
],
"page_number": 1,
- "text_as_html": "Driver |
Approver |
Contributors |
Informed |
Objective |
Due date |
Key outcomes |
Status | NOT STARTED | / | IN PROGRESS | / | COMPLETE |
"
+ "text_as_html": "Driver | |
Approver | |
Contributors | |
Informed | |
Objective | |
Due date | |
Key outcomes | |
Status | NOT STARTED / IN PROGRESS / COMPLETE |
"
},
- "text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
+ "text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
"type": "Table"
},
{
@@ -84,7 +84,7 @@
"eng"
],
"page_number": 1,
- "text_as_html": "Must have: |
Nice to have: |
Not in scope: |
"
+ "text_as_html": "Must have: | |
Nice to have: | |
Not in scope: | |
"
},
"text": "Must have: Nice to have: Not in scope:",
"type": "Table"
@@ -327,7 +327,7 @@
"eng"
],
"page_number": 1,
- "text_as_html": "Milestone | Owner | Deadline | Status |
"
+ "text_as_html": "Milestone | Owner | Deadline | Status |
| | | |
| | | |
| | | |
"
},
"text": "Milestone Owner Deadline Status",
"type": "Table"
diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json
index 807233f3c..885c0cb25 100644
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json
@@ -171,7 +171,7 @@
"eng"
],
"page_number": 1,
- "text_as_html": ""
+ "text_as_html": ""
},
"text": "Time Item Presenter Notes",
"type": "Table"
diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json
index be5c40bfc..7f97c1210 100644
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json
@@ -299,7 +299,7 @@
"eng"
],
"page_number": 1,
- "text_as_html": ""
+ "text_as_html": ""
},
"text": "Notes Important Links",
"type": "Table"
diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json
index b31e71054..3af04c59b 100644
--- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json
@@ -1,6 +1,6 @@
[
{
- "element_id": "e83a347af95db7ba47b5351f411e00c7",
+ "element_id": "8088fbcca4eb780b8a4b8efe4b018860",
"metadata": {
"data_source": {
"date_created": "2023-06-20T23:49:31.038000+00:00",
@@ -17,9 +17,9 @@
"eng"
],
"page_number": 1,
- "text_as_html": "January 2023 | ( | Someone | fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) | The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. | Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
+ "text_as_html": " | | January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
},
- "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
+ "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"
}
]
\ No newline at end of file
diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json
index 1084cf9aa..9942da6a6 100644
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json
@@ -1,6 +1,6 @@
[
{
- "element_id": "e83a347af95db7ba47b5351f411e00c7",
+ "element_id": "8088fbcca4eb780b8a4b8efe4b018860",
"metadata": {
"data_source": {
"date_created": "2023-06-20T23:48:13.750000+00:00",
@@ -17,9 +17,9 @@
"eng"
],
"page_number": 1,
- "text_as_html": "January 2023 | ( | Someone | fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) | The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. | Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
+ "text_as_html": " | | January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
},
- "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
+ "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"
}
]
\ No newline at end of file
diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json
index fa1f070d8..4d2018ef4 100644
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json
@@ -1,6 +1,6 @@
[
{
- "element_id": "e83a347af95db7ba47b5351f411e00c7",
+ "element_id": "8088fbcca4eb780b8a4b8efe4b018860",
"metadata": {
"data_source": {
"date_created": "2023-06-20T23:48:24.586000+00:00",
@@ -17,9 +17,9 @@
"eng"
],
"page_number": 1,
- "text_as_html": "January 2023 | ( | Someone | fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) | The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. | Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
+ "text_as_html": " | | January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
"
},
- "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
+ "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"
}
]
\ No newline at end of file
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index cb2abecf2..4371b4c01 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.9-dev1" # pragma: no cover
+__version__ = "0.11.9-dev2" # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index 9b715943e..5e00ad3d1 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -3,7 +3,7 @@
from __future__ import annotations
import sys
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast
+from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
if sys.version_info < (3, 8):
from typing_extensions import Final
@@ -343,9 +343,20 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
if not trs:
return None
- table_data = [[str(text) for text in tr.itertext()] for tr in trs]
+ def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
+ """Generate the text of each cell in `tr`."""
+ # -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
+ tds = cast(List[etree._Element], tr.xpath("./td | ./th"))
+ for td in tds:
+ # -- a cell can contain other elements like spans etc. so we can't count on the text
+ # -- being directly below the `<td>` element. `.itertext()` gets all of it recursively.
+ # -- Filter out whitespace text nodes that result from HTML formatting.
+ stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext()))
+ yield " ".join(t for t in stripped_text_nodes if t)
+
+ table_data = [list(iter_cell_texts(tr)) for tr in trs]
html_table = htmlify_matrix_of_cell_texts(table_data)
- table_text = " ".join(" ".join(row) for row in table_data).strip()
+ table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()
if table_text == "":
return None
|