diff --git a/CHANGELOG.md b/CHANGELOG.md index 13dfd363c..f38012bfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.11.9-dev1 +## 0.11.9-dev2 ### Enhancements @@ -10,6 +10,8 @@ ### Fixes +* **Fix unequal row-length in HTMLTable.text_as_html.** Fixes to other aspects of partition_html() in v0.11 allowed unequal cell-counts in table rows. Make the cells in each row correspond 1:1 with cells in the original table row. This fix also removes "noise" cells resulting from HTML-formatting whitespace and eliminates the "column-shifting" of cells that previously resulted from noise-cells. + ## 0.11.8 ### Enhancements diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index 01c120155..48777cf13 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -6,6 +6,7 @@ from typing import Dict, List import pytest from lxml import etree +from lxml import html as lxml_html from unstructured.documents import html from unstructured.documents.base import Page @@ -28,6 +29,7 @@ from unstructured.documents.html import ( HTMLTable, HTMLTitle, TagsMixin, + _parse_HTMLTable_from_table_elem, ) DIRECTORY = pathlib.Path(__file__).parent.resolve() @@ -883,6 +885,108 @@ def test_line_break_in_text_tag(tag): assert doc.elements[1].text == "World" +# -- unit-level tests ---------------------------------------------------------------------------- + + +class Describe_parse_HTMLTable_from_table_elem: + """Unit-test suite for `unstructured.documents.html._parse_HTMLTable_from_table_elem`.""" + + def it_produces_one_cell_for_each_original_table_cell(self): + table_html = ( + # -- include formatting whitespace to make sure it is removed -- + "\n" + " \n" + " \n" + " \n" + " \n" + "
foobar
" + ) + table_elem = lxml_html.fromstring(table_html) # pyright: ignore[reportUnknownMemberType] + + html_table = _parse_HTMLTable_from_table_elem(table_elem) + + assert isinstance(html_table, HTMLTable) + assert html_table.text == "foo bar" + assert html_table.text_as_html == "
foobar
" + + def it_accommodates_tds_with_child_elements(self): + """Like this example from an SEC 10k filing.""" + table_html = ( + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
\n" + "

\n" + " \n" + ' \n' + " \n" + " \n" + " \n" + "

\n" + "
\n" + "

\n" + " ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE" + " ACT OF 1934\n" + "

\n" + "
\n" + ) + table_elem = lxml_html.fromstring(table_html) # pyright: ignore[reportUnknownMemberType] + + html_table = _parse_HTMLTable_from_table_elem(table_elem) + + assert isinstance(html_table, HTMLTable) + assert html_table.text == ( + "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934" + ) + print(f"{html_table.text_as_html=}") + assert html_table.text_as_html == ( + "" + "" + "" + "
ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES" + " EXCHANGE ACT OF 1934
" + ) + + def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(self): + """Nested-table text is flattened into the cell that contains the nested table.""" + nested_table_html = ( + "\n" + " \n" + " \n" + " \n" + " \n" + "
\n" + " \n" + " \n" + " \n" + "
foobar
bazbng
\n" + "
\n" + " \n" + " \n" + "
fizzbang
\n" + "
" + ) + table_elem = lxml_html.fromstring( # pyright: ignore[reportUnknownMemberType] + nested_table_html + ) + + html_table = _parse_HTMLTable_from_table_elem(table_elem) + + assert isinstance(html_table, HTMLTable) + assert html_table.text == "foo bar baz bng fizz bang" + assert html_table.text_as_html == ( + "
foo bar baz bngfizz bang
" + ) + + # -- module-level fixtures ----------------------------------------------------------------------- diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py index f7d32eba7..ab0843e39 100644 --- a/test_unstructured/partition/epub/test_epub.py +++ b/test_unstructured/partition/epub/test_epub.py @@ -29,10 +29,10 @@ def test_partition_epub_from_filename_returns_table_in_elements(): assert ( elements[14].text.replace("\n", " ") == Table( - text="Contents. List of Illustrations " + text="Contents. List of Illustrations " "(In certain versions of this etext [in certain browsers] " "clicking on the image will bring up a larger version.) " - " (etext transcriber's note)", + "(etext transcriber's note)", ).text ) diff --git a/test_unstructured/partition/pypandoc/test_rtf.py b/test_unstructured/partition/pypandoc/test_rtf.py index 466334382..6e41afa9b 100644 --- a/test_unstructured/partition/pypandoc/test_rtf.py +++ b/test_unstructured/partition/pypandoc/test_rtf.py @@ -15,8 +15,7 @@ def test_partition_rtf_from_filename(): assert len(elements) > 0 assert elements[0] == Title("My First Heading") assert elements[-1] == Table( - text="Column 1 \n Column 2 \n Row 1, Cell 1 \n Row 1, " - "Cell 2 \n Row 2, Cell 1 \n Row 2, Cell 2", + text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2" ) for element in elements: assert element.metadata.filename == "fake-doc.rtf" diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index bd2b3596f..4103cd566 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1145,9 +1145,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): assert len(partitioned_table_elements_5_chars) != len(table_elements) assert len(partitioned_table_elements_200_chars) != len(table_elements) - # trailing whitespace is stripped from the first chunk, leaving only a checkbox 
character - assert len(partitioned_table_elements_5_chars[0].text) == 1 - # but the second chunk is the full 5 characters + assert len(partitioned_table_elements_5_chars[0].text) == 5 assert len(partitioned_table_elements_5_chars[1].text) == 5 assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5 diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 8acb6ca8e..d2d2485b0 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -276,7 +276,7 @@ def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"): elements = partition_html(filename=filename) assert len(elements) == 1 assert elements[0] == Table( - text="January 2023 ( Someone fed my essays into GPT to make something " + text="January 2023 ( Someone fed my essays into GPT to make something " "that could answer\nquestions based on them, then asked it where good " "ideas come from. The\nanswer was ok, but not what I would have said. " "This is what I would have said.) The way to get new ideas is to notice " diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json index d1f15ca6e..b2f283363 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json @@ -1,6 +1,6 @@ [ { - "element_id": "e83a347af95db7ba47b5351f411e00c7", + "element_id": "8088fbcca4eb780b8a4b8efe4b018860", "metadata": { "data_source": { "date_created": "2023-06-16T05:04:47", @@ -106,9 +106,9 @@ "eng" ], "page_number": 1, - "text_as_html": "
January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" + "text_as_html": "
January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds.", "type": "Table" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json index 2bcd8de7d..d4d9d9027 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json @@ -1,6 +1,6 @@ [ { - "element_id": "e83a347af95db7ba47b5351f411e00c7", + "element_id": "8088fbcca4eb780b8a4b8efe4b018860", "metadata": { "data_source": { "date_created": "2023-06-16T05:04:47", @@ -18,9 +18,9 @@ "eng" ], "page_number": 1, - "text_as_html": "
January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" + "text_as_html": "
January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds.", "type": "Table" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json index 65069fd06..7886f2741 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json @@ -1,6 +1,6 @@ [ { - "element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9", + "element_id": "597883fce258148ee227842378ce55c3", "metadata": { "data_source": { "date_created": "2023-07-09T12:55:50.911000", @@ -17,9 +17,9 @@ "eng" ], "page_number": 1, - "text_as_html": "
Driver
Approver
Contributors
Informed
Objective
Due date
Key outcomes
StatusNOT STARTED/IN PROGRESS/COMPLETE
" + "text_as_html": "
Driver
Approver
Contributors
Informed
Objective
Due date
Key outcomes
StatusNOT STARTED / IN PROGRESS / COMPLETE
" }, - "text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE", + "text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE", "type": "Table" }, { @@ -84,7 +84,7 @@ "eng" ], "page_number": 1, - "text_as_html": "
Must have:
Nice to have:
Not in scope:
" + "text_as_html": "
Must have:
Nice to have:
Not in scope:
" }, "text": "Must have: Nice to have: Not in scope:", "type": "Table" @@ -327,7 +327,7 @@ "eng" ], "page_number": 1, - "text_as_html": "
MilestoneOwnerDeadlineStatus
" + "text_as_html": "
MilestoneOwnerDeadlineStatus
" }, "text": "Milestone Owner Deadline Status", "type": "Table" diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json index 807233f3c..885c0cb25 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json @@ -171,7 +171,7 @@ "eng" ], "page_number": 1, - "text_as_html": "
TimeItemPresenterNotes
" + "text_as_html": "
TimeItemPresenterNotes
" }, "text": "Time Item Presenter Notes", "type": "Table" diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json index be5c40bfc..7f97c1210 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json @@ -299,7 +299,7 @@ "eng" ], "page_number": 1, - "text_as_html": "
Notes
Important Links
" + "text_as_html": "
Notes
Important Links
" }, "text": "Notes Important Links", "type": "Table" diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json index b31e71054..3af04c59b 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json @@ -1,6 +1,6 @@ [ { - "element_id": "e83a347af95db7ba47b5351f411e00c7", + "element_id": "8088fbcca4eb780b8a4b8efe4b018860", "metadata": { "data_source": { "date_created": "2023-06-20T23:49:31.038000+00:00", @@ -17,9 +17,9 @@ "eng" ], "page_number": 1, - "text_as_html": "
January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" + "text_as_html": "
January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds.", "type": "Table" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json index 1084cf9aa..9942da6a6 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json @@ -1,6 +1,6 @@ [ { - "element_id": "e83a347af95db7ba47b5351f411e00c7", + "element_id": "8088fbcca4eb780b8a4b8efe4b018860", "metadata": { "data_source": { "date_created": "2023-06-20T23:48:13.750000+00:00", @@ -17,9 +17,9 @@ "eng" ], "page_number": 1, - "text_as_html": "
January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" + "text_as_html": "
January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds.", "type": "Table" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json index fa1f070d8..4d2018ef4 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json @@ -1,6 +1,6 @@ [ { - "element_id": "e83a347af95db7ba47b5351f411e00c7", + "element_id": "8088fbcca4eb780b8a4b8efe4b018860", "metadata": { "data_source": { "date_created": "2023-06-20T23:48:24.586000+00:00", @@ -17,9 +17,9 @@ "eng" ], "page_number": 1, - "text_as_html": "
January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" + "text_as_html": "
January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds.", "type": "Table" } ] \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cb2abecf2..4371b4c01 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.9-dev1" # pragma: no cover +__version__ = "0.11.9-dev2" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 9b715943e..5e00ad3d1 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -3,7 +3,7 @@ from __future__ import annotations import sys -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast +from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast if sys.version_info < (3, 8): from typing_extensions import Final @@ -343,9 +343,20 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele if not trs: return None - table_data = [[str(text) for text in tr.itertext()] for tr in trs] + def iter_cell_texts(tr: etree._Element) -> Iterator[str]: + """Generate the text of each cell in `tr`.""" + # -- a cell can be either a "data" cell (td) or a "heading" cell (th) -- + tds = cast(List[etree._Element], tr.xpath("./td | ./th")) + for td in tds: + # -- a cell can contain other elements like spans etc. so we can't count on the text + # -- being directly below the `` element. `.itertext()` gets all of it recursively. + # -- Filter out whitespace text nodes that result from HTML formatting. 
+ stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext())) + yield " ".join(t for t in stripped_text_nodes if t) + + table_data = [list(iter_cell_texts(tr)) for tr in trs] html_table = htmlify_matrix_of_cell_texts(table_data) - table_text = " ".join(" ".join(row) for row in table_data).strip() + table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip() if table_text == "": return None