fix: assorted partition_html() bugs (#2113)

Addresses a cluster of HTML-related bugs:
- empty table is identified as bulleted-table
- `partition_html()` emits empty (no text) tables (#1928)
- `.text_as_html` contains inappropriate `<br>` elements in invalid
locations.
- cells enclosed in `<thead>` and `<tfoot>` elements are dropped (#1928)
- `.text_as_html` contains whitespace padding

Each of these is addressed in a separate commit below.

Fixes #1928.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
This commit is contained in:
Steve Canny 2023-11-20 08:29:32 -08:00 committed by GitHub
parent 13a23deba6
commit ee9be2a3b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 326 additions and 136 deletions

View File

@ -22,6 +22,11 @@
* **Fix some pdfs returning `KeyError: 'N'`** Certain pdfs were throwing this error when being opened by pdfminer. Added a wrapper function for pdfminer that allows these documents to be partitioned.
* **Fix mis-splits on `Table` chunks.** Remedies repeated appearance of full `.text_as_html` on metadata of each `TableChunk` split from a `Table` element too large to fit in the chunking window.
* **Import tables_agent from inference** so that we don't have to initialize a global table agent in unstructured OCR again
* **Fix empty table is identified as bulleted-table.** A table with no text content was mistakenly identified as a bulleted-table and processed by the wrong branch of the initial HTML partitioner.
* **Fix partition_html() emits empty (no text) tables.** A table with cells nested below a `<thead>` or `<tfoot>` element was emitted as a table element having no text and unparseable HTML in `element.metadata.text_as_html`. Do not emit empty tables to the element stream.
* **Fix HTML `element.metadata.text_as_html` contains spurious `<br>` elements in invalid locations.** The HTML generated for the `text_as_html` metadata for HTML tables contained `<br>` elements in invalid locations, such as between `<table>` and `<tr>`. Change the HTML generator such that these do not appear.
* **Fix HTML table cells enclosed in `<thead>` and `<tfoot>` elements are dropped.** HTML table cells nested in a `<thead>` or `<tfoot>` element were not detected and the text in those cells was omitted from the table element text and `.text_as_html`. Detect table rows regardless of the semantic tag they may be nested in.
* **Remove whitespace padding from `.text_as_html`.** `tabulate` inserts padding spaces to achieve visual alignment of columns in HTML tables it generates. Add our own HTML generator to do this simple job and omit that padding as well as newlines ("\n") used for human readability.
* **Fix local connector with absolute input path.** When passed an absolute filepath for the input document path, the local connector incorrectly wrote the output file to the input file directory. The output in this case is now written to `output-dir/input-filename.json`.
## 0.10.30

View File

@ -25,6 +25,7 @@ from unstructured.documents.html import (
TEXT_TAGS,
HTMLDocument,
HTMLNarrativeText,
HTMLTable,
HTMLTitle,
TagsMixin,
)
@ -32,21 +33,26 @@ from unstructured.documents.html import (
DIRECTORY = pathlib.Path(__file__).parent.resolve()
TAGS = (
"<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
"<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
"<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
"<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5><h6>"
"<head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link><main>"
"<map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option><output><p>"
"<param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section><select><small>"
"<source><span><strike><strong><style><sub><summary><sup><table><tbody><td><template>"
"<textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
(
"<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
"<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
"<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
"<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5>"
"<h6><head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link>"
"<main><map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option>"
"<output><p><param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section>"
"<select><small><source><span><strike><strong><style><sub><summary><sup><table><tbody><td>"
"<template><textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
)
.replace(">", "")
.split("<")[1:]
)
TAGS = TAGS.replace(">", "").split("<")[1:]
VOID_TAGS = "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:]
VOID_TAGS = (
("<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>")
.replace(">", "")
.split("<")[1:]
)
INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
EXCLUDED_TAGS = [
@ -56,23 +62,157 @@ EXCLUDED_TAGS = [
]
@pytest.fixture()
def sample_doc():
table_element = HTMLTitle(
"I'm a title in a table.",
tag="p",
ancestortags=("table", "tbody", "tr", "td"),
# -- table-extraction behaviors ------------------------------------------------------------------
def test_it_can_parse_a_bare_bones_table_to_an_HTMLTable_element():
"""Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements."""
html_str = (
"<html>\n"
"<body>\n"
" <table>\n"
" <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
" <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
" </table>\n"
"</body>\n"
"</html>"
)
narrative = HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=())
page1 = Page(0)
page1.elements = [table_element, narrative]
header = HTMLTitle("I'm a header", tag="header", ancestortags=())
body = HTMLNarrativeText("Body text", tag="p", ancestortags=())
footer = HTMLTitle("I'm a footer", tag="footer", ancestortags=())
page2 = Page(1)
page2.elements = [header, body, footer]
doc = HTMLDocument.from_pages([page1, page2])
return doc
html_document = HTMLDocument.from_string(html_str)
# -- there is exactly one element and it's an HTMLTable instance --
(element,) = html_document.elements
assert isinstance(element, HTMLTable)
# -- table text is joined into a single string; no row or cell boundaries are represented --
assert element.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
# -- An HTML representation is also available that is longer but represents table structure.
# -- It is compact: no padding spaces or newlines are added around the tags or the cell text.
assert element.text_as_html == (
"<table>"
"<tr><td>Lorem</td><td>Ipsum</td></tr>"
"<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
"</table>"
)
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements():
    """Rows nested in `<thead>`, `<tbody>`, and `<tfoot>` all contribute to `.text_as_html`.

    The section wrappers themselves are not reproduced and `<th>` cells are rendered as `<td>`;
    each source row appears as a plain `<tr>` directly under `<table>`, in document order.
    """
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <thead>\n"
        "      <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        "    </thead>\n"
        "    <tbody>\n"
        "      <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        "      <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        "    </tbody>\n"
        "    <tfoot>\n"
        "      <tr><th>Dolor</th><td>Equis</td></tr>\n"
        "    </tfoot>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )

    # -- unpacking into a one-item target fails unless exactly one element was produced --
    [table] = HTMLDocument.from_string(markup).elements

    assert isinstance(table, HTMLTable)
    assert table.text_as_html == expected_html
def test_it_does_not_emit_an_HTMLTable_element_for_a_table_with_no_text():
    """A table whose cells contain only whitespace yields no elements at all."""
    whitespace_only_table = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )

    document = HTMLDocument.from_string(whitespace_only_table)

    assert document.elements == []
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table():
    """`html._is_bulleted_table()` is False for a table whose cells hold only whitespace."""
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "    <tr><td> </td><td> </td></tr>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )

    # -- locate the parsed `<table>` element inside the document tree --
    root = HTMLDocument.from_string(markup).document_tree
    assert root is not None
    table_elem = root.find(".//table")
    assert table_elem is not None

    assert html._is_bulleted_table(table_elem) is False
def test_it_provides_parseable_HTML_in_text_as_html():
    """`.text_as_html` parses with lxml's HTML parser and re-serializes to the same table markup.

    The only difference after the round-trip is the `<html><body>` wrapper added by the parser.
    """
    markup = (
        "<html>\n"
        "<body>\n"
        "  <table>\n"
        "    <thead>\n"
        "      <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        "    </thead>\n"
        "    <tbody>\n"
        "      <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        "      <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        "    </tbody>\n"
        "    <tfoot>\n"
        "      <tr><th>Dolor</th><td>Equis</td></tr>\n"
        "    </tfoot>\n"
        "  </table>\n"
        "</body>\n"
        "</html>"
    )
    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    expected_serialization = (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )

    [table] = HTMLDocument.from_string(markup).elements
    assert isinstance(table, HTMLTable)
    table_html = table.text_as_html
    assert table_html is not None

    # -- named `root` rather than `html` so the local does not shadow the `html` module --
    root = etree.fromstring(table_html, etree.HTMLParser())

    assert root is not None
    assert etree.tostring(root, encoding=str) == expected_serialization
# ------------------------------------------------------------------------------------------------
def test_parses_tags_correctly():
@ -196,7 +336,6 @@ def test_parse_nothing():
def test_read_with_existing_pages():
page = Page(number=0)
html_document = HTMLDocument.from_pages([page])
html_document._read()
assert html_document.pages == [page]
@ -547,7 +686,6 @@ def test_containers_with_text_are_processed():
</div>
</div>"""
html_document = HTMLDocument.from_string(html_str)
html_document._read()
assert html_document.elements == [
Text(text="Hi All,"),
@ -570,7 +708,6 @@ def test_html_grabs_bulleted_text_in_tags():
</body>
</html>"""
html_document = HTMLDocument.from_string(html_str)
html_document._read()
assert html_document.elements == [
ListItem(text="Happy Groundhog's day!"),
@ -590,7 +727,6 @@ def test_html_grabs_bulleted_text_in_paras():
</body>
</html>"""
html_document = HTMLDocument.from_string(html_str)
html_document._read()
assert html_document.elements == [
ListItem(text="Happy Groundhog's day!"),
@ -642,7 +778,6 @@ def test_html_grabs_bulleted_text_in_tables():
</body>
</html>"""
html_document = HTMLDocument.from_string(html_str)
html_document._read()
assert html_document.elements == [
ListItem(text="Happy Groundhog's day!"),
@ -727,3 +862,25 @@ def test_line_break_in_text_tag(tag):
doc = HTMLDocument.from_string(raw_html)
assert doc.elements[0].text == "Hello"
assert doc.elements[1].text == "World"
# -- module-level fixtures -----------------------------------------------------------------------
@pytest.fixture()
def sample_doc():
    """Two-page `HTMLDocument` assembled directly from pre-built elements.

    The first page (`Page(0)`) holds a title whose ancestor tags place it in a table cell,
    followed by narrative text. The second page (`Page(1)`) holds a header title, body narrative
    text, and a footer title.
    """
    # -- first page: title nested in table/tbody/tr/td, then a plain paragraph --
    first_page_elements = [
        HTMLTitle(
            "I'm a title in a table.",
            tag="p",
            ancestortags=("table", "tbody", "tr", "td"),
        ),
        HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=()),
    ]
    first_page = Page(0)
    first_page.elements = first_page_elements

    # -- second page: header, body, footer --
    second_page_elements = [
        HTMLTitle("I'm a header", tag="header", ancestortags=()),
        HTMLNarrativeText("Body text", tag="p", ancestortags=()),
        HTMLTitle("I'm a footer", tag="footer", ancestortags=()),
    ]
    second_page = Page(1)
    second_page.elements = second_page_elements

    return HTMLDocument.from_pages([first_page, second_page])

View File

@ -1,3 +1,5 @@
# pyright: reportPrivateUsage=false
import os
from pathlib import Path
@ -35,7 +37,7 @@ def test_read_xml(sample_document, tmpdir):
def test_xml_read_raises():
xml_document = XMLDocument()
with pytest.raises(NotImplementedError):
xml_document._read()
xml_document._parse_pages_from_element_tree()
def test_from_string(sample_document):

View File

@ -677,17 +677,17 @@ def test_partition_html_respects_detect_language_per_element():
@pytest.mark.parametrize(
("tag", "expected"),
[
("thead", ""),
("foo", ""),
("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
],
)
def test_partition_html_with_table_without_tbody(tag, expected):
table_html = f"""
<table>
<{tag}>
<tr><th>Header 1</th><th>Header 2</th></tr>
</{tag}>
</table>
"""
def test_partition_html_with_table_without_tbody(tag: str, expected: str):
table_html = (
f"<table>\n"
f" <{tag}>\n"
f" <tr><th>Header 1</th><th>Header 2</th></tr>\n"
f" </{tag}>\n"
f"</table>"
)
partitions = partition_html(text=table_html)
assert partitions[0].metadata.text_as_html == expected

View File

@ -106,7 +106,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -18,7 +18,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -17,7 +17,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -17,7 +17,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -17,7 +17,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Driver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Approver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Contributors</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Informed </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Objective </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Due date </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Key outcomes</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Status </td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Driver</td></tr><tr><td>Approver</td></tr><tr><td>Contributors</td></tr><tr><td>Informed</td></tr><tr><td>Objective</td></tr><tr><td>Due date</td></tr><tr><td>Key outcomes</td></tr><tr><td>Status</td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr></table>"
},
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
"type": "Table"
@ -84,7 +84,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Must have: </td></tr><br><tr><td>Nice to have:</td></tr><br><tr><td>Not in scope:</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Must have:</td></tr><tr><td>Nice to have:</td></tr><tr><td>Not in scope:</td></tr></table>"
},
"text": "Must have: Nice to have: Not in scope:",
"type": "Table"
@ -327,7 +327,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr></table>"
},
"text": "Milestone Owner Deadline Status",
"type": "Table"

View File

@ -171,7 +171,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr></table>"
},
"text": "Time Item Presenter Notes",
"type": "Table"

View File

@ -299,7 +299,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Notes </td></tr><br><tr><td>Important Links</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Notes</td></tr><tr><td>Important Links</td></tr></table>"
},
"text": "Notes Important Links",
"type": "Table"

View File

@ -767,7 +767,7 @@
"fra"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr></table>"
},
"text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2",
"type": "Table"

View File

@ -767,7 +767,7 @@
"fra"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc2 Table: Column 1 Row 0</td><td>Testdoc2 Table: Column 2 Row 0</td><td>Testdoc2 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 1</td><td>Testdoc2 Table: Column 2 Row 1</td><td>Testdoc2 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 2</td><td>Testdoc2 Table: Column 2 Row 2</td><td>Testdoc2 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>Testdoc2 Table: Column 1 Row 0</td><td>Testdoc2 Table: Column 2 Row 0</td><td>Testdoc2 Table: Column 3 Row 0</td></tr><tr><td>Testdoc2 Table: Column 1 Row 1</td><td>Testdoc2 Table: Column 2 Row 1</td><td>Testdoc2 Table: Column 3 Row 1</td></tr><tr><td>Testdoc2 Table: Column 1 Row 2</td><td>Testdoc2 Table: Column 2 Row 2</td><td>Testdoc2 Table: Column 3 Row 2</td></tr></table>"
},
"text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2",
"type": "Table"

View File

@ -15,7 +15,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -15,7 +15,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -17,7 +17,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -17,7 +17,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -17,7 +17,7 @@
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"type": "Table"

View File

@ -3,7 +3,7 @@
from __future__ import annotations
import sys
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast
if sys.version_info < (3, 8):
from typing_extensions import Final
@ -11,7 +11,6 @@ else:
from typing import Final
from lxml import etree
from tabulate import tabulate
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.documents.base import Page
@ -36,6 +35,7 @@ from unstructured.partition.text_type import (
is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import htmlify_matrix_of_cell_texts
TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
@ -138,7 +138,7 @@ class HTMLDocument(XMLDocument):
self.assembled_articles = assemble_articles
super().__init__(stylesheet=stylesheet, parser=parser)
def _read(self) -> List[Page]:
def _parse_pages_from_element_tree(self) -> List[Page]:
"""Parse HTML elements into pages.
A *page* is a subsequence of the document-elements parsed from the HTML document
@ -209,9 +209,10 @@ class HTMLDocument(XMLDocument):
)
elif tag_elem.tag in TABLE_TAGS:
element = _process_leaf_table_item(tag_elem)
element = _parse_HTMLTable_from_table_elem(tag_elem)
if element is not None:
page.elements.append(element)
if element or tag_elem.tag == "table":
descendanttag_elems = tuple(tag_elem.iterdescendants())
elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0:
@ -303,6 +304,60 @@ def _get_links_from_tag(tag_elem: etree._Element) -> List[Link]:
return links
def _is_bulleted_table(table_elem: etree._Element) -> bool:
    """True when `table_elem` is a `<table>` whose every text-bearing row is bulleted text.

    Rows with no text are ignored. At least one row must contain text; a table where no row has
    any text is not a bulleted table. Any element other than `<table>` produces `False`.
    """
    if table_elem.tag != "table":
        return False

    # -- text of every `<tr>` at any depth below the table, in document order --
    row_texts = [_construct_text(row) for row in table_elem.findall(".//tr")]

    # -- only rows that actually have text take part in the decision --
    non_empty_texts = [row_text for row_text in row_texts if row_text]
    if not non_empty_texts:
        return False

    # -- a single non-bulleted row disqualifies the whole table --
    return all(is_bulleted_text(row_text) for row_text in non_empty_texts)
def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Element]:
    """Form an `HTMLTable` element from `table_elem`, or `None` when there is nothing to emit.

    `None` is returned when `table_elem` is not a `<table>` element, when it has no rows, or when
    its rows contain no text other than whitespace.
    """
    if table_elem.tag != "table":
        return None

    # -- rows are the `<tr>` elements that are direct children of the table or of its `<thead>`,
    # -- `<tbody>`, or `<tfoot>` children --
    row_elems = cast(
        List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
    )
    if len(row_elems) == 0:
        return None

    # -- each text fragment produced by `.itertext()` on a row becomes one cell of that row. Text
    # -- inside a nested table is reached by `.itertext()` too, so it is folded into the row that
    # -- contains the nested table rather than being parsed as a separate table.
    cell_texts_by_row: List[List[str]] = []
    for row_elem in row_elems:
        cell_texts_by_row.append([str(fragment) for fragment in row_elem.itertext()])

    flattened_text = " ".join(" ".join(cell_texts) for cell_texts in cell_texts_by_row).strip()
    # -- a table with no text is not emitted --
    if not flattened_text:
        return None

    # -- ancestor tags ordered outermost-first --
    ancestor_tags = tuple(reversed([ancestor.tag for ancestor in table_elem.iterancestors()]))

    return HTMLTable(
        text=flattened_text,
        text_as_html=htmlify_matrix_of_cell_texts(cell_texts_by_row),
        tag=table_elem.tag,
        ancestortags=ancestor_tags,
    )
def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> List[Dict[str, str]]:
"""Emphasized text within and below `tag_element`.
@ -524,54 +579,6 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool
return False
def _process_leaf_table_item(tbl_elem: etree._Element) -> Optional[Element]:
    """Form `HTMLTable` element from `tbl_elem`, or `None` when there is nothing to emit.

    `None` is returned when:

    - `tbl_elem.tag` is not in `TABLE_TAGS`,
    - `tbl_elem` has a `<table>` element as a direct child,
    - no `<tr>` rows are found, or
    - the rows contain no text other than whitespace.

    An empty (and therefore unparseable) table element is never returned.
    """
    if tbl_elem.tag not in TABLE_TAGS:
        return None

    # -- NOTE this only detects a `<table>` that is a _direct_ child of `tbl_elem`; a table nested
    # -- inside a cell is not detected here. Its text is still gathered by `.itertext()` below and
    # -- so ends up in the row that contains it.
    if tbl_elem.findall("table"):
        return None

    # -- rows can be direct children or live one level down in `<thead>`, `<tbody>`, or `<tfoot>`;
    # -- looking only at direct children and `<tbody>` silently dropped header and footer cells.
    rows = cast(
        List[etree._Element], tbl_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
    )
    if not rows:
        return None

    # -- each text fragment in a row becomes one cell of that row --
    table_data = [[str(text) for text in row.itertext()] for row in rows]
    table_text = " ".join(" ".join(row) for row in table_data).strip()

    # -- do not emit a table that has no text --
    if not table_text:
        return None

    return HTMLTable(
        text=table_text,
        # -- build the HTML directly from the cell texts; replacing every "\n" in pretty-printed
        # -- HTML with "<br>" put `<br>` elements between `<table>`, `<tbody>`, and `<tr>`, where
        # -- they are not valid.
        text_as_html=htmlify_matrix_of_cell_texts(table_data),
        tag=tbl_elem.tag,
        ancestortags=tuple(el.tag for el in tbl_elem.iterancestors())[::-1],
    )
def _process_list_item(
tag_elem: etree._Element,
max_predecessor_len: int = 5,
@ -650,20 +657,6 @@ def _bulleted_text_from_table(table: etree._Element) -> List[Element]:
return bulleted_text
def _is_bulleted_table(table_elem: etree._Element) -> bool:
    """True when `table_elem` is a `<table>` element in which all row text is bulleted text.

    Rows containing no text are skipped, but at least one row with bulleted text must be present:
    a table with no text in any row is not a bulleted table. Any element other than `<table>`
    produces `False`.
    """
    if table_elem.tag != "table":
        return False

    found_bulleted_row = False
    for row in table_elem.findall(".//tr"):
        text = _construct_text(row)
        # -- a row with no text neither qualifies nor disqualifies the table --
        if not text:
            continue
        if not is_bulleted_text(text):
            return False
        found_bulleted_row = True

    # -- without this flag an empty table would fall through and be reported as bulleted --
    return found_bulleted_row
def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: List[etree._Element]) -> bool:
"""True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.

View File

@ -1,6 +1,7 @@
from typing import List, Optional, Union
from typing import Any, List, Optional, Union
from lxml import etree
from typing_extensions import Self
from unstructured.documents.base import Document, Page
from unstructured.file_utils.encoding import read_txt_file
@ -43,17 +44,17 @@ class XMLDocument(Document):
self.document_tree = None
super().__init__()
def _read(self):
def _parse_pages_from_element_tree(self) -> List[Page]:
raise NotImplementedError
@property
def pages(self) -> List[Page]:
"""Gets all elements from pages in sequential order."""
if self._pages is None:
self._pages = self._read()
self._pages = self._parse_pages_from_element_tree()
return super().pages
def _read_xml(self, content):
def _read_xml(self, content: str):
"""Reads in an XML file and converts it to an lxml element tree object."""
# NOTE(robinson) - without the carriage return at the beginning, you get
# output that looks like the following when you run partition_pdf
@ -98,8 +99,8 @@ class XMLDocument(Document):
text: str,
parser: VALID_PARSERS = None,
stylesheet: Optional[str] = None,
**kwargs,
):
**kwargs: Any,
) -> Self:
"""Supports reading in an XML file as a raw string rather than as a file."""
logger.info("Reading document from string ...")
doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
@ -109,12 +110,12 @@ class XMLDocument(Document):
@classmethod
def from_file(
cls,
filename,
filename: str,
parser: VALID_PARSERS = None,
stylesheet: Optional[str] = None,
encoding: Optional[str] = None,
**kwargs,
):
**kwargs: Any,
) -> Self:
_, content = read_txt_file(filename=filename, encoding=encoding)
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)

View File

@ -1,4 +1,5 @@
import functools
import html
import importlib
import json
import os
@ -17,6 +18,7 @@ from typing import (
Iterator,
List,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
@ -37,6 +39,36 @@ _T = TypeVar("_T")
_P = ParamSpec("_P")
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
    r"""Form an HTML table from "rows" and "columns" of `matrix`.

    `matrix` is a sequence of rows, each row being a sequence of cell-text strings. The return
    value is a single-line `<table>` string, or the empty string when `matrix` is empty.

    Character overhead is minimized:
    - No whitespace padding is added for human readability
    - No newlines ("\n") are added
    - No `<thead>`, `<tbody>`, or `<tfoot>` elements are used; we can't tell where those might be
      semantically appropriate anyway so at best they would consume unnecessary space and at worst
      would be misleading.

    Within a cell, text is HTML-escaped, leading and trailing whitespace (including line-feeds) is
    removed, and each remaining line-feed is rendered as a `<br/>` element. A row with no cells
    produces no `<tr>` element.
    """

    def iter_trs(rows_of_cell_strs: Sequence[Sequence[str]]) -> Iterator[str]:
        for row_cell_strs in rows_of_cell_strs:
            # -- suppress emission of rows with no cells --
            if not row_cell_strs:
                continue
            yield f"<tr>{''.join(iter_tds(row_cell_strs))}</tr>"

    def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
        for s in row_cell_strs:
            # -- strip leading and trailing whitespace _before_ converting line-feeds, otherwise
            # -- a leading or trailing "\n" survives as `<br/>` padding inside the cell --
            s = s.strip()
            # -- take care of things like '<' and '>' in the text --
            s = html.escape(s)
            # -- substitute <br/> elements for the remaining (interior) line-feeds --
            s = "<br/>".join(s.split("\n"))
            yield f"<td>{s}</td>"

    return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
class lazyproperty(Generic[_T]):
"""Decorator like @property, but evaluated only on first access.