mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
fix: assorted partition_html() bugs (#2113)
Addresses a cluster of HTML-related bugs: - empty table is identified as bulleted-table - `partition_html()` emits empty (no text) tables (#1928) - `.text_as_html` contains inappropriate `<br>` elements in invalid locations. - cells enclosed in `<thead>` and `<tfoot>` elements are dropped (#1928) - `.text_as_html` contains whitespace padding Each of these is addressed in a separate commit below. Fixes #1928. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com> Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
This commit is contained in:
parent
13a23deba6
commit
ee9be2a3b2
@ -22,6 +22,11 @@
|
||||
* **Fix some pdfs returning `KeyError: 'N'`** Certain pdfs were throwing this error when being opened by pdfminer. Added a wrapper function for pdfminer that allows these documents to be partitioned.
|
||||
* **Fix mis-splits on `Table` chunks.** Remedies repeated appearance of full `.text_as_html` on metadata of each `TableChunk` split from a `Table` element too large to fit in the chunking window.
|
||||
* **Import tables_agent from inference** so that we don't have to initialize a global table agent in unstructured OCR again
|
||||
* **Fix empty table is identified as bulleted-table.** A table with no text content was mistakenly identified as a bulleted-table and processed by the wrong branch of the initial HTML partitioner.
|
||||
* **Fix partition_html() emits empty (no text) tables.** A table with cells nested below a `<thead>` or `<tfoot>` element was emitted as a table element having no text and unparseable HTML in `element.metadata.text_as_html`. Do not emit empty tables to the element stream.
|
||||
* **Fix HTML `element.metadata.text_as_html` contains spurious `<br>` elements in invalid locations.** The HTML generated for the `text_as_html` metadata for HTML tables contained `<br>` elements in invalid locations, such as between `<table>` and `<tr>`. Change the HTML generator such that these do not appear.
|
||||
* **Fix HTML table cells enclosed in <thead> and <tfoot> elements are dropped.** HTML table cells nested in a `<thead>` or `<tfoot>` element were not detected and the text in those cells was omitted from the table element text and `.text_as_html`. Detect table rows regardless of the semantic tag they may be nested in.
|
||||
* **Remove whitespace padding from `.text_as_html`.** `tabulate` inserts padding spaces to achieve visual alignment of columns in HTML tables it generates. Add our own HTML generator to do this simple job and omit that padding as well as newlines ("\n") used for human readability.
|
||||
* **Fix local connector with absolute input path** When passed an absolute filepath for the input document path, the local connector incorrectly wrote the output file to the input file directory. With this fix, the output in this case is written to `output-dir/input-filename.json`.
|
||||
|
||||
## 0.10.30
|
||||
|
||||
@ -25,6 +25,7 @@ from unstructured.documents.html import (
|
||||
TEXT_TAGS,
|
||||
HTMLDocument,
|
||||
HTMLNarrativeText,
|
||||
HTMLTable,
|
||||
HTMLTitle,
|
||||
TagsMixin,
|
||||
)
|
||||
@ -32,21 +33,26 @@ from unstructured.documents.html import (
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
TAGS = (
|
||||
"<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
|
||||
"<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
|
||||
"<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
|
||||
"<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5><h6>"
|
||||
"<head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link><main>"
|
||||
"<map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option><output><p>"
|
||||
"<param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section><select><small>"
|
||||
"<source><span><strike><strong><style><sub><summary><sup><table><tbody><td><template>"
|
||||
"<textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
|
||||
(
|
||||
"<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
|
||||
"<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
|
||||
"<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
|
||||
"<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5>"
|
||||
"<h6><head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link>"
|
||||
"<main><map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option>"
|
||||
"<output><p><param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section>"
|
||||
"<select><small><source><span><strike><strong><style><sub><summary><sup><table><tbody><td>"
|
||||
"<template><textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
|
||||
)
|
||||
.replace(">", "")
|
||||
.split("<")[1:]
|
||||
)
|
||||
|
||||
TAGS = TAGS.replace(">", "").split("<")[1:]
|
||||
|
||||
VOID_TAGS = "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
|
||||
VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:]
|
||||
VOID_TAGS = (
|
||||
("<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>")
|
||||
.replace(">", "")
|
||||
.split("<")[1:]
|
||||
)
|
||||
|
||||
INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
|
||||
EXCLUDED_TAGS = [
|
||||
@ -56,23 +62,157 @@ EXCLUDED_TAGS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def sample_doc():
|
||||
table_element = HTMLTitle(
|
||||
"I'm a title in a table.",
|
||||
tag="p",
|
||||
ancestortags=("table", "tbody", "tr", "td"),
|
||||
# -- table-extraction behaviors ------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_it_can_parse_a_bare_bones_table_to_an_HTMLTable_element():
|
||||
"""Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements."""
|
||||
html_str = (
|
||||
"<html>\n"
|
||||
"<body>\n"
|
||||
" <table>\n"
|
||||
" <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
|
||||
" <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
|
||||
" </table>\n"
|
||||
"</body>\n"
|
||||
"</html>"
|
||||
)
|
||||
narrative = HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=())
|
||||
page1 = Page(0)
|
||||
page1.elements = [table_element, narrative]
|
||||
header = HTMLTitle("I'm a header", tag="header", ancestortags=())
|
||||
body = HTMLNarrativeText("Body text", tag="p", ancestortags=())
|
||||
footer = HTMLTitle("I'm a footer", tag="footer", ancestortags=())
|
||||
page2 = Page(1)
|
||||
page2.elements = [header, body, footer]
|
||||
doc = HTMLDocument.from_pages([page1, page2])
|
||||
return doc
|
||||
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
|
||||
# -- there is exactly one element and it's an HTMLTable instance --
|
||||
(element,) = html_document.elements
|
||||
assert isinstance(element, HTMLTable)
|
||||
# -- table text is joined into a single string; no row or cell boundaries are represented --
|
||||
assert element.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
|
||||
# -- An HTML representation is also available that is longer but represents table structure.
|
||||
# -- Note it carries no padding whitespace; a newline inside a cell is rendered as `<br/>`.
|
||||
assert element.text_as_html == (
|
||||
"<table>"
|
||||
"<tr><td>Lorem</td><td>Ipsum</td></tr>"
|
||||
"<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
|
||||
"</table>"
|
||||
)
|
||||
|
||||
|
||||
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements():
    """Rows nested in `<thead>`, `<tbody>`, and `<tfoot>` all appear in `.text_as_html`.

    The expected HTML is flat: every row sits directly under `<table>` in document order, the
    section wrappers are not reproduced, and `<th>` cells come out as `<td>` cells.
    """
    source_html = "".join(
        [
            "<html>\n",
            "<body>\n",
            " <table>\n",
            " <thead>\n",
            " <tr><th>Lorem</th><th>Ipsum</th></tr>\n",
            " </thead>\n",
            " <tbody>\n",
            " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n",
            " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n",
            " </tbody>\n",
            " <tfoot>\n",
            " <tr><th>Dolor</th><td>Equis</td></tr>\n",
            " </tfoot>\n",
            " </table>\n",
            "</body>\n",
            "</html>",
        ]
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )

    document = HTMLDocument.from_string(source_html)

    # -- unpacking fails unless the document holds exactly one element --
    [table_element] = document.elements
    assert isinstance(table_element, HTMLTable)
    assert table_element.text_as_html == expected_html
|
||||
|
||||
|
||||
def test_it_does_not_emit_an_HTMLTable_element_for_a_table_with_no_text():
    """A table whose cells contain only whitespace contributes no elements to the document."""
    source_html = "".join(
        [
            "<html>\n",
            "<body>\n",
            " <table>\n",
            " <tr><td> </td><td> </td></tr>\n",
            " <tr><td> </td><td> </td></tr>\n",
            " </table>\n",
            "</body>\n",
            "</html>",
        ]
    )

    document = HTMLDocument.from_string(source_html)
    elements = document.elements

    assert elements == []
|
||||
|
||||
|
||||
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table():
    """`_is_bulleted_table()` is False for a table whose cells contain only whitespace."""
    source_html = "".join(
        [
            "<html>\n",
            "<body>\n",
            " <table>\n",
            " <tr><td> </td><td> </td></tr>\n",
            " <tr><td> </td><td> </td></tr>\n",
            " </table>\n",
            "</body>\n",
            "</html>",
        ]
    )
    document = HTMLDocument.from_string(source_html)

    # -- locate the `<table>` element in the parsed tree; both lookups must succeed --
    root = document.document_tree
    assert root is not None
    table_elem = root.find(".//table")
    assert table_elem is not None

    assert html._is_bulleted_table(table_elem) is False
|
||||
|
||||
|
||||
def test_it_provides_parseable_HTML_in_text_as_html():
    """`.text_as_html` parses with lxml's HTML parser and serializes to the expected markup."""
    source_html = "".join(
        [
            "<html>\n",
            "<body>\n",
            " <table>\n",
            " <thead>\n",
            " <tr><th>Lorem</th><th>Ipsum</th></tr>\n",
            " </thead>\n",
            " <tbody>\n",
            " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n",
            " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n",
            " </tbody>\n",
            " <tfoot>\n",
            " <tr><th>Dolor</th><td>Equis</td></tr>\n",
            " </tfoot>\n",
            " </table>\n",
            "</body>\n",
            "</html>",
        ]
    )
    document = HTMLDocument.from_string(source_html)

    # -- unpacking fails unless the document holds exactly one element --
    [table_element] = document.elements
    assert isinstance(table_element, HTMLTable)
    table_html = table_element.text_as_html
    assert table_html is not None

    root = etree.fromstring(table_html, etree.HTMLParser())
    assert root is not None

    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    expected_serialization = (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )
    assert etree.tostring(root, encoding=str) == expected_serialization
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_parses_tags_correctly():
|
||||
@ -196,7 +336,6 @@ def test_parse_nothing():
|
||||
def test_read_with_existing_pages():
|
||||
page = Page(number=0)
|
||||
html_document = HTMLDocument.from_pages([page])
|
||||
html_document._read()
|
||||
assert html_document.pages == [page]
|
||||
|
||||
|
||||
@ -547,7 +686,6 @@ def test_containers_with_text_are_processed():
|
||||
</div>
|
||||
</div>"""
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
html_document._read()
|
||||
|
||||
assert html_document.elements == [
|
||||
Text(text="Hi All,"),
|
||||
@ -570,7 +708,6 @@ def test_html_grabs_bulleted_text_in_tags():
|
||||
</body>
|
||||
</html>"""
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
html_document._read()
|
||||
|
||||
assert html_document.elements == [
|
||||
ListItem(text="Happy Groundhog's day!"),
|
||||
@ -590,7 +727,6 @@ def test_html_grabs_bulleted_text_in_paras():
|
||||
</body>
|
||||
</html>"""
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
html_document._read()
|
||||
|
||||
assert html_document.elements == [
|
||||
ListItem(text="Happy Groundhog's day!"),
|
||||
@ -642,7 +778,6 @@ def test_html_grabs_bulleted_text_in_tables():
|
||||
</body>
|
||||
</html>"""
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
html_document._read()
|
||||
|
||||
assert html_document.elements == [
|
||||
ListItem(text="Happy Groundhog's day!"),
|
||||
@ -727,3 +862,25 @@ def test_line_break_in_text_tag(tag):
|
||||
doc = HTMLDocument.from_string(raw_html)
|
||||
assert doc.elements[0].text == "Hello"
|
||||
assert doc.elements[1].text == "World"
|
||||
|
||||
|
||||
# -- module-level fixtures -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
def sample_doc():
    """Build a two-page `HTMLDocument` directly from hand-made elements.

    Page 0 holds a title whose ancestor tags place it inside a table cell, followed by a
    narrative-text paragraph. Page 1 holds a header title, a body paragraph, and a footer title.
    """
    first_page = Page(0)
    first_page.elements = [
        HTMLTitle(
            "I'm a title in a table.",
            tag="p",
            ancestortags=("table", "tbody", "tr", "td"),
        ),
        HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=()),
    ]

    second_page = Page(1)
    second_page.elements = [
        HTMLTitle("I'm a header", tag="header", ancestortags=()),
        HTMLNarrativeText("Body text", tag="p", ancestortags=()),
        HTMLTitle("I'm a footer", tag="footer", ancestortags=()),
    ]

    return HTMLDocument.from_pages([first_page, second_page])
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -35,7 +37,7 @@ def test_read_xml(sample_document, tmpdir):
|
||||
def test_xml_read_raises():
|
||||
xml_document = XMLDocument()
|
||||
with pytest.raises(NotImplementedError):
|
||||
xml_document._read()
|
||||
xml_document._parse_pages_from_element_tree()
|
||||
|
||||
|
||||
def test_from_string(sample_document):
|
||||
|
||||
@ -677,17 +677,17 @@ def test_partition_html_respects_detect_language_per_element():
|
||||
@pytest.mark.parametrize(
|
||||
("tag", "expected"),
|
||||
[
|
||||
("thead", ""),
|
||||
("foo", ""),
|
||||
("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
|
||||
("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
|
||||
],
|
||||
)
|
||||
def test_partition_html_with_table_without_tbody(tag, expected):
|
||||
table_html = f"""
|
||||
<table>
|
||||
<{tag}>
|
||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
||||
</{tag}>
|
||||
</table>
|
||||
"""
|
||||
def test_partition_html_with_table_without_tbody(tag: str, expected: str):
|
||||
table_html = (
|
||||
f"<table>\n"
|
||||
f" <{tag}>\n"
|
||||
f" <tr><th>Header 1</th><th>Header 2</th></tr>\n"
|
||||
f" </{tag}>\n"
|
||||
f"</table>"
|
||||
)
|
||||
partitions = partition_html(text=table_html)
|
||||
assert partitions[0].metadata.text_as_html == expected
|
||||
|
||||
@ -106,7 +106,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Driver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Approver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Contributors</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Informed </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Objective </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Due date </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Key outcomes</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Status </td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Driver</td></tr><tr><td>Approver</td></tr><tr><td>Contributors</td></tr><tr><td>Informed</td></tr><tr><td>Objective</td></tr><tr><td>Due date</td></tr><tr><td>Key outcomes</td></tr><tr><td>Status</td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr></table>"
|
||||
},
|
||||
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
|
||||
"type": "Table"
|
||||
@ -84,7 +84,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Must have: </td></tr><br><tr><td>Nice to have:</td></tr><br><tr><td>Not in scope:</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Must have:</td></tr><tr><td>Nice to have:</td></tr><tr><td>Not in scope:</td></tr></table>"
|
||||
},
|
||||
"text": "Must have: Nice to have: Not in scope:",
|
||||
"type": "Table"
|
||||
@ -327,7 +327,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr></table>"
|
||||
},
|
||||
"text": "Milestone Owner Deadline Status",
|
||||
"type": "Table"
|
||||
|
||||
@ -171,7 +171,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr></table>"
|
||||
},
|
||||
"text": "Time Item Presenter Notes",
|
||||
"type": "Table"
|
||||
|
||||
@ -299,7 +299,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Notes </td></tr><br><tr><td>Important Links</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Notes</td></tr><tr><td>Important Links</td></tr></table>"
|
||||
},
|
||||
"text": "Notes Important Links",
|
||||
"type": "Table"
|
||||
|
||||
@ -767,7 +767,7 @@
|
||||
"fra"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr></table>"
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2",
|
||||
"type": "Table"
|
||||
|
||||
@ -767,7 +767,7 @@
|
||||
"fra"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc2 Table: Column 1 Row 0</td><td>Testdoc2 Table: Column 2 Row 0</td><td>Testdoc2 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 1</td><td>Testdoc2 Table: Column 2 Row 1</td><td>Testdoc2 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 2</td><td>Testdoc2 Table: Column 2 Row 2</td><td>Testdoc2 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>Testdoc2 Table: Column 1 Row 0</td><td>Testdoc2 Table: Column 2 Row 0</td><td>Testdoc2 Table: Column 3 Row 0</td></tr><tr><td>Testdoc2 Table: Column 1 Row 1</td><td>Testdoc2 Table: Column 2 Row 1</td><td>Testdoc2 Table: Column 3 Row 1</td></tr><tr><td>Testdoc2 Table: Column 1 Row 2</td><td>Testdoc2 Table: Column 2 Row 2</td><td>Testdoc2 Table: Column 3 Row 2</td></tr></table>"
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2",
|
||||
"type": "Table"
|
||||
|
||||
@ -15,7 +15,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -15,7 +15,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
"text_as_html": "<table><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you'll notice it's full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
|
||||
},
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
|
||||
"type": "Table"
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
|
||||
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
@ -11,7 +11,6 @@ else:
|
||||
from typing import Final
|
||||
|
||||
from lxml import etree
|
||||
from tabulate import tabulate
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
|
||||
from unstructured.documents.base import Page
|
||||
@ -36,6 +35,7 @@ from unstructured.partition.text_type import (
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
)
|
||||
from unstructured.utils import htmlify_matrix_of_cell_texts
|
||||
|
||||
TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
|
||||
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
|
||||
@ -138,7 +138,7 @@ class HTMLDocument(XMLDocument):
|
||||
self.assembled_articles = assemble_articles
|
||||
super().__init__(stylesheet=stylesheet, parser=parser)
|
||||
|
||||
def _read(self) -> List[Page]:
|
||||
def _parse_pages_from_element_tree(self) -> List[Page]:
|
||||
"""Parse HTML elements into pages.
|
||||
|
||||
A *page* is a subsequence of the document-elements parsed from the HTML document
|
||||
@ -209,9 +209,10 @@ class HTMLDocument(XMLDocument):
|
||||
)
|
||||
|
||||
elif tag_elem.tag in TABLE_TAGS:
|
||||
element = _process_leaf_table_item(tag_elem)
|
||||
element = _parse_HTMLTable_from_table_elem(tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
if element or tag_elem.tag == "table":
|
||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||
|
||||
elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0:
|
||||
@ -303,6 +304,60 @@ def _get_links_from_tag(tag_elem: etree._Element) -> List[Link]:
|
||||
return links
|
||||
|
||||
|
||||
def _is_bulleted_table(table_elem: etree._Element) -> bool:
    """True when `table_elem` is a `<table>` in which every row having text is bulleted text.

    Rows that produce no text are ignored. At least one row must have text; a table in which no
    row has text is not a bulleted table. An element other than `<table>` is never a bulleted
    table.
    """
    if table_elem.tag != "table":
        return False

    # -- `.//tr` selects rows at any depth below the table; rows without text are dropped --
    nonempty_row_texts = [
        text for text in map(_construct_text, table_elem.findall(".//tr")) if text
    ]

    # -- a table with no text is not a bulleted table --
    if not nonempty_row_texts:
        return False

    # -- every remaining row must be bulleted text --
    return all(is_bulleted_text(text) for text in nonempty_row_texts)
|
||||
|
||||
|
||||
def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Element]:
    """Form an `HTMLTable` element from `table_elem`, or `None` when there is nothing to emit.

    `None` is returned when `table_elem` is not a `<table>` element, when no rows are found, or
    when the rows contain nothing but whitespace.
    """
    if table_elem.tag != "table":
        return None

    # -- Rows are selected only as direct children of the table or of its <thead>, <tbody>, or
    # -- <tfoot>. A table nested inside a cell is therefore not parsed as separate rows; its text
    # -- is picked up by `.itertext()` on the enclosing row (recursively for deeper nesting).
    row_elems = cast(
        List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
    )
    if not row_elems:
        return None

    # -- each text node found below a row becomes one entry of that row --
    rows_of_texts = [[str(text) for text in row_elem.itertext()] for row_elem in row_elems]

    joined_text = " ".join(" ".join(texts) for texts in rows_of_texts).strip()
    if joined_text == "":
        return None

    ancestor_tags = tuple(reversed([el.tag for el in table_elem.iterancestors()]))

    return HTMLTable(
        text=joined_text,
        text_as_html=htmlify_matrix_of_cell_texts(rows_of_texts),
        tag=table_elem.tag,
        ancestortags=ancestor_tags,
    )
|
||||
|
||||
|
||||
def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> List[Dict[str, str]]:
|
||||
"""Emphasized text within and below `tag_element`.
|
||||
|
||||
@ -524,54 +579,6 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool
|
||||
return False
|
||||
|
||||
|
||||
def _process_leaf_table_item(tbl_elem: etree._Element) -> Optional[Element]:
    """Form `HTMLTable` element from `tbl_elem`, or `None` when there is no table text to emit.

    `None` is returned when:
    - `tbl_elem.tag` is not one of `TABLE_TAGS`
    - `tbl_elem` has a `<table>` element as a direct child
    - no rows are found, or the rows found contain nothing but whitespace

    An empty table is never emitted; a table element with no text and empty `text_as_html` cannot
    be parsed downstream.
    """
    if tbl_elem.tag not in TABLE_TAGS:
        return None

    # -- NOTE this only detects a `<table>` that is a direct child of `tbl_elem`. A table nested
    # -- in a cell is not detected here; its text is parsed into the text for the row containing
    # -- it (recursively for deeper nesting).
    if tbl_elem.findall("table"):
        return None

    # -- collect rows in document order, whether they are direct children or are enclosed in a
    # -- <thead>, <tbody>, or <tfoot> element --
    rows: List[etree._Element] = []
    for child in tbl_elem:
        if child.tag == "tr":
            rows.append(child)
        elif child.tag in ("thead", "tbody", "tfoot"):
            rows.extend(child.findall("tr"))

    if not rows:
        return None

    table_data = [[str(text) for text in row.itertext()] for row in rows]
    table_text = " ".join(" ".join(row) for row in table_data).strip()

    # -- do not emit a table that has no text --
    if not table_text:
        return None

    html_table = tabulate(table_data, tablefmt="html")
    # -- the generated HTML places a newline between adjacent tags (e.g. between `<table>` and
    # -- `<tbody>`); drop those so only newlines within cell text become `<br>` elements --
    html_table = html_table.replace(">\n<", "><").replace("\n", "<br>")

    return HTMLTable(
        text=table_text,
        text_as_html=html_table,
        tag=tbl_elem.tag,
        ancestortags=tuple(el.tag for el in tbl_elem.iterancestors())[::-1],
    )
|
||||
|
||||
|
||||
def _process_list_item(
|
||||
tag_elem: etree._Element,
|
||||
max_predecessor_len: int = 5,
|
||||
@ -650,20 +657,6 @@ def _bulleted_text_from_table(table: etree._Element) -> List[Element]:
|
||||
return bulleted_text
|
||||
|
||||
|
||||
def _is_bulleted_table(table_elem: etree._Element) -> bool:
    """True when `<table>` element `table_elem` contains bulleted text and no other text.

    Rows that produce no text are ignored, but at least one row must contain bulleted text; a
    table with no text in any row is not a bulleted table. An element other than `<table>` is
    never a bulleted table.
    """
    if table_elem.tag != "table":
        return False

    found_bulleted_row = False
    for row in table_elem.findall(".//tr"):
        text = _construct_text(row)
        # -- a row with no text neither qualifies nor disqualifies the table --
        if not text:
            continue
        if not is_bulleted_text(text):
            return False
        found_bulleted_row = True

    # -- False for a table with no text at all, which would otherwise pass vacuously --
    return found_bulleted_row
|
||||
|
||||
|
||||
def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: List[etree._Element]) -> bool:
|
||||
"""True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
from typing import List, Optional, Union
|
||||
from typing import Any, List, Optional, Union
|
||||
|
||||
from lxml import etree
|
||||
from typing_extensions import Self
|
||||
|
||||
from unstructured.documents.base import Document, Page
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
@ -43,17 +44,17 @@ class XMLDocument(Document):
|
||||
self.document_tree = None
|
||||
super().__init__()
|
||||
|
||||
def _read(self):
|
||||
def _parse_pages_from_element_tree(self) -> List[Page]:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def pages(self) -> List[Page]:
|
||||
"""Gets all elements from pages in sequential order."""
|
||||
if self._pages is None:
|
||||
self._pages = self._read()
|
||||
self._pages = self._parse_pages_from_element_tree()
|
||||
return super().pages
|
||||
|
||||
def _read_xml(self, content):
|
||||
def _read_xml(self, content: str):
|
||||
"""Reads in an XML file and converts it to an lxml element tree object."""
|
||||
# NOTE(robinson) - without the carriage return at the beginning, you get
|
||||
# output that looks like the following when you run partition_pdf
|
||||
@ -98,8 +99,8 @@ class XMLDocument(Document):
|
||||
text: str,
|
||||
parser: VALID_PARSERS = None,
|
||||
stylesheet: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
**kwargs: Any,
|
||||
) -> Self:
|
||||
"""Supports reading in an XML file as a raw string rather than as a file."""
|
||||
logger.info("Reading document from string ...")
|
||||
doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
|
||||
@ -109,12 +110,12 @@ class XMLDocument(Document):
|
||||
@classmethod
|
||||
def from_file(
|
||||
cls,
|
||||
filename,
|
||||
filename: str,
|
||||
parser: VALID_PARSERS = None,
|
||||
stylesheet: Optional[str] = None,
|
||||
encoding: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
**kwargs: Any,
|
||||
) -> Self:
|
||||
_, content = read_txt_file(filename=filename, encoding=encoding)
|
||||
|
||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import functools
|
||||
import html
|
||||
import importlib
|
||||
import json
|
||||
import os
|
||||
@ -17,6 +18,7 @@ from typing import (
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
@ -37,6 +39,36 @@ _T = TypeVar("_T")
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
    r"""Form an HTML table from "rows" and "columns" of `matrix`.

    Each inner sequence of `matrix` becomes a `<tr>` element and each string in it becomes a
    `<td>` element. Returns the empty string when `matrix` is empty.

    Character overhead is minimized:
    - No whitespace padding is added for human readability
    - No newlines ("\n") are added
    - No `<thead>`, `<tbody>`, or `<tfoot>` elements are used; we can't tell where those might be
      semantically appropriate anyway so at best they would consume unnecessary space and at worst
      would be misleading.

    Cell text is HTML-escaped, each line-feed in it is replaced with a `<br/>` element, and
    leading and trailing whitespace is stripped. A row with no cells is not emitted.
    """

    def td(cell_text: str) -> str:
        # -- escape first to take care of things like '<' and '>' in the text; the `<br/>`
        # -- elements substituted for line-feeds afterward must not themselves be escaped --
        markup = html.escape(cell_text).replace("\n", "<br/>")
        # -- strip leading and trailing whitespace, wrap it up and go --
        return f"<td>{markup.strip()}</td>"

    def tr(cell_texts: Sequence[str]) -> str:
        return f"<tr>{''.join(td(cell_text) for cell_text in cell_texts)}</tr>"

    if not matrix:
        return ""

    # -- suppress emission of rows with no cells --
    trs = "".join(tr(cell_texts) for cell_texts in matrix if cell_texts)
    return f"<table>{trs}</table>"
|
||||
|
||||
|
||||
class lazyproperty(Generic[_T]):
|
||||
"""Decorator like @property, but evaluated only on first access.
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user