Adding table extraction to partition_html (#1324)

Adding table extraction to HTML partitioning.

This PR uses HTML 'table' elements to extract and parse tables, returning
each one as a Table element in the partitioning output.

```
# checkout this branch, go into ipython shell
In [1]: from unstructured.partition.html import partition_html
In [2]: path_to_html = "{html sample file with table}"
In [3]: elements = partition_html(path_to_html)
```
You should see the table in the elements list!
This commit is contained in:
Amanda Cameron 2023-09-11 11:14:11 -07:00 committed by GitHub
parent 59e850bbd9
commit a501d1d18f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 173 additions and 2833 deletions

View File

@ -1,3 +1,13 @@
## 0.10.15-dev1
### Enhancements
* Updated HTML Partitioning to extract tables
### Features
### Fixes
## 0.10.14 ## 0.10.14
### Enhancements ### Enhancements

View File

@ -10,6 +10,7 @@ from unstructured.documents.elements import (
Address, Address,
ListItem, ListItem,
NarrativeText, NarrativeText,
Table,
Text, Text,
Title, Title,
) )
@ -77,14 +78,7 @@ def test_parses_tags_correctly():
</html>""" </html>"""
doc = HTMLDocument.from_string(raw_html) doc = HTMLDocument.from_string(raw_html)
el = doc.elements[0] el = doc.elements[0]
assert el.ancestortags + (el.tag,) == ( assert el.ancestortags + (el.tag,) == ("html", "body", "table")
"html",
"body",
"table",
"tbody",
"tr",
"td",
)
def test_has_table_ancestor(): def test_has_table_ancestor():
@ -118,8 +112,8 @@ def test_read_without_skipping_table(monkeypatch):
</table> </table>
</body> </body>
</html>""" </html>"""
document = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table_text=False) document = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table=False)
assert document.pages[0].elements[0] == NarrativeText(text="Hi there! I am Matt!") assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -356,7 +350,7 @@ def test_read_html_doc(tmpdir, monkeypatch):
html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners( html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
skip_headers_and_footers=True, skip_headers_and_footers=True,
skip_table_text=True, skip_table=True,
) )
print("original pages: ", HTMLDocument.from_file(filename=filename).pages) print("original pages: ", HTMLDocument.from_file(filename=filename).pages)
print("filtered pages: ", html_document.pages) print("filtered pages: ", html_document.pages)
@ -472,7 +466,7 @@ def test_include_headers_and_footers(sample_doc):
def test_include_table_text(sample_doc): def test_include_table_text(sample_doc):
html_document = sample_doc.doc_after_cleaners(skip_table_text=False) html_document = sample_doc.doc_after_cleaners(skip_table=False)
assert len(html_document.pages[0].elements) == 2 assert len(html_document.pages[0].elements) == 2
@ -503,8 +497,8 @@ def test_exclude_tag_types(tag):
def test_tag_types_table(sample_doc): def test_tag_types_table(sample_doc):
html_document = sample_doc.doc_after_cleaners(skip_table_text=True) html_document = sample_doc.doc_after_cleaners(skip_table=True)
assert len(html_document.pages[0].elements) == 1 assert len(html_document.pages[0].elements) == 2
def test_nested_text_tags(): def test_nested_text_tags():
@ -518,7 +512,7 @@ def test_nested_text_tags():
</{tag1}> </{tag1}>
</body> </body>
""" """
html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table_text=False) html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
assert len(html_document.pages[0].elements) == 1 assert len(html_document.pages[0].elements) == 1
@ -664,7 +658,7 @@ def test_filter_in_place():
""" """
doc = HTMLDocument.from_string(html_doc) doc = HTMLDocument.from_string(html_doc)
assert len(doc.elements) == 2 assert len(doc.elements) == 2
doc.doc_after_cleaners(skip_table_text=True, inplace=True) doc.doc_after_cleaners(skip_table=True, inplace=True)
assert len(doc.elements) == 1 assert len(doc.elements) == 1

View File

@ -7,7 +7,7 @@ import requests
from requests.models import Response from requests.models import Response
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import ListItem, NarrativeText, Title from unstructured.documents.elements import ListItem, NarrativeText, Table, Title
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
from unstructured.partition.json import partition_json from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_to_json from unstructured.staging.base import elements_to_json
@ -267,26 +267,25 @@ def test_partition_html_raises_with_too_many_specified():
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"): def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
elements = partition_html(filename=filename) elements = partition_html(filename=filename)
assert len(elements) == 4 assert len(elements) == 1
assert elements[0] == Table(
text="January 2023 ( Someone fed my essays into GPT to make something "
"that could answer\nquestions based on them, then asked it where good "
"ideas come from. The\nanswer was ok, but not what I would have said. "
"This is what I would have said.) The way to get new ideas is to notice "
"anomalies: what seems strange,\nor missing, or broken? You can see anomalies"
" in everyday life (much\nof standup comedy is based on this), but the best "
"place to look for\nthem is at the frontiers of knowledge. Knowledge grows "
"fractally.\nFrom a distance its edges look smooth, but when you learn "
"enough\nto get close to one, you'll notice it's full of gaps. These "
"gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx "
"or wondered about y. In the best case, exploring such gaps yields\nwhole "
"new fractal buds.",
)
assert elements[0] == Title("January 2023")
assert elements[0].metadata.emphasized_text_contents is None assert elements[0].metadata.emphasized_text_contents is None
assert elements[0].metadata.link_urls is None assert elements[0].metadata.link_urls is None
assert elements[0].metadata.text_as_html is not None
assert elements[1].text.startswith("(Someone fed my essays")
assert elements[1].text.endswith("I would have said.)")
assert len(elements[1].metadata.emphasized_text_contents) == 1
assert len(elements[1].metadata.link_urls) == 1
assert elements[2].text.startswith("The way to get new ideas")
assert elements[2].text.endswith("the frontiers of knowledge.")
assert elements[2].metadata.emphasized_text_contents is None
assert elements[2].metadata.link_urls is None
assert elements[3].text.startswith("Knowledge grows fractally")
assert elements[3].text.endswith("whole new fractal buds.")
assert elements[3].metadata.emphasized_text_contents is None
assert elements[3].metadata.link_urls is None
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch): def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):

View File

@ -1,58 +1,14 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filename": "ideas-page.html", "filename": "ideas-page.html",
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,151 +1,14 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "9fe4c68ec20dda7c6b1d3f760e5e6af6", "element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Driver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Approver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Contributors</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Informed </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Objective </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Due date </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Key outcomes</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Status </td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr><br></tbody><br></table>"
"Driver"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Driver" "text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE"
},
{
"type": "Title",
"element_id": "3ebb5648c8bcb2934590555c69356e27",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Approver"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Approver"
},
{
"type": "Title",
"element_id": "350ad433c42fe8cecdb38439f33947ea",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Contributors"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Contributors"
},
{
"type": "Title",
"element_id": "31a717c19407f215d8bcd329fc82e646",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Informed"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Informed"
},
{
"type": "Title",
"element_id": "3b20adc3b2ce1c15ea6880c3151baabe",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Objective"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Objective"
},
{
"type": "Title",
"element_id": "e1cb6d30fa3f17ee1e50b2bcf1967374",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Due date"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Due date"
},
{
"type": "Title",
"element_id": "80f5b18f225fca5e493dc48e4e60e8c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Key outcomes"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Key outcomes"
},
{
"type": "Title",
"element_id": "920e413c7d411b61ef3e8c63b1cb6ad0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Status"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Status"
},
{
"type": "Title",
"element_id": "a54416fced47600988250cacdb064691",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"NOT STARTED",
"IN PROGRESS",
"COMPLETE"
],
"emphasized_text_tags": [
"span",
"span",
"span"
]
},
"text": "NOT STARTED / IN PROGRESS / COMPLETE"
}, },
{ {
"type": "Title", "type": "Title",
@ -168,72 +31,15 @@
"text": "🎯 Scope" "text": "🎯 Scope"
}, },
{ {
"type": "NarrativeText", "type": "Table",
"element_id": "0e5c4ed000097332e1e1b29a96fefd56", "element_id": "f1f364fbde77afa0e99e8ea7ab4f7c3f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Must have: </td></tr><br><tr><td>Nice to have:</td></tr><br><tr><td>Not in scope:</td></tr><br></tbody><br></table>"
"Must have:"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Must have:" "text": "Must have: Nice to have: Not in scope:"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "NarrativeText",
"element_id": "d29e06627b1fec1ecf65bce63fc5fda5",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Nice to have:"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Nice to have:"
},
{
"type": "Title",
"element_id": "7f999c0456e4e85cc028aa6ed90455d4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Not in scope:"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Not in scope:"
}, },
{ {
"type": "Title", "type": "Title",
@ -336,68 +142,15 @@
"text": "\\uD83D\\uDEA9 Milestones and deadlines" "text": "\\uD83D\\uDEA9 Milestones and deadlines"
}, },
{ {
"type": "Title", "type": "Table",
"element_id": "9e86248cf2351e388065b80307b7ac00", "element_id": "3f4ea3840d79521680c89a91dcd883cf",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
"Milestone"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Milestone" "text": "Milestone Owner Deadline Status"
},
{
"type": "Title",
"element_id": "4b1b8aa3608a26da451ae0630d75b60a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Owner"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Owner"
},
{
"type": "Title",
"element_id": "6fcb38ddc858fc8592e4f693d04a2ed1",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Deadline"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Deadline"
},
{
"type": "Title",
"element_id": "920e413c7d411b61ef3e8c63b1cb6ad0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Status"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Status"
}, },
{ {
"type": "Title", "type": "Title",

View File

@ -70,68 +70,15 @@
"text": "\\uD83D\\uDDE3 Discussion topics" "text": "\\uD83D\\uDDE3 Discussion topics"
}, },
{ {
"type": "Title", "type": "Table",
"element_id": "33b93476cf597a3330653b66a658983d", "element_id": "37af06e8e75d96a448a00026754b7942",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
"Time"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Time" "text": "Time Item Presenter Notes"
},
{
"type": "Title",
"element_id": "652bcc3a478428893cc505ae19f847b4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Item"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Item"
},
{
"type": "Title",
"element_id": "9ef077a1231ea3b71df182b87db1cb7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Presenter"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Presenter"
},
{
"type": "Title",
"element_id": "8a7525b1492fb84833f5c4a69b30f4bf",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Notes"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Notes"
}, },
{ {
"type": "Title", "type": "Title",

View File

@ -138,35 +138,14 @@
"text": "" "text": ""
}, },
{ {
"type": "Title", "type": "Table",
"element_id": "8a7525b1492fb84833f5c4a69b30f4bf", "element_id": "a240e43c0ae70731c65ae5430d2dab7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Notes </td></tr><br><tr><td>Important Links</td></tr><br></tbody><br></table>"
"Notes"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Notes" "text": "Notes Important Links"
},
{
"type": "Title",
"element_id": "98e38cd6c5f88330322de759657563f9",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Important Links"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Important Links"
} }
] ]

View File

@ -346,147 +346,14 @@
"text": "Testdoc3 Heading 5 Sized Text" "text": "Testdoc3 Heading 5 Sized Text"
}, },
{ {
"type": "Title", "type": "Table",
"element_id": "a980779f0a4dcb2fbf46641f3d55fbf8", "element_id": "5abf3e1bbc85012fe9e1d25966e00f5e",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
"Testdoc3 Table: Column 1 Row 0"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Testdoc3 Table: Column 1 Row 0" "text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2"
},
{
"type": "Title",
"element_id": "0a04f24b652d60a333c4ab7cb407703a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 2 Row 0"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 2 Row 0"
},
{
"type": "Title",
"element_id": "0301eff44f871fbda777aa0237a0f452",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 3 Row 0"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 3 Row 0"
},
{
"type": "Title",
"element_id": "23f1bd85c5fad540ef96b0872e74e7a4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 1 Row 1"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 1 Row 1"
},
{
"type": "Title",
"element_id": "10c66e15332d59c91094e825685044d2",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 2 Row 1"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 2 Row 1"
},
{
"type": "Title",
"element_id": "f27232db61c551577ee4ea73a08e7539",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 3 Row 1"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 3 Row 1"
},
{
"type": "Title",
"element_id": "2a83da2e0f9c1bc4950962ffd50c2611",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 1 Row 2"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 1 Row 2"
},
{
"type": "Title",
"element_id": "aad2133b4d02da862062868452a19f2d",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 2 Row 2"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 2 Row 2"
},
{
"type": "Title",
"element_id": "1d5426ac7bb0a72e5e85f81590b05645",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc3 Table: Column 3 Row 2"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc3 Table: Column 3 Row 2"
} }
] ]

View File

@ -346,147 +346,14 @@
"text": "Testdoc2 Heading 5 Sized Text" "text": "Testdoc2 Heading 5 Sized Text"
}, },
{ {
"type": "Title", "type": "Table",
"element_id": "7aa138ab1f6ef154504c3d8ade2fd1a0", "element_id": "a164cd72991a3856b7bbc6d52d8b04bf",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td>Testdoc2 Table: Column 1 Row 0</td><td>Testdoc2 Table: Column 2 Row 0</td><td>Testdoc2 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 1</td><td>Testdoc2 Table: Column 2 Row 1</td><td>Testdoc2 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 2</td><td>Testdoc2 Table: Column 2 Row 2</td><td>Testdoc2 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
"Testdoc2 Table: Column 1 Row 0"
],
"emphasized_text_tags": [
"strong"
]
}, },
"text": "Testdoc2 Table: Column 1 Row 0" "text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2"
},
{
"type": "Title",
"element_id": "b40b0fee79c609772c958caa07bd47a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 2 Row 0"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 2 Row 0"
},
{
"type": "Title",
"element_id": "cc59bb6025ceae34c2b9c9d7cdbfbcf9",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 3 Row 0"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 3 Row 0"
},
{
"type": "Title",
"element_id": "3cb373750d4e46b4bbc980dd0d74321e",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 1 Row 1"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 1 Row 1"
},
{
"type": "Title",
"element_id": "219a8d1fc742fb75b2481a0a75c77a3b",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 2 Row 1"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 2 Row 1"
},
{
"type": "Title",
"element_id": "07a1ad32c97f3669f88014ee5942f616",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 3 Row 1"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 3 Row 1"
},
{
"type": "Title",
"element_id": "17228bddb06b739951fab2ab04c09ea8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 1 Row 2"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 1 Row 2"
},
{
"type": "Title",
"element_id": "4ad7ae00fff8c8a3f903864d037cf86e",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 2 Row 2"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 2 Row 2"
},
{
"type": "Title",
"element_id": "f2701095922247ecafbbd3fe31d585bf",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"Testdoc2 Table: Column 3 Row 2"
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Testdoc2 Table: Column 3 Row 2"
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -1,54 +1,13 @@
[ [
{ {
"type": "Title", "type": "Table",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986", "element_id": "e83a347af95db7ba47b5351f411e00c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"link_urls": [ "text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
}, },
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
} }
] ]

View File

@ -96,8 +96,8 @@
"text": "text\n \n with other" "text": "text\n \n with other"
}, },
{ {
"type": "UncategorizedText", "type": "Table",
"element_id": "8864784f943d9f832a3dce22ef8bcf01", "element_id": "8298c3f1d0016deb9cbf44832c33480c",
"metadata": { "metadata": {
"data_source": { "data_source": {
"date_created": "2023-08-04T18:31:00.000Z", "date_created": "2023-08-04T18:31:00.000Z",
@ -105,207 +105,9 @@
}, },
"filetype": "text/html", "filetype": "text/html",
"page_number": 1, "page_number": 1,
"emphasized_text_contents": [ "text_as_html": "<table><br><tbody><br><tr><td></td><td>column 1 </td><td> </td><td>column 2 </td><td> </td><td>pages </td><td> </td><td></td><td> </td><td></td><td> </td><td></td><td></td></tr><br><tr><td></td><td>c1r1 </td><td>content</td><td> </td><td> </td><td>c2r1 table <br> 2023-08-08T09:00:00.000-04:00<br> cell</td><td> </td><td></td><td>Page with every block</td><td></td><td> </td><td></td><td></td></tr><br><tr><td></td><td>c1r2 more </td><td>content</td><td> </td><td> </td><td>c2r2 table </td><td>cell </td><td></td><td> </td><td></td><td>Untitled</td><td></td><td></td></tr><br><tr><td></td><td>this is some green text</td><td> </td><td>this is </td><td>an</td><td> </td><td>equation</td><td></td><td> </td><td></td><td>Untitled</td><td></td><td></td></tr><br><tr><td></td><td>text1 </td><td>text2 </td><td>Multiline cell</td><td> </td><td>Another cell </td><td> </td><td></td><td>Untitled </td><td></td><td> </td><td></td><td></td></tr><br></tbody><br></table>"
"content"
],
"emphasized_text_tags": [
"b"
]
}, },
"text": "c1r1 \n \n content" "text": "column 1\n \n \n column 2\n \n \n pages\n \n \n \n c1r1 \n \n content \n \n \n \n c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell\n \n \n \n Page with every block \n \n \n \n \n \n c1r2 more \n \n content\n \n \n \n c2r2 table \n \n cell\n \n \n \n \n Untitled\n \n \n \n \n \n this is some green text\n \n \n this is \n \n an \n \n \n equation\n \n \n \n \n Untitled\n \n \n \n \n \n text1\n\n\n \n text2\n \n \n\nMultiline cell\n \n \n Another cell \n \n \n \n Untitled"
},
{
"type": "UncategorizedText",
"element_id": "6f75c9d2993dbb3981c019741c7962a9",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1
},
"text": "c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell"
},
{
"type": "Title",
"element_id": "5687503bd741f54090d4c0557c0eea1a",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"https://www.notion.so/c47a45664c7a488bac2a1292ee507fcb"
],
"link_texts": [
"\n Page with every block \n "
]
},
"text": "Page with every block"
},
{
"type": "UncategorizedText",
"element_id": "13686520a51e25584bb06ab189b38552",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"/122b2c22996b435b9de2ee0e9d2b04bc"
],
"link_texts": [
"\n content\n "
]
},
"text": "c1r2 more \n \n content"
},
{
"type": "UncategorizedText",
"element_id": "cf236cfe4b4c0ef644c37b4e491a4aa8",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"cell"
],
"emphasized_text_tags": [
"span"
]
},
"text": "c2r2 table \n \n cell"
},
{
"type": "Title",
"element_id": "f59ab8d1331b7b16952fbd388258f856",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd"
],
"link_texts": [
"\n Untitled\n "
]
},
"text": "Untitled"
},
{
"type": "NarrativeText",
"element_id": "7d96ce60a66271ef79da4c492ca7db8a",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1
},
"text": "this is some green text"
},
{
"type": "NarrativeText",
"element_id": "2d77a706008eebaf1f7c4e116bbe08b4",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"an",
"equation"
],
"emphasized_text_tags": [
"b",
"b"
]
},
"text": "this is \n \n an \n \n \n equation"
},
{
"type": "Title",
"element_id": "f59ab8d1331b7b16952fbd388258f856",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"https://www.notion.so/a1a5dff426f34b8f9a709d51b2a00c73"
],
"link_texts": [
"\n Untitled\n "
]
},
"text": "Untitled"
},
{
"type": "UncategorizedText",
"element_id": "7e921a403f1840728e2887990cfe640d",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"emphasized_text_contents": [
"text2"
],
"emphasized_text_tags": [
"i"
]
},
"text": "text1\n\n\n \n text2\n \n \n\nMultiline cell"
},
{
"type": "Title",
"element_id": "7013d5bb5a17e0e782e8971e23640bdb",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1
},
"text": "Another cell"
},
{
"type": "Title",
"element_id": "f59ab8d1331b7b16952fbd388258f856",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"https://www.notion.so/84002066546448d0a030aa79b8d400b0"
],
"link_texts": [
"\n Untitled\n "
]
},
"text": "Untitled"
}, },
{ {
"type": "UncategorizedText", "type": "UncategorizedText",

View File

@ -1 +1 @@
__version__ = "0.10.14" # pragma: no cover __version__ = "0.10.15-dev1" # pragma: no cover

View File

@ -9,6 +9,7 @@ else:
from typing import Final from typing import Final
from lxml import etree from lxml import etree
from tabulate import tabulate
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.documents.base import Page from unstructured.documents.base import Page
@ -19,6 +20,7 @@ from unstructured.documents.elements import (
Link, Link,
ListItem, ListItem,
NarrativeText, NarrativeText,
Table,
Text, Text,
Title, Title,
) )
@ -53,6 +55,7 @@ class TagsMixin:
ancestortags: Sequence[str] = (), ancestortags: Sequence[str] = (),
links: Sequence[Link] = [], links: Sequence[Link] = [],
emphasized_texts: Sequence[dict] = [], emphasized_texts: Sequence[dict] = [],
text_as_html: Optional[str] = None,
**kwargs, **kwargs,
): ):
if tag is None: if tag is None:
@ -62,6 +65,7 @@ class TagsMixin:
self.ancestortags = ancestortags self.ancestortags = ancestortags
self.links = links self.links = links
self.emphasized_texts = emphasized_texts self.emphasized_texts = emphasized_texts
self.text_as_html = text_as_html
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -101,6 +105,12 @@ class HTMLListItem(TagsMixin, ListItem):
pass pass
class HTMLTable(TagsMixin, Table):
    """Table with tag information"""

    pass
class HTMLDocument(XMLDocument): class HTMLDocument(XMLDocument):
"""Class for handling HTML documents. Uses rules based parsing to identify sections """Class for handling HTML documents. Uses rules based parsing to identify sections
of interest within the document.""" of interest within the document."""
@ -168,6 +178,12 @@ class HTMLDocument(XMLDocument):
page.elements.append(element) page.elements.append(element)
descendanttag_elems = _get_bullet_descendants(tag_elem, next_element) descendanttag_elems = _get_bullet_descendants(tag_elem, next_element)
elif _is_table_item(tag_elem):
element, next_element = _process_leaf_table_item(tag_elem)
if element is not None:
page.elements.append(element)
descendanttag_elems = tuple(tag_elem.iterdescendants())
elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0: elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0:
pages.append(page) pages.append(page)
page_number += 1 page_number += 1
@ -183,7 +199,7 @@ class HTMLDocument(XMLDocument):
def doc_after_cleaners( def doc_after_cleaners(
self, self,
skip_headers_and_footers=False, skip_headers_and_footers=False,
skip_table_text=False, skip_table=False,
inplace=False, inplace=False,
) -> HTMLDocument: ) -> HTMLDocument:
"""Filters the elements and returns a new instance of the class based on the criteria """Filters the elements and returns a new instance of the class based on the criteria
@ -191,8 +207,8 @@ class HTMLDocument(XMLDocument):
page are filtered out. page are filtered out.
Parameters Parameters
---------- ----------
skip_table_text: skip_table:
If True, skips text that is contained within a table element If True, skips table element
skip_headers_and_footers: skip_headers_and_footers:
If True, ignores any content that is within <header> or <footer> tags If True, ignores any content that is within <header> or <footer> tags
inplace: inplace:
@ -202,8 +218,8 @@ class HTMLDocument(XMLDocument):
excluders = [] excluders = []
if skip_headers_and_footers: if skip_headers_and_footers:
excluders.append(in_header_or_footer) excluders.append(in_header_or_footer)
if skip_table_text: if skip_table:
excluders.append(has_table_ancestor) excluders.append(is_table)
pages = [] pages = []
page_number = 0 page_number = 0
@ -245,7 +261,6 @@ def _get_links_from_tag(tag_elem: etree.Element) -> List[Link]:
href = tag_elem.get("href") href = tag_elem.get("href")
if href: if href:
links.append({"text": tag_elem.text, "url": href}) links.append({"text": tag_elem.text, "url": href})
for tag in tag_elem.iterdescendants(): for tag in tag_elem.iterdescendants():
href = tag.get("href") href = tag.get("href")
if href: if href:
@ -441,6 +456,36 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
return False return False
def _process_leaf_table_item(
    tag_elem: etree.Element,
) -> Tuple[Optional[Element], Optional[etree.Element]]:
    """Builds an HTMLTable element from a table tag that has no direct child table.

    Rows are looked up as direct ``tr`` children of the tag first and, when there
    are none, as ``tr`` children of its first ``tbody`` child. The text nodes of
    each row are joined with spaces to form the element text, and the same data is
    rendered with ``tabulate`` (``tablefmt="html"``) for ``text_as_html``.

    Parameters
    ----------
    tag_elem:
        The tag to inspect.

    Returns
    -------
    ``(HTMLTable, tag_elem)`` when the tag is in TABLE_TAGS and has no direct
    ``table`` child; ``(None, None)`` otherwise. A table without any rows yields an
    HTMLTable whose text and text_as_html are both empty strings.
    """
    if tag_elem.tag not in TABLE_TAGS:
        return None, None

    # NOTE: findall("table") only matches *direct* children of the tag.
    if tag_elem.findall("table"):
        return None, None

    rows = tag_elem.findall("tr")
    if not rows:
        # Rows may be wrapped in a <tbody>. The tag can also have neither rows nor
        # a <tbody> (e.g. an empty <table>), in which case find() returns None and
        # there is nothing to extract.
        body = tag_elem.find("tbody")
        rows = body.findall("tr") if body is not None else []

    if rows:
        table_data = [list(row.itertext()) for row in rows]
        html_table = tabulate(table_data, tablefmt="html")
        table_text = " ".join(" ".join(row) for row in table_data).strip()
    else:
        table_text = ""
        html_table = ""

    return (
        HTMLTable(
            text=table_text,
            text_as_html=html_table.replace("\n", "<br>"),
            tag=tag_elem.tag,
            # iterancestors() walks from the parent upwards; reverse it so the
            # tuple runs from the outermost ancestor down to the direct parent.
            ancestortags=tuple(el.tag for el in tag_elem.iterancestors())[::-1],
        ),
        tag_elem,
    )
def _process_list_item( def _process_list_item(
tag_elem: etree.Element, tag_elem: etree.Element,
max_predecessor_len: int = 5, max_predecessor_len: int = 5,
@ -496,6 +541,13 @@ def is_list_item_tag(tag_elem: etree.Element) -> bool:
return False return False
def _is_table_item(tag_elem: etree.Element) -> bool:
    """Returns True when the tag name of `tag_elem` is one of TABLE_TAGS."""
    return tag_elem.tag in TABLE_TAGS
def _bulleted_text_from_table(table) -> List[Element]: def _bulleted_text_from_table(table) -> List[Element]:
"""Extracts bulletized narrative text from a table. """Extracts bulletized narrative text from a table.
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted.
@ -541,6 +593,11 @@ def has_table_ancestor(element: TagsMixin) -> bool:
return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags) return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags)
def is_table(element: TagsMixin) -> bool:
    """Returns True when the element's own tag is one of TABLE_TAGS."""
    element_tag = element.tag
    return element_tag in TABLE_TAGS
def in_header_or_footer(element: TagsMixin) -> bool: def in_header_or_footer(element: TagsMixin) -> bool:
"""Checks to see if an element is contained within a header or a footer tag.""" """Checks to see if an element is contained within a header or a footer tag."""
if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags): if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags):

View File

@ -77,7 +77,6 @@ def partition_html(
return [] return []
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text, url=url) exactly_one(filename=filename, file=file, text=text, url=url)
last_modification_date = None last_modification_date = None
if filename is not None: if filename is not None:
last_modification_date = get_last_modified_date(filename) last_modification_date = get_last_modified_date(filename)
@ -118,7 +117,6 @@ def partition_html(
if skip_headers_and_footers: if skip_headers_and_footers:
document = filter_footer_and_header(document) document = filter_footer_and_header(document)
return document_to_element_list( return document_to_element_list(
document, document,
sortable=False, sortable=False,