mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-22 15:50:22 +00:00
Adding table extraction to partition_html (#1324)
Adding table extraction to HTML partitioning. This PR utilizes 'table' HTML elements to extract and parse HTML tables and return them in partitioning. ``` # checkout this branch, go into ipython shell In [1]: from unstructured.partition.html import partition_html In [2]: path_to_html = "{html sample file with table}" In [3]: elements = partition_html(path_to_html) ``` you should see the table in the elements list!
This commit is contained in:
parent
59e850bbd9
commit
a501d1d18f
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.10.15-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Updated HTML Partitioning to extract tables
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.10.14
|
||||
|
||||
### Enhancements
|
||||
|
@ -10,6 +10,7 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -77,14 +78,7 @@ def test_parses_tags_correctly():
|
||||
</html>"""
|
||||
doc = HTMLDocument.from_string(raw_html)
|
||||
el = doc.elements[0]
|
||||
assert el.ancestortags + (el.tag,) == (
|
||||
"html",
|
||||
"body",
|
||||
"table",
|
||||
"tbody",
|
||||
"tr",
|
||||
"td",
|
||||
)
|
||||
assert el.ancestortags + (el.tag,) == ("html", "body", "table")
|
||||
|
||||
|
||||
def test_has_table_ancestor():
|
||||
@ -118,8 +112,8 @@ def test_read_without_skipping_table(monkeypatch):
|
||||
</table>
|
||||
</body>
|
||||
</html>"""
|
||||
document = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table_text=False)
|
||||
assert document.pages[0].elements[0] == NarrativeText(text="Hi there! I am Matt!")
|
||||
document = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table=False)
|
||||
assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -356,7 +350,7 @@ def test_read_html_doc(tmpdir, monkeypatch):
|
||||
|
||||
html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
|
||||
skip_headers_and_footers=True,
|
||||
skip_table_text=True,
|
||||
skip_table=True,
|
||||
)
|
||||
print("original pages: ", HTMLDocument.from_file(filename=filename).pages)
|
||||
print("filtered pages: ", html_document.pages)
|
||||
@ -472,7 +466,7 @@ def test_include_headers_and_footers(sample_doc):
|
||||
|
||||
|
||||
def test_include_table_text(sample_doc):
|
||||
html_document = sample_doc.doc_after_cleaners(skip_table_text=False)
|
||||
html_document = sample_doc.doc_after_cleaners(skip_table=False)
|
||||
assert len(html_document.pages[0].elements) == 2
|
||||
|
||||
|
||||
@ -503,8 +497,8 @@ def test_exclude_tag_types(tag):
|
||||
|
||||
|
||||
def test_tag_types_table(sample_doc):
|
||||
html_document = sample_doc.doc_after_cleaners(skip_table_text=True)
|
||||
assert len(html_document.pages[0].elements) == 1
|
||||
html_document = sample_doc.doc_after_cleaners(skip_table=True)
|
||||
assert len(html_document.pages[0].elements) == 2
|
||||
|
||||
|
||||
def test_nested_text_tags():
|
||||
@ -518,7 +512,7 @@ def test_nested_text_tags():
|
||||
</{tag1}>
|
||||
</body>
|
||||
"""
|
||||
html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table_text=False)
|
||||
html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
|
||||
assert len(html_document.pages[0].elements) == 1
|
||||
|
||||
|
||||
@ -664,7 +658,7 @@ def test_filter_in_place():
|
||||
"""
|
||||
doc = HTMLDocument.from_string(html_doc)
|
||||
assert len(doc.elements) == 2
|
||||
doc.doc_after_cleaners(skip_table_text=True, inplace=True)
|
||||
doc.doc_after_cleaners(skip_table=True, inplace=True)
|
||||
assert len(doc.elements) == 1
|
||||
|
||||
|
||||
|
@ -7,7 +7,7 @@ import requests
|
||||
from requests.models import Response
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import ListItem, NarrativeText, Title
|
||||
from unstructured.documents.elements import ListItem, NarrativeText, Table, Title
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.staging.base import elements_to_json
|
||||
@ -267,26 +267,25 @@ def test_partition_html_raises_with_too_many_specified():
|
||||
|
||||
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
|
||||
elements = partition_html(filename=filename)
|
||||
assert len(elements) == 4
|
||||
assert len(elements) == 1
|
||||
assert elements[0] == Table(
|
||||
text="January 2023 ( Someone fed my essays into GPT to make something "
|
||||
"that could answer\nquestions based on them, then asked it where good "
|
||||
"ideas come from. The\nanswer was ok, but not what I would have said. "
|
||||
"This is what I would have said.) The way to get new ideas is to notice "
|
||||
"anomalies: what seems strange,\nor missing, or broken? You can see anomalies"
|
||||
" in everyday life (much\nof standup comedy is based on this), but the best "
|
||||
"place to look for\nthem is at the frontiers of knowledge. Knowledge grows "
|
||||
"fractally.\nFrom a distance its edges look smooth, but when you learn "
|
||||
"enough\nto get close to one, you'll notice it's full of gaps. These "
|
||||
"gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx "
|
||||
"or wondered about y. In the best case, exploring such gaps yields\nwhole "
|
||||
"new fractal buds.",
|
||||
)
|
||||
|
||||
assert elements[0] == Title("January 2023")
|
||||
assert elements[0].metadata.emphasized_text_contents is None
|
||||
assert elements[0].metadata.link_urls is None
|
||||
|
||||
assert elements[1].text.startswith("(Someone fed my essays")
|
||||
assert elements[1].text.endswith("I would have said.)")
|
||||
assert len(elements[1].metadata.emphasized_text_contents) == 1
|
||||
assert len(elements[1].metadata.link_urls) == 1
|
||||
|
||||
assert elements[2].text.startswith("The way to get new ideas")
|
||||
assert elements[2].text.endswith("the frontiers of knowledge.")
|
||||
assert elements[2].metadata.emphasized_text_contents is None
|
||||
assert elements[2].metadata.link_urls is None
|
||||
|
||||
assert elements[3].text.startswith("Knowledge grows fractally")
|
||||
assert elements[3].text.endswith("whole new fractal buds.")
|
||||
assert elements[3].metadata.emphasized_text_contents is None
|
||||
assert elements[3].metadata.link_urls is None
|
||||
assert elements[0].metadata.text_as_html is not None
|
||||
|
||||
|
||||
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):
|
||||
|
@ -1,58 +1,14 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,151 +1,14 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9fe4c68ec20dda7c6b1d3f760e5e6af6",
|
||||
"type": "Table",
|
||||
"element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Driver"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Driver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Approver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Contributors</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Informed </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Objective </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Due date </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Key outcomes</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Status </td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Driver"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "3ebb5648c8bcb2934590555c69356e27",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Approver"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Approver"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "350ad433c42fe8cecdb38439f33947ea",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Contributors"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Contributors"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "31a717c19407f215d8bcd329fc82e646",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Informed"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Informed"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "3b20adc3b2ce1c15ea6880c3151baabe",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Objective"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Objective"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "e1cb6d30fa3f17ee1e50b2bcf1967374",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Due date"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Due date"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "80f5b18f225fca5e493dc48e4e60e8c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Key outcomes"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Key outcomes"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "920e413c7d411b61ef3e8c63b1cb6ad0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Status"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Status"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "a54416fced47600988250cacdb064691",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"NOT STARTED",
|
||||
"IN PROGRESS",
|
||||
"COMPLETE"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"span",
|
||||
"span",
|
||||
"span"
|
||||
]
|
||||
},
|
||||
"text": "NOT STARTED / IN PROGRESS / COMPLETE"
|
||||
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -168,72 +31,15 @@
|
||||
"text": "🎯 Scope"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "0e5c4ed000097332e1e1b29a96fefd56",
|
||||
"type": "Table",
|
||||
"element_id": "f1f364fbde77afa0e99e8ea7ab4f7c3f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Must have:"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Must have: </td></tr><br><tr><td>Nice to have:</td></tr><br><tr><td>Not in scope:</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Must have:"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d29e06627b1fec1ecf65bce63fc5fda5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Nice to have:"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Nice to have:"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "7f999c0456e4e85cc028aa6ed90455d4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Not in scope:"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Not in scope:"
|
||||
"text": "Must have: Nice to have: Not in scope:"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -336,68 +142,15 @@
|
||||
"text": "\\uD83D\\uDEA9 Milestones and deadlines"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9e86248cf2351e388065b80307b7ac00",
|
||||
"type": "Table",
|
||||
"element_id": "3f4ea3840d79521680c89a91dcd883cf",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Milestone"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Milestone"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "4b1b8aa3608a26da451ae0630d75b60a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Owner"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Owner"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "6fcb38ddc858fc8592e4f693d04a2ed1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Deadline"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Deadline"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "920e413c7d411b61ef3e8c63b1cb6ad0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Status"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Status"
|
||||
"text": "Milestone Owner Deadline Status"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
|
@ -70,68 +70,15 @@
|
||||
"text": "\\uD83D\\uDDE3 Discussion topics"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "33b93476cf597a3330653b66a658983d",
|
||||
"type": "Table",
|
||||
"element_id": "37af06e8e75d96a448a00026754b7942",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Time"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Time"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "652bcc3a478428893cc505ae19f847b4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Item"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Item"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9ef077a1231ea3b71df182b87db1cb7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Presenter"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Presenter"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "8a7525b1492fb84833f5c4a69b30f4bf",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Notes"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Notes"
|
||||
"text": "Time Item Presenter Notes"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
|
@ -138,35 +138,14 @@
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "8a7525b1492fb84833f5c4a69b30f4bf",
|
||||
"type": "Table",
|
||||
"element_id": "a240e43c0ae70731c65ae5430d2dab7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Notes"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Notes </td></tr><br><tr><td>Important Links</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Notes"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "98e38cd6c5f88330322de759657563f9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Important Links"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Important Links"
|
||||
"text": "Notes Important Links"
|
||||
}
|
||||
]
|
@ -346,147 +346,14 @@
|
||||
"text": "Testdoc3 Heading 5 Sized Text"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "a980779f0a4dcb2fbf46641f3d55fbf8",
|
||||
"type": "Table",
|
||||
"element_id": "5abf3e1bbc85012fe9e1d25966e00f5e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 1 Row 0"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 1 Row 0"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "0a04f24b652d60a333c4ab7cb407703a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 2 Row 0"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 2 Row 0"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "0301eff44f871fbda777aa0237a0f452",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 3 Row 0"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 3 Row 0"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "23f1bd85c5fad540ef96b0872e74e7a4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 1 Row 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 1 Row 1"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "10c66e15332d59c91094e825685044d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 2 Row 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 2 Row 1"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f27232db61c551577ee4ea73a08e7539",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 3 Row 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 3 Row 1"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "2a83da2e0f9c1bc4950962ffd50c2611",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 1 Row 2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 1 Row 2"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "aad2133b4d02da862062868452a19f2d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 2 Row 2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 2 Row 2"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1d5426ac7bb0a72e5e85f81590b05645",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc3 Table: Column 3 Row 2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc3 Table: Column 3 Row 2"
|
||||
"text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2"
|
||||
}
|
||||
]
|
@ -346,147 +346,14 @@
|
||||
"text": "Testdoc2 Heading 5 Sized Text"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "7aa138ab1f6ef154504c3d8ade2fd1a0",
|
||||
"type": "Table",
|
||||
"element_id": "a164cd72991a3856b7bbc6d52d8b04bf",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 1 Row 0"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc2 Table: Column 1 Row 0</td><td>Testdoc2 Table: Column 2 Row 0</td><td>Testdoc2 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 1</td><td>Testdoc2 Table: Column 2 Row 1</td><td>Testdoc2 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc2 Table: Column 1 Row 2</td><td>Testdoc2 Table: Column 2 Row 2</td><td>Testdoc2 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 1 Row 0"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "b40b0fee79c609772c958caa07bd47a8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 2 Row 0"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 2 Row 0"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "cc59bb6025ceae34c2b9c9d7cdbfbcf9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 3 Row 0"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 3 Row 0"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "3cb373750d4e46b4bbc980dd0d74321e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 1 Row 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 1 Row 1"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "219a8d1fc742fb75b2481a0a75c77a3b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 2 Row 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 2 Row 1"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "07a1ad32c97f3669f88014ee5942f616",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 3 Row 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 3 Row 1"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17228bddb06b739951fab2ab04c09ea8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 1 Row 2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 1 Row 2"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "4ad7ae00fff8c8a3f903864d037cf86e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 2 Row 2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 2 Row 2"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f2701095922247ecafbbd3fe31d585bf",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"Testdoc2 Table: Column 3 Row 2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Testdoc2 Table: Column 3 Row 2"
|
||||
"text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2"
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
@ -1,54 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"type": "Table",
|
||||
"element_id": "e83a347af95db7ba47b5351f411e00c7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td>January 2023</td><td>(</td><td>Someone</td><td>fed my essays into GPT to make something that could answer<br>questions based on them, then asked it where good ideas come from. The<br>answer was ok, but not what I would have said. This is what I would have said.)</td><td>The way to get new ideas is to notice anomalies: what seems strange,<br>or missing, or broken? You can see anomalies in everyday life (much<br>of standup comedy is based on this), but the best place to look for<br>them is at the frontiers of knowledge.</td><td>Knowledge grows fractally.<br>From a distance its edges look smooth, but when you learn enough<br>to get close to one, you'll notice it's full of gaps. These gaps<br>will seem obvious; it will seem inexplicable that no one has tried<br>x or wondered about y. In the best case, exploring such gaps yields<br>whole new fractal buds.</td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
File diff suppressed because one or more lines are too long
@ -96,8 +96,8 @@
|
||||
"text": "text\n \n with other"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8864784f943d9f832a3dce22ef8bcf01",
|
||||
"type": "Table",
|
||||
"element_id": "8298c3f1d0016deb9cbf44832c33480c",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
@ -105,207 +105,9 @@
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"content"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
"text_as_html": "<table><br><tbody><br><tr><td></td><td>column 1 </td><td> </td><td>column 2 </td><td> </td><td>pages </td><td> </td><td></td><td> </td><td></td><td> </td><td></td><td></td></tr><br><tr><td></td><td>c1r1 </td><td>content</td><td> </td><td> </td><td>c2r1 table <br> 2023-08-08T09:00:00.000-04:00<br> cell</td><td> </td><td></td><td>Page with every block</td><td></td><td> </td><td></td><td></td></tr><br><tr><td></td><td>c1r2 more </td><td>content</td><td> </td><td> </td><td>c2r2 table </td><td>cell </td><td></td><td> </td><td></td><td>Untitled</td><td></td><td></td></tr><br><tr><td></td><td>this is some green text</td><td> </td><td>this is </td><td>an</td><td> </td><td>equation</td><td></td><td> </td><td></td><td>Untitled</td><td></td><td></td></tr><br><tr><td></td><td>text1 </td><td>text2 </td><td>Multiline cell</td><td> </td><td>Another cell </td><td> </td><td></td><td>Untitled </td><td></td><td> </td><td></td><td></td></tr><br></tbody><br></table>"
|
||||
},
|
||||
"text": "c1r1 \n \n content"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "6f75c9d2993dbb3981c019741c7962a9",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "5687503bd741f54090d4c0557c0eea1a",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://www.notion.so/c47a45664c7a488bac2a1292ee507fcb"
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Page with every block \n "
|
||||
]
|
||||
},
|
||||
"text": "Page with every block"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "13686520a51e25584bb06ab189b38552",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"/122b2c22996b435b9de2ee0e9d2b04bc"
|
||||
],
|
||||
"link_texts": [
|
||||
"\n content\n "
|
||||
]
|
||||
},
|
||||
"text": "c1r2 more \n \n content"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "cf236cfe4b4c0ef644c37b4e491a4aa8",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"cell"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"span"
|
||||
]
|
||||
},
|
||||
"text": "c2r2 table \n \n cell"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f59ab8d1331b7b16952fbd388258f856",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd"
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Untitled\n "
|
||||
]
|
||||
},
|
||||
"text": "Untitled"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7d96ce60a66271ef79da4c492ca7db8a",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "this is some green text"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "2d77a706008eebaf1f7c4e116bbe08b4",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"an",
|
||||
"equation"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b",
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "this is \n \n an \n \n \n equation"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f59ab8d1331b7b16952fbd388258f856",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://www.notion.so/a1a5dff426f34b8f9a709d51b2a00c73"
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Untitled\n "
|
||||
]
|
||||
},
|
||||
"text": "Untitled"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "7e921a403f1840728e2887990cfe640d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_text_contents": [
|
||||
"text2"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "text1\n\n\n \n text2\n \n \n\nMultiline cell"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "7013d5bb5a17e0e782e8971e23640bdb",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Another cell"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f59ab8d1331b7b16952fbd388258f856",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-04T18:31:00.000Z",
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://www.notion.so/84002066546448d0a030aa79b8d400b0"
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Untitled\n "
|
||||
]
|
||||
},
|
||||
"text": "Untitled"
|
||||
"text": "column 1\n \n \n column 2\n \n \n pages\n \n \n \n c1r1 \n \n content \n \n \n \n c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell\n \n \n \n Page with every block \n \n \n \n \n \n c1r2 more \n \n content\n \n \n \n c2r2 table \n \n cell\n \n \n \n \n Untitled\n \n \n \n \n \n this is some green text\n \n \n this is \n \n an \n \n \n equation\n \n \n \n \n Untitled\n \n \n \n \n \n text1\n\n\n \n text2\n \n \n\nMultiline cell\n \n \n Another cell \n \n \n \n Untitled"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
|
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
__version__ = "0.10.14" # pragma: no cover
|
||||
__version__ = "0.10.15-dev1" # pragma: no cover
|
||||
|
@ -9,6 +9,7 @@ else:
|
||||
from typing import Final
|
||||
|
||||
from lxml import etree
|
||||
from tabulate import tabulate
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
|
||||
from unstructured.documents.base import Page
|
||||
@ -19,6 +20,7 @@ from unstructured.documents.elements import (
|
||||
Link,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -53,6 +55,7 @@ class TagsMixin:
|
||||
ancestortags: Sequence[str] = (),
|
||||
links: Sequence[Link] = [],
|
||||
emphasized_texts: Sequence[dict] = [],
|
||||
text_as_html: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
if tag is None:
|
||||
@ -62,6 +65,7 @@ class TagsMixin:
|
||||
self.ancestortags = ancestortags
|
||||
self.links = links
|
||||
self.emphasized_texts = emphasized_texts
|
||||
self.text_as_html = text_as_html
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
@ -101,6 +105,12 @@ class HTMLListItem(TagsMixin, ListItem):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTable(TagsMixin, Table):
    """Table with tag information"""

    pass
|
||||
|
||||
|
||||
class HTMLDocument(XMLDocument):
|
||||
"""Class for handling HTML documents. Uses rules based parsing to identify sections
|
||||
of interest within the document."""
|
||||
@ -168,6 +178,12 @@ class HTMLDocument(XMLDocument):
|
||||
page.elements.append(element)
|
||||
descendanttag_elems = _get_bullet_descendants(tag_elem, next_element)
|
||||
|
||||
elif _is_table_item(tag_elem):
|
||||
element, next_element = _process_leaf_table_item(tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||
|
||||
elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0:
|
||||
pages.append(page)
|
||||
page_number += 1
|
||||
@ -183,7 +199,7 @@ class HTMLDocument(XMLDocument):
|
||||
def doc_after_cleaners(
|
||||
self,
|
||||
skip_headers_and_footers=False,
|
||||
skip_table_text=False,
|
||||
skip_table=False,
|
||||
inplace=False,
|
||||
) -> HTMLDocument:
|
||||
"""Filters the elements and returns a new instance of the class based on the criteria
|
||||
@ -191,8 +207,8 @@ class HTMLDocument(XMLDocument):
|
||||
page are filtered out.
|
||||
Parameters
|
||||
----------
|
||||
skip_table_text:
|
||||
If True, skips text that is contained within a table element
|
||||
skip_table:
|
||||
If True, skips table element
|
||||
skip_headers_and_footers:
|
||||
If True, ignores any content that is within <header> or <footer> tags
|
||||
inplace:
|
||||
@ -202,8 +218,8 @@ class HTMLDocument(XMLDocument):
|
||||
excluders = []
|
||||
if skip_headers_and_footers:
|
||||
excluders.append(in_header_or_footer)
|
||||
if skip_table_text:
|
||||
excluders.append(has_table_ancestor)
|
||||
if skip_table:
|
||||
excluders.append(is_table)
|
||||
|
||||
pages = []
|
||||
page_number = 0
|
||||
@ -245,7 +261,6 @@ def _get_links_from_tag(tag_elem: etree.Element) -> List[Link]:
|
||||
href = tag_elem.get("href")
|
||||
if href:
|
||||
links.append({"text": tag_elem.text, "url": href})
|
||||
|
||||
for tag in tag_elem.iterdescendants():
|
||||
href = tag.get("href")
|
||||
if href:
|
||||
@ -441,6 +456,36 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _process_leaf_table_item(
    tag_elem: etree.Element,
) -> Tuple[Optional[Element], Optional[etree.Element]]:
    """Builds an HTMLTable element from a table tag that has no nested table child.

    Rows are taken from the `tr` children of the tag; if there are none, from the
    `tr` children of its `tbody`. The element text is the row text joined with
    spaces, and `text_as_html` is the tabulate HTML rendering of the rows with
    newlines replaced by `<br>`.

    Parameters
    ----------
    tag_elem:
        The tag to inspect.

    Returns
    -------
    A tuple of (HTMLTable, tag_elem) when the tag is in TABLE_TAGS and
    `findall("table")` finds no nested table. Otherwise (None, None).
    A table without any rows yields an HTMLTable with empty text.
    """
    if tag_elem.tag not in TABLE_TAGS:
        return None, None

    # Tables that wrap another table are not leaves; they are skipped here.
    if tag_elem.findall("table"):
        return None, None

    rows = tag_elem.findall("tr")
    if not rows:
        body = tag_elem.find("tbody")
        # find() returns None when there is no <tbody> (e.g. an empty table);
        # compare against None explicitly rather than relying on element truthiness.
        rows = body.findall("tr") if body is not None else []

    if rows:
        table_data = [list(row.itertext()) for row in rows]
        html_table = tabulate(table_data, tablefmt="html")
        table_text = " ".join(" ".join(row) for row in table_data).strip()
    else:
        table_text = ""
        html_table = ""

    return (
        HTMLTable(
            text=table_text,
            text_as_html=html_table.replace("\n", "<br>"),
            tag=tag_elem.tag,
            # iterancestors() walks from the parent upwards; reverse to get root-first order.
            ancestortags=tuple(el.tag for el in tag_elem.iterancestors())[::-1],
        ),
        tag_elem,
    )
|
||||
|
||||
|
||||
def _process_list_item(
|
||||
tag_elem: etree.Element,
|
||||
max_predecessor_len: int = 5,
|
||||
@ -496,6 +541,13 @@ def is_list_item_tag(tag_elem: etree.Element) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _is_table_item(tag_elem: etree.Element) -> bool:
    """Returns True when the tag of the given element is one of TABLE_TAGS"""
    return tag_elem.tag in TABLE_TAGS
|
||||
|
||||
|
||||
def _bulleted_text_from_table(table) -> List[Element]:
|
||||
"""Extracts bulletized narrative text from a table.
|
||||
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted.
|
||||
@ -541,6 +593,11 @@ def has_table_ancestor(element: TagsMixin) -> bool:
|
||||
return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags)
|
||||
|
||||
|
||||
def is_table(element: TagsMixin) -> bool:
    """Returns True when the element's own tag is one of TABLE_TAGS"""
    element_tag = element.tag
    return element_tag in TABLE_TAGS
|
||||
|
||||
|
||||
def in_header_or_footer(element: TagsMixin) -> bool:
|
||||
"""Checks to see if an element is contained within a header or a footer tag."""
|
||||
if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags):
|
||||
|
@ -77,7 +77,6 @@ def partition_html(
|
||||
return []
|
||||
# Verify that only one of the arguments was provided
|
||||
exactly_one(filename=filename, file=file, text=text, url=url)
|
||||
|
||||
last_modification_date = None
|
||||
if filename is not None:
|
||||
last_modification_date = get_last_modified_date(filename)
|
||||
@ -118,7 +117,6 @@ def partition_html(
|
||||
|
||||
if skip_headers_and_footers:
|
||||
document = filter_footer_and_header(document)
|
||||
|
||||
return document_to_element_list(
|
||||
document,
|
||||
sortable=False,
|
||||
|
Loading…
x
Reference in New Issue
Block a user