From a501d1d18f67875b36d7e3cdca55455f2ee10a0e Mon Sep 17 00:00:00 2001 From: Amanda Cameron Date: Mon, 11 Sep 2023 11:14:11 -0700 Subject: [PATCH] Adding table extraction to partition_html (#1324) Adding table extraction to HTML partitioning. This PR utilizes 'table' HTML elements to extract and parse HTML tables and return them in partitioning. ``` # checkout this branch, go into ipython shell In [1]: from unstructured.partition.html import partition_html In [2]: path_to_html = "{html sample file with table}" In [3]: elements = partition_html(path_to_html) ``` you should see the table in the elements list! --- CHANGELOG.md | 10 + test_unstructured/documents/test_html.py | 26 +- .../partition/test_html_partition.py | 35 +- .../Shared Documents/ideas-page.json | 52 +- .../box/nested-1/ideas-page.html.json | 49 +- .../box/nested-2/ideas-page.html.json | 49 +- .../confluence-diff/MFS/1540126.json | 271 +---- .../confluence-diff/MFS/1605928.json | 61 +- .../confluence-diff/MFS/1605942.json | 29 +- .../confluence-diff/testteamsp/1605989.json | 141 +-- .../confluence-diff/testteamsp/1802252.json | 141 +-- .../dropbox/nested-1/ideas-page.html.json | 49 +- .../dropbox/nested-2/ideas-page.html.json | 49 +- .../gcs/ideas-page.html.json | 49 +- .../gcs/nested-1/nested/ideas-page.html.json | 49 +- .../gcs/nested-2/nested/ideas-page.html.json | 49 +- .../122b2c22-996b-435b-9de2-ee0e9d2b04bc.json | 973 +----------------- .../c47a4566-4c7a-488b-ac2a-1292ee507fcb.json | 206 +--- .../d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json | 645 +----------- unstructured/__version__.py | 2 +- unstructured/documents/html.py | 69 +- unstructured/partition/html.py | 2 - 22 files changed, 173 insertions(+), 2833 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f948ace13..a3416b037 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.10.15-dev1 + +### Enhancements + +* Updated HTML Partitioning to extract tables + +### Features + +### Fixes + ## 0.10.14 ### Enhancements diff 
--git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index 4e2586b6d..d6d236f08 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -10,6 +10,7 @@ from unstructured.documents.elements import ( Address, ListItem, NarrativeText, + Table, Text, Title, ) @@ -77,14 +78,7 @@ def test_parses_tags_correctly(): """ doc = HTMLDocument.from_string(raw_html) el = doc.elements[0] - assert el.ancestortags + (el.tag,) == ( - "html", - "body", - "table", - "tbody", - "tr", - "td", - ) + assert el.ancestortags + (el.tag,) == ("html", "body", "table") def test_has_table_ancestor(): @@ -118,8 +112,8 @@ def test_read_without_skipping_table(monkeypatch): """ - document = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table_text=False) - assert document.pages[0].elements[0] == NarrativeText(text="Hi there! I am Matt!") + document = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table=False) + assert document.pages[0].elements[0] == Table(text="Hi there! 
I am Matt!") @pytest.mark.parametrize( @@ -356,7 +350,7 @@ def test_read_html_doc(tmpdir, monkeypatch): html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners( skip_headers_and_footers=True, - skip_table_text=True, + skip_table=True, ) print("original pages: ", HTMLDocument.from_file(filename=filename).pages) print("filtered pages: ", html_document.pages) @@ -472,7 +466,7 @@ def test_include_headers_and_footers(sample_doc): def test_include_table_text(sample_doc): - html_document = sample_doc.doc_after_cleaners(skip_table_text=False) + html_document = sample_doc.doc_after_cleaners(skip_table=False) assert len(html_document.pages[0].elements) == 2 @@ -503,8 +497,8 @@ def test_exclude_tag_types(tag): def test_tag_types_table(sample_doc): - html_document = sample_doc.doc_after_cleaners(skip_table_text=True) - assert len(html_document.pages[0].elements) == 1 + html_document = sample_doc.doc_after_cleaners(skip_table=True) + assert len(html_document.pages[0].elements) == 2 def test_nested_text_tags(): @@ -518,7 +512,7 @@ def test_nested_text_tags(): """ - html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table_text=False) + html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False) assert len(html_document.pages[0].elements) == 1 @@ -664,7 +658,7 @@ def test_filter_in_place(): """ doc = HTMLDocument.from_string(html_doc) assert len(doc.elements) == 2 - doc.doc_after_cleaners(skip_table_text=True, inplace=True) + doc.doc_after_cleaners(skip_table=True, inplace=True) assert len(doc.elements) == 1 diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 27172d228..7f052985a 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -7,7 +7,7 @@ import requests from requests.models import Response from unstructured.cleaners.core import clean_extra_whitespace -from 
unstructured.documents.elements import ListItem, NarrativeText, Title +from unstructured.documents.elements import ListItem, NarrativeText, Table, Title from unstructured.partition.html import partition_html from unstructured.partition.json import partition_json from unstructured.staging.base import elements_to_json @@ -267,26 +267,25 @@ def test_partition_html_raises_with_too_many_specified(): def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"): elements = partition_html(filename=filename) - assert len(elements) == 4 + assert len(elements) == 1 + assert elements[0] == Table( + text="January 2023 ( Someone fed my essays into GPT to make something " + "that could answer\nquestions based on them, then asked it where good " + "ideas come from. The\nanswer was ok, but not what I would have said. " + "This is what I would have said.) The way to get new ideas is to notice " + "anomalies: what seems strange,\nor missing, or broken? You can see anomalies" + " in everyday life (much\nof standup comedy is based on this), but the best " + "place to look for\nthem is at the frontiers of knowledge. Knowledge grows " + "fractally.\nFrom a distance its edges look smooth, but when you learn " + "enough\nto get close to one, you'll notice it's full of gaps. These " + "gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx " + "or wondered about y. 
In the best case, exploring such gaps yields\nwhole " + "new fractal buds.", + ) - assert elements[0] == Title("January 2023") assert elements[0].metadata.emphasized_text_contents is None assert elements[0].metadata.link_urls is None - - assert elements[1].text.startswith("(Someone fed my essays") - assert elements[1].text.endswith("I would have said.)") - assert len(elements[1].metadata.emphasized_text_contents) == 1 - assert len(elements[1].metadata.link_urls) == 1 - - assert elements[2].text.startswith("The way to get new ideas") - assert elements[2].text.endswith("the frontiers of knowledge.") - assert elements[2].metadata.emphasized_text_contents is None - assert elements[2].metadata.link_urls is None - - assert elements[3].text.startswith("Knowledge grows fractally") - assert elements[3].text.endswith("whole new fractal buds.") - assert elements[3].metadata.emphasized_text_contents is None - assert elements[3].metadata.link_urls is None + assert elements[0].metadata.text_as_html is not None def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch): diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json index ca891379e..f1e5b6fa8 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json @@ -1,58 +1,14 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filename": "ideas-page.html", - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filename": 
"ideas-page.html", "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filename": "ideas-page.html", - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filename": "ideas-page.html", - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds." } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json index 42f3c097b..1de1caf68 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json @@ -1,151 +1,14 @@ [ { - "type": "Title", - "element_id": "9fe4c68ec20dda7c6b1d3f760e5e6af6", + "type": "Table", + "element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Driver" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "










Driver
Approver
Contributors
Informed
Objective
Due date
Key outcomes
Status NOT STARTED/IN PROGRESS/COMPLETE
" }, - "text": "Driver" - }, - { - "type": "Title", - "element_id": "3ebb5648c8bcb2934590555c69356e27", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Approver" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Approver" - }, - { - "type": "Title", - "element_id": "350ad433c42fe8cecdb38439f33947ea", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Contributors" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Contributors" - }, - { - "type": "Title", - "element_id": "31a717c19407f215d8bcd329fc82e646", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Informed" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Informed" - }, - { - "type": "Title", - "element_id": "3b20adc3b2ce1c15ea6880c3151baabe", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Objective" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Objective" - }, - { - "type": "Title", - "element_id": "e1cb6d30fa3f17ee1e50b2bcf1967374", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Due date" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Due date" - }, - { - "type": "Title", - "element_id": "80f5b18f225fca5e493dc48e4e60e8c7", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Key outcomes" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Key outcomes" - }, - { - "type": "Title", - "element_id": "920e413c7d411b61ef3e8c63b1cb6ad0", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Status" - ], - "emphasized_text_tags": [ - "strong" - ] - }, 
- "text": "Status" - }, - { - "type": "Title", - "element_id": "a54416fced47600988250cacdb064691", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "NOT STARTED", - "IN PROGRESS", - "COMPLETE" - ], - "emphasized_text_tags": [ - "span", - "span", - "span" - ] - }, - "text": "NOT STARTED / IN PROGRESS / COMPLETE" + "text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE" }, { "type": "Title", @@ -168,72 +31,15 @@ "text": "🎯 Scope" }, { - "type": "NarrativeText", - "element_id": "0e5c4ed000097332e1e1b29a96fefd56", + "type": "Table", + "element_id": "f1f364fbde77afa0e99e8ea7ab4f7c3f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Must have:" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "





Must have:
Nice to have:
Not in scope:
" }, - "text": "Must have:" - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "" - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "" - }, - { - "type": "NarrativeText", - "element_id": "d29e06627b1fec1ecf65bce63fc5fda5", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Nice to have:" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Nice to have:" - }, - { - "type": "Title", - "element_id": "7f999c0456e4e85cc028aa6ed90455d4", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Not in scope:" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Not in scope:" + "text": "Must have: Nice to have: Not in scope:" }, { "type": "Title", @@ -336,68 +142,15 @@ "text": "\\uD83D\\uDEA9 Milestones and deadlines" }, { - "type": "Title", - "element_id": "9e86248cf2351e388065b80307b7ac00", + "type": "Table", + "element_id": "3f4ea3840d79521680c89a91dcd883cf", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Milestone" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "






MilestoneOwnerDeadlineStatus
" }, - "text": "Milestone" - }, - { - "type": "Title", - "element_id": "4b1b8aa3608a26da451ae0630d75b60a", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Owner" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Owner" - }, - { - "type": "Title", - "element_id": "6fcb38ddc858fc8592e4f693d04a2ed1", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Deadline" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Deadline" - }, - { - "type": "Title", - "element_id": "920e413c7d411b61ef3e8c63b1cb6ad0", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Status" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Status" + "text": "Milestone Owner Deadline Status" }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json index 4008c5343..5c25fe4a2 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json @@ -70,68 +70,15 @@ "text": "\\uD83D\\uDDE3 Discussion topics" }, { - "type": "Title", - "element_id": "33b93476cf597a3330653b66a658983d", + "type": "Table", + "element_id": "37af06e8e75d96a448a00026754b7942", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Time" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "





TimeItemPresenterNotes
" }, - "text": "Time" - }, - { - "type": "Title", - "element_id": "652bcc3a478428893cc505ae19f847b4", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Item" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Item" - }, - { - "type": "Title", - "element_id": "9ef077a1231ea3b71df182b87db1cb7f", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Presenter" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Presenter" - }, - { - "type": "Title", - "element_id": "8a7525b1492fb84833f5c4a69b30f4bf", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Notes" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Notes" + "text": "Time Item Presenter Notes" }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json index 6c0f4d9a7..5a8e7b548 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605942.json @@ -138,35 +138,14 @@ "text": "" }, { - "type": "Title", - "element_id": "8a7525b1492fb84833f5c4a69b30f4bf", + "type": "Table", + "element_id": "a240e43c0ae70731c65ae5430d2dab7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Notes" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "




Notes
Important Links
" }, - "text": "Notes" - }, - { - "type": "Title", - "element_id": "98e38cd6c5f88330322de759657563f9", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Important Links" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Important Links" + "text": "Notes Important Links" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json index c502897a3..ce30934f6 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json @@ -346,147 +346,14 @@ "text": "Testdoc3 Heading 5 Sized Text" }, { - "type": "Title", - "element_id": "a980779f0a4dcb2fbf46641f3d55fbf8", + "type": "Table", + "element_id": "5abf3e1bbc85012fe9e1d25966e00f5e", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 1 Row 0" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "





Testdoc3 Table: Column 1 Row 0Testdoc3 Table: Column 2 Row 0Testdoc3 Table: Column 3 Row 0
Testdoc3 Table: Column 1 Row 1Testdoc3 Table: Column 2 Row 1Testdoc3 Table: Column 3 Row 1
Testdoc3 Table: Column 1 Row 2Testdoc3 Table: Column 2 Row 2Testdoc3 Table: Column 3 Row 2
" }, - "text": "Testdoc3 Table: Column 1 Row 0" - }, - { - "type": "Title", - "element_id": "0a04f24b652d60a333c4ab7cb407703a", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 2 Row 0" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 2 Row 0" - }, - { - "type": "Title", - "element_id": "0301eff44f871fbda777aa0237a0f452", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 3 Row 0" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 3 Row 0" - }, - { - "type": "Title", - "element_id": "23f1bd85c5fad540ef96b0872e74e7a4", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 1 Row 1" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 1 Row 1" - }, - { - "type": "Title", - "element_id": "10c66e15332d59c91094e825685044d2", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 2 Row 1" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 2 Row 1" - }, - { - "type": "Title", - "element_id": "f27232db61c551577ee4ea73a08e7539", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 3 Row 1" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 3 Row 1" - }, - { - "type": "Title", - "element_id": "2a83da2e0f9c1bc4950962ffd50c2611", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 1 Row 2" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: 
Column 1 Row 2" - }, - { - "type": "Title", - "element_id": "aad2133b4d02da862062868452a19f2d", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 2 Row 2" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 2 Row 2" - }, - { - "type": "Title", - "element_id": "1d5426ac7bb0a72e5e85f81590b05645", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc3 Table: Column 3 Row 2" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc3 Table: Column 3 Row 2" + "text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json index a0f2cf3ae..e9dd26b1d 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json @@ -346,147 +346,14 @@ "text": "Testdoc2 Heading 5 Sized Text" }, { - "type": "Title", - "element_id": "7aa138ab1f6ef154504c3d8ade2fd1a0", + "type": "Table", + "element_id": "a164cd72991a3856b7bbc6d52d8b04bf", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 1 Row 0" - ], - "emphasized_text_tags": [ - "strong" - ] + "text_as_html": "





Testdoc2 Table: Column 1 Row 0Testdoc2 Table: Column 2 Row 0Testdoc2 Table: Column 3 Row 0
Testdoc2 Table: Column 1 Row 1Testdoc2 Table: Column 2 Row 1Testdoc2 Table: Column 3 Row 1
Testdoc2 Table: Column 1 Row 2Testdoc2 Table: Column 2 Row 2Testdoc2 Table: Column 3 Row 2
" }, - "text": "Testdoc2 Table: Column 1 Row 0" - }, - { - "type": "Title", - "element_id": "b40b0fee79c609772c958caa07bd47a8", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 2 Row 0" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 2 Row 0" - }, - { - "type": "Title", - "element_id": "cc59bb6025ceae34c2b9c9d7cdbfbcf9", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 3 Row 0" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 3 Row 0" - }, - { - "type": "Title", - "element_id": "3cb373750d4e46b4bbc980dd0d74321e", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 1 Row 1" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 1 Row 1" - }, - { - "type": "Title", - "element_id": "219a8d1fc742fb75b2481a0a75c77a3b", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 2 Row 1" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 2 Row 1" - }, - { - "type": "Title", - "element_id": "07a1ad32c97f3669f88014ee5942f616", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 3 Row 1" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 3 Row 1" - }, - { - "type": "Title", - "element_id": "17228bddb06b739951fab2ab04c09ea8", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 1 Row 2" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: 
Column 1 Row 2" - }, - { - "type": "Title", - "element_id": "4ad7ae00fff8c8a3f903864d037cf86e", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 2 Row 2" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 2 Row 2" - }, - { - "type": "Title", - "element_id": "f2701095922247ecafbbd3fe31d585bf", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Testdoc2 Table: Column 3 Row 2" - ], - "emphasized_text_tags": [ - "strong" - ] - }, - "text": "Testdoc2 Table: Column 3 Row 2" + "text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - 
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json index 899445dac..3db09d27c 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json @@ -1,54 +1,13 @@ [ { - "type": "Title", - "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "January 2023" - }, - { - "type": "NarrativeText", - "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", + "type": "Table", + "element_id": "e83a347af95db7ba47b5351f411e00c7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://twitter.com/stef/status/1617222428727586816" - ], - "link_texts": [ - null - ], - "emphasized_text_contents": [ - "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - ], - "emphasized_text_tags": [ - "i" - ] + "text_as_html": "



January 2023(Someonefed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.
Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
" }, - "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" - }, - { - "type": "NarrativeText", - "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." - }, - { - "type": "NarrativeText", - "element_id": "4eafbff98b81999dfbf3572440d22393", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json b/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json index 6304ffa3f..5a8967cc8 100644 --- a/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json +++ b/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json @@ -1,33 +1,7 @@ [ { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", + "type": "Table", + "element_id": "f06d7fd05f2335c203720ceca5301eba", "metadata": { "data_source": { "date_created": "2023-08-04T18:31:00.000Z", @@ -35,947 +9,8 @@ }, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] + "text_as_html": "













Created time Last edited timeOwner PageTagsVerification
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeNew Page unverified
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeMorale Events Policies unverified
2023-08-04T18:31:00.000Z 2023-08-04T19:02:00.000Z Roman IseckeNew Page With Verification expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeVacation Policy Policies unverified
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeMission, Vision, Values Vision Company Updates unverified
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeRecent Press Company Updates unverified
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeGetting Started unverified
2023-08-04T18:31:00.000Z 2023-08-17T18:48:00.000Z Roman IseckePage with every block Company UpdatesPolicies verified Roman Isecke2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeCorporate Travel Policies unverified
2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman IseckeBenefits Policies Policies unverified
" }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "7b544ee99a84930c8049d5c91f8e7541", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "New Page" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "a3bc48c9c0c00bd86bfcefcb833d3fd4", - "metadata": { - "data_source": { - "date_created": 
"2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Morale Events" - }, - { - "type": "Title", - "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Policies" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Policies" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "9eca9d6f69bb98c4ec616c4aec38d0d2", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T19:02:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman 
Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "c502fd59c2cdff4881f98c3ce019dc77", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "New Page With Verification" - }, - { - "type": "UncategorizedText", - "element_id": "20079ac60749535ee21512b3091a61e0", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "expired" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "expired\n \n \n Roman Isecke\n \n \n 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - 
"https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "c911244e369f9ee203656a820c260e4d", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Vacation Policy" - }, - { - "type": "Title", - "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Policies" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Policies" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": 
"548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "94efbf7307081f8f45b11a183ad99254", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Mission, Vision, Values" - }, - { - "type": "UncategorizedText", - "element_id": "575d595cf4830f838cc79edf3a4bd5fc", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Vision", - "Company Updates" - ], - "emphasized_text_tags": [ - "span", - "span" - ] - }, - "text": "Vision\n \n \n Company Updates" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": 
"d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "b2d356b3e28717647c73b8767da6c485", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Recent Press" - }, - { - "type": "Title", - "element_id": "67538900b235164b3f1debd8a8d80b44", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Company Updates" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Company Updates" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - 
"date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "d00eca1bae6742803906ab42a831e8b5", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Getting Started" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": 
"2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "a9e87d3147c54fd5fa061709e15ed0bf", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-17T18:48:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "5687503bd741f54090d4c0557c0eea1a", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Page with every block" - }, - { - "type": "UncategorizedText", - "element_id": "d7501f757bf490f053005b707829e343", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Company Updates", - "Policies" - ], - "emphasized_text_tags": [ - "span", - "span" - ] - }, - "text": "Company Updates\n \n \n Policies" - }, - { - "type": "UncategorizedText", - "element_id": "aad0d1a0dbac83ea1906db66ecbff086", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" 
- ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "verified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "verified\n \n \n Roman Isecke\n \n \n 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "b2c1cf36a9b45cdefac07d1899b96ff1", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Corporate Travel" - }, - { - "type": "Title", - "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - 
"Policies" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Policies" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "d8d2a2140ba63413c452dbefe499f90b", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-04T18:31:00.000Z" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ], - "emphasized_text_contents": [ - "Roman Isecke" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Roman Isecke" - }, - { - "type": "Title", - "element_id": "8bcdb5d9bc2bda33af04bae4495f5e37", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Benefits Policies" - }, - { - "type": "Title", - "element_id": 
"d3ad1f1f8c9c4f5a4a593571085513a4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Policies" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Policies" - }, - { - "type": "Title", - "element_id": "97b7e2db799e2b79e65f418b42a7d305", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "unverified" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "unverified" + "text": "Created time\n \n \n Last edited time\n \n \n Owner\n \n \n Page\n \n \n Tags\n \n \n Verification\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n New Page\n \n \n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Morale Events\n \n \n \n \n \n Policies\n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T19:02:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n New Page With Verification\n \n \n \n \n \n \n \n \n \n expired\n \n \n Roman Isecke\n \n \n 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Vacation Policy\n \n \n \n \n \n Policies\n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Mission, Vision, Values\n \n \n \n \n \n Vision\n \n \n Company Updates\n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 
2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Recent Press\n \n \n \n \n \n Company Updates\n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Getting Started\n \n \n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-17T18:48:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Page with every block \n \n \n \n \n \n Company Updates\n \n \n Policies\n \n \n \n \n \n \n \n verified\n \n \n Roman Isecke\n \n \n 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Corporate Travel\n \n \n \n \n \n Policies\n \n \n \n \n \n \n \n unverified\n \n \n \n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n 2023-08-04T18:31:00.000Z\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n \n \n Benefits Policies\n \n \n \n \n \n Policies\n \n \n \n \n \n \n \n unverified" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json index ea8a504a4..39c370c3b 100644 --- a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json +++ b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json @@ -96,8 +96,8 @@ "text": "text\n \n with other" }, { - "type": "UncategorizedText", - "element_id": "8864784f943d9f832a3dce22ef8bcf01", + "type": "Table", + "element_id": "8298c3f1d0016deb9cbf44832c33480c", "metadata": { "data_source": { "date_created": "2023-08-04T18:31:00.000Z", @@ -105,207 +105,9 @@ }, "filetype": "text/html", "page_number": 
1, - "emphasized_text_contents": [ - "content" - ], - "emphasized_text_tags": [ - "b" - ] + "text_as_html": "







column 1 column 2 pages
c1r1 content c2r1 table
2023-08-08T09:00:00.000-04:00
cell
Page with every block
c1r2 more content c2r2 table cell Untitled
this is some green text this is an equation Untitled
text1 text2 Multiline cell Another cell Untitled
" }, - "text": "c1r1 \n \n content" - }, - { - "type": "UncategorizedText", - "element_id": "6f75c9d2993dbb3981c019741c7962a9", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell" - }, - { - "type": "Title", - "element_id": "5687503bd741f54090d4c0557c0eea1a", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://www.notion.so/c47a45664c7a488bac2a1292ee507fcb" - ], - "link_texts": [ - "\n Page with every block \n " - ] - }, - "text": "Page with every block" - }, - { - "type": "UncategorizedText", - "element_id": "13686520a51e25584bb06ab189b38552", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "/122b2c22996b435b9de2ee0e9d2b04bc" - ], - "link_texts": [ - "\n content\n " - ] - }, - "text": "c1r2 more \n \n content" - }, - { - "type": "UncategorizedText", - "element_id": "cf236cfe4b4c0ef644c37b4e491a4aa8", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "cell" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "c2r2 table \n \n cell" - }, - { - "type": "Title", - "element_id": "f59ab8d1331b7b16952fbd388258f856", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" - ], - "link_texts": [ - "\n Untitled\n " - ] - 
}, - "text": "Untitled" - }, - { - "type": "NarrativeText", - "element_id": "7d96ce60a66271ef79da4c492ca7db8a", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "this is some green text" - }, - { - "type": "NarrativeText", - "element_id": "2d77a706008eebaf1f7c4e116bbe08b4", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "an", - "equation" - ], - "emphasized_text_tags": [ - "b", - "b" - ] - }, - "text": "this is \n \n an \n \n \n equation" - }, - { - "type": "Title", - "element_id": "f59ab8d1331b7b16952fbd388258f856", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://www.notion.so/a1a5dff426f34b8f9a709d51b2a00c73" - ], - "link_texts": [ - "\n Untitled\n " - ] - }, - "text": "Untitled" - }, - { - "type": "UncategorizedText", - "element_id": "7e921a403f1840728e2887990cfe640d", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "text2" - ], - "emphasized_text_tags": [ - "i" - ] - }, - "text": "text1\n\n\n \n text2\n \n \n\nMultiline cell" - }, - { - "type": "Title", - "element_id": "7013d5bb5a17e0e782e8971e23640bdb", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Another cell" - }, - { - "type": "Title", - "element_id": "f59ab8d1331b7b16952fbd388258f856", - "metadata": { - "data_source": { - "date_created": 
"2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://www.notion.so/84002066546448d0a030aa79b8d400b0" - ], - "link_texts": [ - "\n Untitled\n " - ] - }, - "text": "Untitled" + "text": "column 1\n \n \n column 2\n \n \n pages\n \n \n \n c1r1 \n \n content \n \n \n \n c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell\n \n \n \n Page with every block \n \n \n \n \n \n c1r2 more \n \n content\n \n \n \n c2r2 table \n \n cell\n \n \n \n \n Untitled\n \n \n \n \n \n this is some green text\n \n \n this is \n \n an \n \n \n equation\n \n \n \n \n Untitled\n \n \n \n \n \n text1\n\n\n \n text2\n \n \n\nMultiline cell\n \n \n Another cell \n \n \n \n Untitled" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json index e037e6cfe..cede094b1 100644 --- a/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json +++ b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json @@ -1,20 +1,7 @@ [ { - "type": "UncategorizedText", - "element_id": "d6ec04f65fbb09dbefa4210ef201c9c0", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-14 - 2023-08-27" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", + "type": "Table", + "element_id": "2026c17673ac794e40e78d1c8e28df5c", "metadata": { "data_source": { "date_created": "2023-08-02T20:36:00.000Z", @@ -22,632 +9,8 @@ }, "filetype": "text/html", "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - 
"link_texts": [ - "\n Roman Isecke\n " - ] + "text_as_html": "






Completed tasks DatesMy Checkbox My Created ByMy Created TimeMy DateMy EmailMy Person My TextMy formula My multiselectMy number My phone numMy select Sprint IDSprint nameSprint statusTasksTotal tasks URL
0 2023-08-14 - 2023-08-27 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-31 email@custom.domaine DevOps-Bot False Option 1 12 SPRI1-2 Next notion://sprints/sprint_task_relation 1
0 2023-08-28 - 2023-09-10 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00 text More text with link False 45666645345465454 option 1 SPRI1-3 Sprint 3 Future notion://sprints/sprint_task_relation 1
0.25 2023-07-31 - 2023-08-13 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-07 roman@unstructured.io Roman Isecke Jason Scheirer This is someformattedtext TrueOption 2 Option 1 32 1234 option 2 SPRI1-1 Sprint 1 Currentnotion://sprints/sprint_task_relation
4
www.google.com
" }, - "text": "Roman Isecke" - }, - { - "type": "UncategorizedText", - "element_id": "9dfe062b68f15b3623944bd8ebb71b24", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-02T20:36:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "dece647865149e5a86e06c1af7c64aa5", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-31" - }, - { - "type": "EmailAddress", - "element_id": "3357b6f2cc3b8584f1b7e66afbb46d34", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "email@custom.domaine" - }, - { - "type": "Title", - "element_id": "f6c286e4b3078307fc8ae3635b4b2f5b", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "DevOps-Bot" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "DevOps-Bot" - }, - { - "type": "Title", - "element_id": "60a33e6cf5151f2d52eddae9685cfa27", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "False" - }, - { - "type": "Title", - "element_id": "fae2db093e1dd31042e8ab9427e8673a", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Option 1" - ], - "emphasized_text_tags": [ - "span" - ] - }, - "text": "Option 1" - }, - { - "type": 
"UncategorizedText", - "element_id": "6b51d431df5d7f141cbececcf79edf3d", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "12" - }, - { - "type": "Title", - "element_id": "31eac5f6d8daefa258fc494a7e020bc8", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "SPRI1-2" - }, - { - "type": "Title", - "element_id": "1ff57a29d7c9d11bdf61c1b80f2b289b", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Next" - }, - { - "type": "Title", - "element_id": "dfcd7f16dd2d92ee4ec22516fb45abd6", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "notion://sprints/sprint_task_relation" - }, - { - "type": "UncategorizedText", - "element_id": "188dc9ca72be97b25c9fff24f24ae74b", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-28 - 2023-09-10" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ] - }, - "text": "Roman Isecke" - }, - { - "type": "UncategorizedText", - "element_id": "9dfe062b68f15b3623944bd8ebb71b24", - 
"metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-02T20:36:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "c9b64468a792bcbf76cdce6d7ecc3bb9", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00" - }, - { - "type": "Title", - "element_id": "982d9e3eb996f559e633f4d194def376", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "text" - }, - { - "type": "UncategorizedText", - "element_id": "f5cbeacfbddd0de7391bc723762001a6", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "/51243b9d55dc4131b2ae03eff1ef1783" - ], - "link_texts": [ - "\n link\n " - ], - "emphasized_text_contents": [ - "More", - "text", - "text", - "with", - "link" - ], - "emphasized_text_tags": [ - "span", - "span", - "span", - "span", - "span" - ] - }, - "text": "More \n \n \n \n text\n \n \n \n with \n \n \n \n link" - }, - { - "type": "Title", - "element_id": "60a33e6cf5151f2d52eddae9685cfa27", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "False" - }, - { - "type": "UncategorizedText", - "element_id": "710375baee13b41d02266bd01d5f6b34", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - 
"page_number": 1 - }, - "text": "45666645345465454" - }, - { - "type": "Title", - "element_id": "41a49f786d133c212cf1a35177700394", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "option 1" - }, - { - "type": "Title", - "element_id": "502bb591b927c74b9f12ef78df9d5b1b", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "SPRI1-3" - }, - { - "type": "Title", - "element_id": "18e350f89256491ebe1f8cce73a45231", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Sprint 3" - }, - { - "type": "Title", - "element_id": "61636cdef547228389f0260d1dbb952b", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Future" - }, - { - "type": "Title", - "element_id": "dfcd7f16dd2d92ee4ec22516fb45abd6", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "notion://sprints/sprint_task_relation" - }, - { - "type": "UncategorizedText", - "element_id": "a30a043314fa89294fa2c1c989a01fbb", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "0.25" - }, - { - "type": "UncategorizedText", - "element_id": "3dee6959f0ef3a4e6147de48fc70a814", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": 
"2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-07-31 - 2023-08-13" - }, - { - "type": "Title", - "element_id": "548b1cea7491191a12465d055db621f4", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" - ], - "link_texts": [ - "\n Roman Isecke\n " - ] - }, - "text": "Roman Isecke" - }, - { - "type": "UncategorizedText", - "element_id": "9dfe062b68f15b3623944bd8ebb71b24", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-02T20:36:00.000Z" - }, - { - "type": "UncategorizedText", - "element_id": "7e4059ebb0ebf24caae1f12cb79b8c9c", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "2023-08-07" - }, - { - "type": "EmailAddress", - "element_id": "1ae8f7599f4f616683d2a69d29658afa", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "roman@unstructured.io" - }, - { - "type": "UncategorizedText", - "element_id": "5d2e9bcd00123dd21fc54731fef97129", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100", - "https://lh3.googleusercontent.com/a/AAcHTtdiriiUNnUcm1dkAp7cbmmQyeO-acsViQHFS9v0=s100" - ], - "link_texts": [ - "\n Roman Isecke\n ", - 
"\n Jason Scheirer\n " - ], - "emphasized_text_contents": [ - "Roman Isecke", - "Jason Scheirer" - ], - "emphasized_text_tags": [ - "span", - "span" - ] - }, - "text": "Roman Isecke\n \n \n \n \n Jason Scheirer" - }, - { - "type": "NarrativeText", - "element_id": "495e614a2084bd7c40e34b0b69534e67", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "This is some", - "formatted", - "formatted", - "text" - ], - "emphasized_text_tags": [ - "span", - "span", - "b", - "span" - ] - }, - "text": "This is some \n \n \n \n formatted\n \n \n \n text" - }, - { - "type": "Title", - "element_id": "3cbc87c7681f34db4617feaa2c880193", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "True" - }, - { - "type": "UncategorizedText", - "element_id": "4b2e896ce5416db25c44f6918648d0f4", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "emphasized_text_contents": [ - "Option 2", - "Option 1" - ], - "emphasized_text_tags": [ - "span", - "span" - ] - }, - "text": "Option 2\n \n \n Option 1" - }, - { - "type": "UncategorizedText", - "element_id": "e29c9c180c6279b0b02abd6a1801c7c0", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "32" - }, - { - "type": "UncategorizedText", - "element_id": "03ac674216f3e15c761ee1a5e255f067", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "1234" 
- }, - { - "type": "Title", - "element_id": "209ef9fc4dfe2166bcf2460b80334276", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "option 2" - }, - { - "type": "Title", - "element_id": "3fafa60b6782f5d52caf7be755d82232", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "SPRI1-1" - }, - { - "type": "Title", - "element_id": "f931bdb912a40a788890924578a0cff7", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Sprint 1" - }, - { - "type": "Title", - "element_id": "e0d1b68224bf0b31ef16b206c65b5f8f", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Current" - }, - { - "type": "Title", - "element_id": "dfcd7f16dd2d92ee4ec22516fb45abd6", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "notion://sprints/sprint_task_relation" - }, - { - "type": "Title", - "element_id": "191347bfe55d0ca9a574db77bc864827", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "link_urls": [ - "www.google.com" - ], - "link_texts": [ - "\n www.google.com\n " - ] - }, - "text": "www.google.com" + "text": "Completed tasks\n \n \n Dates\n \n \n My Checkbox\n \n \n My Created By\n \n \n My Created Time\n \n \n My Date\n \n \n My Email\n \n \n My Person\n \n \n My 
Text\n \n \n My formula\n \n \n My multiselect\n \n \n My number\n \n \n My phone num\n \n \n My select\n \n \n Sprint ID\n \n \n Sprint name\n \n \n Sprint status\n \n \n Tasks\n \n \n Total tasks\n \n \n URL\n \n \n \n \n 0\n \n \n \n \n 2023-08-14 - 2023-08-27\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-31\n \n \n \n \n email@custom.domaine\n \n \n \n \n \n \n DevOps-Bot\n \n \n \n \n \n \n \n \n \n False\n \n \n \n \n \n Option 1\n \n \n \n \n \n 12\n \n \n \n \n \n \n \n \n \n \n SPRI1-2\n \n \n \n \n \n \n \n Next\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 1\n \n \n \n \n \n \n \n \n 0\n \n \n \n \n 2023-08-28 - 2023-09-10\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00\n \n \n \n \n text\n \n \n \n \n \n \n \n \n More \n \n \n \n text\n \n \n \n with \n \n \n \n link\n \n \n \n \n \n \n False\n \n \n \n \n \n \n \n \n \n \n 45666645345465454\n \n \n \n \n option 1\n \n \n \n \n SPRI1-3\n \n \n \n \n Sprint 3\n \n \n \n \n Future\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 1\n \n \n \n \n \n \n \n \n 0.25\n \n \n \n \n 2023-07-31 - 2023-08-13\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-07\n \n \n \n \n roman@unstructured.io\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n Jason Scheirer\n \n \n \n \n \n \n \n This is some \n \n \n \n formatted\n \n \n \n text\n \n \n \n \n \n True\n \n \n \n \n \n Option 2\n \n \n Option 1\n \n \n \n \n \n 32\n \n \n \n \n 1234\n \n \n \n \n option 2\n \n \n \n \n SPRI1-1\n \n \n \n \n Sprint 1\n \n \n \n \n Current\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 4\n \n \n \n \n www.google.com" } ] \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce8de9130..45743f95c 100644 --- 
a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.14" # pragma: no cover +__version__ = "0.10.15-dev1" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index d3298ad67..8315462dd 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -9,6 +9,7 @@ else: from typing import Final from lxml import etree +from tabulate import tabulate from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes from unstructured.documents.base import Page @@ -19,6 +20,7 @@ from unstructured.documents.elements import ( Link, ListItem, NarrativeText, + Table, Text, Title, ) @@ -53,6 +55,7 @@ class TagsMixin: ancestortags: Sequence[str] = (), links: Sequence[Link] = [], emphasized_texts: Sequence[dict] = [], + text_as_html: Optional[str] = None, **kwargs, ): if tag is None: @@ -62,6 +65,7 @@ class TagsMixin: self.ancestortags = ancestortags self.links = links self.emphasized_texts = emphasized_texts + self.text_as_html = text_as_html super().__init__(*args, **kwargs) @@ -101,6 +105,12 @@ class HTMLListItem(TagsMixin, ListItem): pass +class HTMLTable(TagsMixin, Table): + """Table with tag information""" + + pass + + class HTMLDocument(XMLDocument): """Class for handling HTML documents.
Uses rules based parsing to identify sections of interest within the document.""" @@ -168,6 +178,12 @@ class HTMLDocument(XMLDocument): page.elements.append(element) descendanttag_elems = _get_bullet_descendants(tag_elem, next_element) + elif _is_table_item(tag_elem): + element, next_element = _process_leaf_table_item(tag_elem) + if element is not None: + page.elements.append(element) + descendanttag_elems = tuple(tag_elem.iterdescendants()) + elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0: pages.append(page) page_number += 1 @@ -183,7 +199,7 @@ class HTMLDocument(XMLDocument): def doc_after_cleaners( self, skip_headers_and_footers=False, - skip_table_text=False, + skip_table=False, inplace=False, ) -> HTMLDocument: """Filters the elements and returns a new instance of the class based on the criteria @@ -191,8 +207,8 @@ class HTMLDocument(XMLDocument): page are filtered out. Parameters ---------- - skip_table_text: - If True, skips text that is contained within a table element + skip_table: + If True, skips table element skip_headers_and_footers: If True, ignores any content that is within
or