unstructured/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json
ryannikolaidis 66bf4b0198
feat: support extracting image url in html (#3955)
also removes mimetype when base64 is not included in image metadata

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
2025-03-13 22:41:10 +00:00

631 lines
16 KiB
JSON

[
{
"element_id": "cd153f73463db45ea02bd9ba6ce4168e",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Page with every block",
"type": "UncategorizedText"
},
{
"element_id": "098442d39ccc8a9731627be8a843d02a",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Notion Tip: Tag pages to let collaborators know what they can expect to use the page for. You can add one or many tags to any page in a wiki.",
"type": "NarrativeText"
},
{
"element_id": "868a2b2294814990d664cf13ffd1e2a7",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Heading 2",
"type": "UncategorizedText"
},
{
"element_id": "af888c9a9a14c9c6616cf54ac230c20a",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "This is some new text",
"type": "NarrativeText"
},
{
"element_id": "99388232115e119009419bd8b07c93b9",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"emphasized_text_contents": [
"formatted"
],
"emphasized_text_tags": [
"b"
],
"filetype": "text/html",
"languages": [
"eng"
],
"link_texts": [
"text"
],
"link_urls": [
"/9ba4d6da8a574cfc81ebceac1fde52bd"
]
},
"text": "Some/less → more formatted text with other content and stuff 2023-08-07 : @Roman Isecke",
"type": "UncategorizedText"
},
{
"element_id": "91b9abcc226cbe676d827950030c6702",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>column 1</td><td>column 2</td><td>pages</td></tr><tr><td>c1r1 content</td><td>c2r1 table <br/> 2023-08-08T09:00:00.000-04:00<br/> cell</td><td>Page with every block</td></tr><tr><td>c1r2 more content</td><td>c2r2 table cell</td><td>Untitled</td></tr><tr><td>this is some green text</td><td>this is an equation</td><td>Untitled</td></tr><tr><td>text1 text2 Multiline cell</td><td>Another cell</td><td>Untitled</td></tr></table>"
},
"text": "column 1 column 2 pages c1r1 content c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell Page with every block c1r2 more content c2r2 table cell Untitled this is some green text this is an equation Untitled text1 text2 Multiline cell Another cell Untitled",
"type": "Table"
},
{
"element_id": "0b73b1397f01db39dc98a983bd3aeb3d",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "E = {mc^2}",
"type": "UncategorizedText"
},
{
"element_id": "7535c23e3c0bda50ea38df65f7a64bca",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Numbered list",
"type": "ListItem"
},
{
"element_id": "155061ede32096c81085eabf421f9fe0",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "A number child",
"type": "ListItem"
},
{
"element_id": "1ff4a64dcc74b4cbdf4270776c2adab0",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "A number grandchild",
"type": "ListItem"
},
{
"element_id": "9e0342a8c3a010f7802d874fa447f72b",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "great",
"type": "ListItem"
},
{
"element_id": "240e4a3a9b5843192b03086325da2169",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "super great",
"type": "ListItem"
},
{
"element_id": "d1e6a3da60ba834365b2230689c4d8a6",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "with test text",
"type": "ListItem"
},
{
"element_id": "db78c6b732dc265e380889e394c6354f",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Bullet one",
"type": "ListItem"
},
{
"element_id": "f31b201c44870108f395a238bff36413",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "A child bullet",
"type": "ListItem"
},
{
"element_id": "5929608d0a4d2f055635bbab72df26ec",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "A grandchild bullet",
"type": "ListItem"
},
{
"element_id": "1e93d6f8cf7c8af51ddf222be77b4882",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "great",
"type": "ListItem"
},
{
"element_id": "c53244024b7b1e86b20bcc1489d9dc4a",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "super great",
"type": "ListItem"
},
{
"element_id": "3602b0a8a126be064654623590163f49",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Bullet two",
"type": "ListItem"
},
{
"element_id": "27d5b17e90250d77a76da1f6d93f8e8b",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "I quote myself testings Notion",
"type": "NarrativeText"
},
{
"element_id": "8831856d3670d91d6fa2121af0694022",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
],
"link_texts": [
"https://www.notion.so/icons/airplane_brown.svg"
],
"link_urls": [
"https://www.notion.so/icons/airplane_brown.svg"
]
},
"text": "https://www.notion.so/icons/airplane_brown.svg I call this out",
"type": "NarrativeText"
},
{
"element_id": "df59e087da5910b2cb1c98801bb24c85",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
],
"link_texts": [
"https://www.wikipedia.org/"
],
"link_urls": [
"https://www.wikipedia.org/"
]
},
"text": "https://www.wikipedia.org/",
"type": "UncategorizedText"
},
{
"element_id": "0f215d56b4a1fc900dc2dad40b7df66f",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
],
"link_texts": [
"https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk"
],
"link_urls": [
"https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk"
]
},
"text": "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk",
"type": "UncategorizedText"
},
{
"element_id": "5da75c186c36d3117e60f08d49e66085",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Child Database:",
"type": "UncategorizedText"
},
{
"element_id": "a82757a2b9004569ab1761d061847bd3",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
],
"link_texts": [
"Analytics"
],
"link_urls": [
"https://www.notion.so/d1fad658f1cf4eedb0b5ee72b9f0b530"
]
},
"text": "Analytics",
"type": "UncategorizedText"
},
{
"element_id": "29a6be22a8770f106f54f4abcdc1de68",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Child Page:",
"type": "UncategorizedText"
},
{
"element_id": "d07d54a1ce286a7679952d4e4ce82c8e",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
],
"link_texts": [
"Untitled"
],
"link_urls": [
"https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd"
]
},
"text": "Untitled",
"type": "UncategorizedText"
},
{
"element_id": "d4c02f5b35a00e87ef7be603d82c5df3",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "s = \"this is some code\"",
"type": "NarrativeText"
},
{
"element_id": "59aab31c8b60641b906a81db51c596a6",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "This is my code caption",
"type": "NarrativeText"
},
{
"element_id": "7fc741d4226b15a910af95ff3fde6253",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "This is some text",
"type": "NarrativeText"
},
{
"element_id": "f67f0aef4f1ceb0fa98491872aa741ac",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "This is text in next column",
"type": "NarrativeText"
},
{
"element_id": "f08a88064f2c33164502652db93fad32",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Final text in column",
"type": "UncategorizedText"
},
{
"element_id": "fa3e9d761730605036aaf854d9edd5b4",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Heading 1 content",
"type": "NarrativeText"
},
{
"element_id": "387c4d334f8e9650a56b3b444b2ad5f6",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"image_url": "https://media4.giphy.com/media/26FPsOhR3tyQRTc2Y/giphy.gif?cid=7941fdc68sl3vdqajgosqug9hfhg3zq3t5yoflyy9p7y66q0&ep=v1_gifs_trending&rid=giphy.gif&ct=g",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "60d9f47b086264ea72277b741e3b2bdd",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "d3d87fc6-61cc-4bb5-89ed-e9dff0df1526",
"type": "UncategorizedText"
},
{
"element_id": "b39f61345657ccc5e201c20a6a90fad7",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Stuff todo",
"type": "UncategorizedText"
},
{
"element_id": "b95452fe8c6616a1ce1311457526c302",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "more stuff todo",
"type": "UncategorizedText"
},
{
"element_id": "a7c3ee9360b2020e28aa31835ef5283c",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "More things to do",
"type": "NarrativeText"
},
{
"element_id": "349f058fcce7e32bb68b620841f40c9e",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Something to do",
"type": "NarrativeText"
}
]