mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 04:11:08 +00:00
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842 Main changes compared to part one: * hash computation includes element's sequence number on page, page number, document filename and its text * there are more test for deterministic behavior of IDs returned by partitioning functions + their uniqueness (guaranteed at the document level, and high probability across multiple documents) This PR addresses the following issue: https://github.com/Unstructured-IO/unstructured/issues/2461
100 lines
4.3 KiB
JSON
100 lines
4.3 KiB
JSON
[
|
|
{
|
|
"element_id": "17e9a90f9616f2abed8cf32b5bd3810d",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-06-16T05:05:05+00:00",
|
|
"date_modified": "2023-06-16T05:05:05+00:00",
|
|
"record_locator": {
|
|
"server_path": "/Shared Documents/stanley-cups.xlsx",
|
|
"site_url": "https://unstructuredio.sharepoint.com"
|
|
},
|
|
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
|
|
"version": "1"
|
|
},
|
|
"filename": "stanley-cups.xlsx",
|
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_name": "Stanley Cups",
|
|
"page_number": 1
|
|
},
|
|
"text": "Stanley Cups",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "8d70ea477d9db14ed01ff1d39a118a42",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-06-16T05:05:05+00:00",
|
|
"date_modified": "2023-06-16T05:05:05+00:00",
|
|
"record_locator": {
|
|
"server_path": "/Shared Documents/stanley-cups.xlsx",
|
|
"site_url": "https://unstructuredio.sharepoint.com"
|
|
},
|
|
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
|
|
"version": "1"
|
|
},
|
|
"filename": "stanley-cups.xlsx",
|
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_name": "Stanley Cups",
|
|
"page_number": 1,
|
|
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
|
|
},
|
|
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n",
|
|
"type": "Table"
|
|
},
|
|
{
|
|
"element_id": "ee34bd8c186b57e3530d5443ffa58122",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-06-16T05:05:05+00:00",
|
|
"date_modified": "2023-06-16T05:05:05+00:00",
|
|
"record_locator": {
|
|
"server_path": "/Shared Documents/stanley-cups.xlsx",
|
|
"site_url": "https://unstructuredio.sharepoint.com"
|
|
},
|
|
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
|
|
"version": "1"
|
|
},
|
|
"filename": "stanley-cups.xlsx",
|
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_name": "Stanley Cups Since 67",
|
|
"page_number": 2
|
|
},
|
|
"text": "Stanley Cups Since 67",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "310cd42767ffd563f6639210df793c5b",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-06-16T05:05:05+00:00",
|
|
"date_modified": "2023-06-16T05:05:05+00:00",
|
|
"record_locator": {
|
|
"server_path": "/Shared Documents/stanley-cups.xlsx",
|
|
"site_url": "https://unstructuredio.sharepoint.com"
|
|
},
|
|
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
|
|
"version": "1"
|
|
},
|
|
"filename": "stanley-cups.xlsx",
|
|
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_name": "Stanley Cups Since 67",
|
|
"page_number": 2,
|
|
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
|
|
},
|
|
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n",
|
|
"type": "Table"
|
|
}
|
|
] |