Michał Martyniak 2d1923ac7e
Better element IDs - deterministic and document-unique hashes (#2673)
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes the element's sequence number on the page, the page
number, the document filename and the element's text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and with high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461
2024-04-24 00:05:20 -07:00

100 lines
4.3 KiB
JSON

[
{
"element_id": "17e9a90f9616f2abed8cf32b5bd3810d",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:05:05+00:00",
"date_modified": "2023-06-16T05:05:05+00:00",
"record_locator": {
"server_path": "/Shared Documents/stanley-cups.xlsx",
"site_url": "https://unstructuredio.sharepoint.com"
},
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
"version": "1"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"eng"
],
"page_name": "Stanley Cups",
"page_number": 1
},
"text": "Stanley Cups",
"type": "Title"
},
{
"element_id": "8d70ea477d9db14ed01ff1d39a118a42",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:05:05+00:00",
"date_modified": "2023-06-16T05:05:05+00:00",
"record_locator": {
"server_path": "/Shared Documents/stanley-cups.xlsx",
"site_url": "https://unstructuredio.sharepoint.com"
},
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
"version": "1"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"eng"
],
"page_name": "Stanley Cups",
"page_number": 1,
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n",
"type": "Table"
},
{
"element_id": "ee34bd8c186b57e3530d5443ffa58122",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:05:05+00:00",
"date_modified": "2023-06-16T05:05:05+00:00",
"record_locator": {
"server_path": "/Shared Documents/stanley-cups.xlsx",
"site_url": "https://unstructuredio.sharepoint.com"
},
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
"version": "1"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"eng"
],
"page_name": "Stanley Cups Since 67",
"page_number": 2
},
"text": "Stanley Cups Since 67",
"type": "Title"
},
{
"element_id": "310cd42767ffd563f6639210df793c5b",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:05:05+00:00",
"date_modified": "2023-06-16T05:05:05+00:00",
"record_locator": {
"server_path": "/Shared Documents/stanley-cups.xlsx",
"site_url": "https://unstructuredio.sharepoint.com"
},
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
"version": "1"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"eng"
],
"page_name": "Stanley Cups Since 67",
"page_number": 2,
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n",
"type": "Table"
}
]