Michał Martyniak 2d1923ac7e
Better element IDs - deterministic and document-unique hashes (#2673)
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes the element's sequence number on the page, the page
number, the document filename, and the element's text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461
2024-04-24 00:05:20 -07:00

307 lines
8.4 KiB
JSON

[
{
"element_id": "375de74537bbe1fd31021086d68769bf",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Copy and paste this section for each week.",
"type": "Title"
},
{
"element_id": "cab64c08a463fe95a7fbd921fc640d21",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"emphasized_text_contents": [
"Win"
],
"emphasized_text_tags": [
"strong"
],
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Win",
"type": "Title"
},
{
"element_id": "c2c9596ed6bee008aae3ce4db696ac24",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "ae07661a728e59fbfef629ac4f428156",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "d3c4f46e6a1e6e61eb2326e355a939fe",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "3af48952e1993daadd5ed991d577d0e8",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"emphasized_text_contents": [
"Needs input"
],
"emphasized_text_tags": [
"strong"
],
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Needs input",
"type": "Title"
},
{
"element_id": "7df2e781177efd8f71499e8c625822a4",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "03bb9a55d87cb58486edb4d6489566ce",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "c89bb55847e2ffad844e85a9669ab060",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "5425cdbf43e1a3479d7040c9652a8def",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"emphasized_text_contents": [
"Focus"
],
"emphasized_text_tags": [
"strong"
],
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Focus",
"type": "Title"
},
{
"element_id": "3efe995111b5e90b3c048508acefb393",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "44d35bf5e10cef76e3cb4541988c7ad3",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "",
"type": "ListItem"
},
{
"element_id": "53db392c4a0f76b2ade29aefe3ef9eca",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.226000",
"date_modified": "2023-07-09T12:54:45.226000",
"record_locator": {
"page_id": "1605942",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><tr><td>Notes</td><td></td></tr><tr><td>Important Links</td><td></td></tr></table>"
},
"text": "Notes Important Links",
"type": "Table"
}
]