Michał Martyniak 2d1923ac7e
Better element IDs - deterministic and document-unique hashes (#2673)
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for the deterministic behavior of IDs returned by
partitioning functions and for their uniqueness (guaranteed at the document
level, and with high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461
2024-04-24 00:05:20 -07:00

222 lines
6.5 KiB
JSON

[
{
"element_id": "9b56cbe96f509d43ba706a4faaf671f0",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 1",
"type": "NarrativeText"
},
{
"element_id": "e89817ef5b10d7bd5cacc62083530114",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 2",
"type": "NarrativeText"
},
{
"element_id": "e38e0e00f7fb8d4b32271530cfde5c3a",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 3",
"type": "NarrativeText"
},
{
"element_id": "3885e42cbd9d0c4c9854475c75450b29",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 4",
"type": "NarrativeText"
},
{
"element_id": "69e8d160b997383de9ef07ede5fd0693",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 5",
"type": "NarrativeText"
},
{
"element_id": "32bb7799e53f295ef8a92d93634fb753",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 6",
"type": "NarrativeText"
},
{
"element_id": "9a2c033a0ae7cbe306a7032162911085",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 7",
"type": "NarrativeText"
},
{
"element_id": "529da18f70a29f756ffba75f10feefa0",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 8",
"type": "NarrativeText"
},
{
"element_id": "bc115408c67b2109fd0730d7fd013e12",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 9",
"type": "NarrativeText"
},
{
"element_id": "4714a0ee9493d579ec617bda1a900b45",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:19.072000",
"date_modified": "2023-07-11T17:02:21.081000",
"record_locator": {
"page_id": "1867777",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1867777",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Testdoc1 has only this text for 10 times: 10",
"type": "NarrativeText"
}
]