mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-29 16:17:00 +00:00
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842 Main changes compared to part one: * hash computation includes element's sequence number on page, page number, document filename and its text * there are more test for deterministic behavior of IDs returned by partitioning functions + their uniqueness (guaranteed at the document level, and high probability across multiple documents) This PR addresses the following issue: https://github.com/Unstructured-IO/unstructured/issues/2461
50 lines
1.4 KiB
JSON
50 lines
1.4 KiB
JSON
[
|
|
{
|
|
"element_id": "5df397cc48371b33609230377fff0698",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-04-07T19:57:55.231559",
|
|
"date_modified": "2023-04-07T22:19:56.520459"
|
|
},
|
|
"filename": "C052BGT7718.xml",
|
|
"filetype": "application/xml",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "testing <reply> testing <reply> threads are cool",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "600e3fddd88eef2f72e62382247d0444",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-04-07T19:57:55.231559",
|
|
"date_modified": "2023-04-07T22:19:56.520459"
|
|
},
|
|
"filename": "C052BGT7718.xml",
|
|
"filetype": "application/xml",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "<@U051UBRR946> has joined the channel <reply> <@U051UBRR946> has joined the channel",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "05ac2c54bb9f4ab5c69faf65d75fed0c",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-04-07T19:57:55.231559",
|
|
"date_modified": "2023-04-07T22:19:56.520459"
|
|
},
|
|
"filename": "C052BGT7718.xml",
|
|
"filetype": "application/xml",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "<@U04ST78RXU3> has joined the channel <reply> <@U04ST78RXU3> has joined the channel",
|
|
"type": "NarrativeText"
|
|
}
|
|
] |