mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 04:11:08 +00:00
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842 Main changes compared to part one: * hash computation includes element's sequence number on page, page number, document filename and its text * there are more test for deterministic behavior of IDs returned by partitioning functions + their uniqueness (guaranteed at the document level, and high probability across multiple documents) This PR addresses the following issue: https://github.com/Unstructured-IO/unstructured/issues/2461
116 lines
3.0 KiB
JSON
116 lines
3.0 KiB
JSON
[
|
|
{
|
|
"element_id": "35a1916e54745e2962b10e0e20ebf05a",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-10-17T00:59:03.435000+00:00",
|
|
"date_modified": "2023-10-17T02:57:25.829000+00:00",
|
|
"record_locator": {
|
|
"hubspot_id": "2002294392"
|
|
}
|
|
},
|
|
"filename": "2002294392.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Issue with tests",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "21d773f14ac7182d556fe92eb1fa1101",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-10-17T00:59:03.435000+00:00",
|
|
"date_modified": "2023-10-17T02:57:25.829000+00:00",
|
|
"record_locator": {
|
|
"hubspot_id": "2002294392"
|
|
}
|
|
},
|
|
"filename": "2002294392.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "There's no issue, only another sample ticket for testing purposes.",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "806bff0a2334b447631bc536fa31881e",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-10-17T00:59:03.435000+00:00",
|
|
"date_modified": "2023-10-17T02:57:25.829000+00:00",
|
|
"record_locator": {
|
|
"hubspot_id": "2002294392"
|
|
}
|
|
},
|
|
"filename": "2002294392.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "This is an empty custom property.",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "48885c342b4049f0d376ae5381788df0",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-10-17T00:59:03.435000+00:00",
|
|
"date_modified": "2023-10-17T02:57:25.829000+00:00",
|
|
"record_locator": {
|
|
"hubspot_id": "2002294392"
|
|
}
|
|
},
|
|
"filename": "2002294392.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "This is an empty custom property.",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "6e5ec9d255a6eab24548091f4d330a88",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-10-17T00:59:03.435000+00:00",
|
|
"date_modified": "2023-10-17T02:57:25.829000+00:00",
|
|
"record_locator": {
|
|
"hubspot_id": "2002294392"
|
|
}
|
|
},
|
|
"filename": "2002294392.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "This is an empty custom property.",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "1d4569c9e39dc07803a085a4ca9d9de4",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-10-17T00:59:03.435000+00:00",
|
|
"date_modified": "2023-10-17T02:57:25.829000+00:00",
|
|
"record_locator": {
|
|
"hubspot_id": "2002294392"
|
|
}
|
|
},
|
|
"filename": "2002294392.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "This is an empty custom property.",
|
|
"type": "NarrativeText"
|
|
}
|
|
] |