mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 23:58:13 +00:00
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842 Main changes compared to part one: * hash computation includes element's sequence number on page, page number, document filename and its text * there are more test for deterministic behavior of IDs returned by partitioning functions + their uniqueness (guaranteed at the document level, and high probability across multiple documents) This PR addresses the following issue: https://github.com/Unstructured-IO/unstructured/issues/2461
162 lines
4.7 KiB
JSON
162 lines
4.7 KiB
JSON
[
|
|
{
|
|
"element_id": "127585a1914bd1ae38acced4f4ad22bb",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "1908",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "0b55941362343ce910c3802b550b0743",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "The Fight for Freedom",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "fde748a21a8cf5a745a226c690ee7fc6",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "American",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "df3501d28b75a13688004689bf309152",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "D. W. Griffith",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "8100c07d4ae2fee23005e5e16266c098",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Florence Auer, John G. Adolfi",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "9de21915e66988e57ca4415cbccb3081",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "western",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "2c5ed5302ad8ff4e4c7504aafc7383fc",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "https://en.wikipedia.org/wiki/The_Fight_for_Freedom",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "a77097f8ccd050951925e64f138050e5",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2024-01-09T20:39:22+00:00",
|
|
"record_locator": {
|
|
"collection": "sample-mongodb-data",
|
|
"document_id": "659daefa21dd8c9054b084b9",
|
|
"host": null
|
|
}
|
|
},
|
|
"filename": "659daefa21dd8c9054b084b9.txt",
|
|
"filetype": "text/plain",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.",
|
|
"type": "NarrativeText"
|
|
}
|
|
] |