Michał Martyniak 2d1923ac7e
Better element IDs - deterministic and document-unique hashes (#2673)
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes the element's sequence number on the page, the page
number, the document filename, and the element's text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and with high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461
2024-04-24 00:05:20 -07:00

222 lines
6.9 KiB
JSON

[
{
"element_id": "076745acc2a2dbc1c39d2d9b5b559c4e",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "IssueID_IssueKey:10009 JCTP2-7",
"type": "Title"
},
{
"element_id": "959cecf5d4d39631be82c8df2f95acf9",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "ProjectID_Key:JCTP2 Jira Connector Test Project 2",
"type": "Title"
},
{
"element_id": "4116333b97e6759ee68a7209d38da6f9",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "IssueType:Task",
"type": "Title"
},
{
"element_id": "d320ed9b7fa6aed26616dedbeb700e5a",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "Status:To Do",
"type": "Title"
},
{
"element_id": "01290ead31043f090f43afcdcd2ed05f",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "Priority:{'self': 'https://unstructured-jira-connector-test.atlassian.net/rest/api/2/priority/3', 'iconUrl': 'https://unstructured-jira-connector-test.atlassian.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}",
"type": "Title"
},
{
"element_id": "2d0c6e97e61ed3f599d3e1cdf8e467b1",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "AssigneeID_Name:712020:7bc7fdcb-67e7-435d-b4a2-128aee12820c Unstructured Devops",
"type": "Title"
},
{
"element_id": "75e6120922bd79ac809fe32c560ea9ec",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "ReporterAdr_Name:devops+jira-connector@unstructured.io Unstructured Devops",
"type": "Title"
},
{
"element_id": "e11eb01b2079feae05687330d09f97d8",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "Labels:",
"type": "Title"
},
{
"element_id": "726ac8329aab3f9e8950356c6ee048b2",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "Components:",
"type": "Title"
},
{
"element_id": "1e1c346c36cb06543d2a90455e8028c8",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "Test Epic 1 Task 3",
"type": "Title"
},
{
"element_id": "79ca0ebb540d4190af175827424a9ba6",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:35:25.467000+00:00",
"date_modified": "2023-08-22T11:35:30.285000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP2-7"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP2-7"
},
"filetype": "text/plain",
"languages": [
"eng"
]
},
"text": "{}",
"type": "UncategorizedText"
}
]