Michał Martyniak 2d1923ac7e
Better element IDs - deterministic and document-unique hashes (#2673)
Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes the element's sequence number on the page, the
page number, the document filename, and the element's text
* there are more tests for the deterministic behavior of IDs returned by
partitioning functions and for their uniqueness (guaranteed at the document
level, and with high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461
2024-04-24 00:05:20 -07:00

443 lines
15 KiB
JSON

[
{
"element_id": "c52f774b6fc6331eececdcbc6ecefb9c",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "IssueID_IssueKey:10000 JCTP1-1",
"type": "Title"
},
{
"element_id": "d23954deeb8d0bb7df109d8fec184b3a",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "ProjectID_Key:JCTP1 Jira Connector Test Project 1",
"type": "Title"
},
{
"element_id": "52555c9a2c6077d81ac167a1a97adef4",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "IssueType:Task",
"type": "Title"
},
{
"element_id": "a1a11be1a987330ca2f6c979a0d40eec",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Status:In Progress",
"type": "Title"
},
{
"element_id": "b26cc731855ff363c52165204d75c406",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Priority:{'self': 'https://unstructured-jira-connector-test.atlassian.net/rest/api/2/priority/3', 'iconUrl': 'https://unstructured-jira-connector-test.atlassian.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}",
"type": "Title"
},
{
"element_id": "2074380252e31d043b89884c5ce73b06",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "AssigneeID_Name:712020:7bc7fdcb-67e7-435d-b4a2-128aee12820c Unstructured Devops",
"type": "Title"
},
{
"element_id": "0dd444ee4fa13d624c1ddb7510293394",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "ReporterAdr_Name:devops+jira-connector@unstructured.io Unstructured Devops",
"type": "Title"
},
{
"element_id": "5a66904e052e488c34a0c36c27790d42",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Labels:Label1 Label2",
"type": "Title"
},
{
"element_id": "fd4a3081978054b271b72f08d2562805",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Components:",
"type": "Title"
},
{
"element_id": "8b0a08e354d247728fc6653b29012097",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Unstructured Devops My comment 1 Unstructured Devops My attachment image lorem ipsum:",
"type": "UncategorizedText"
},
{
"element_id": "6545a906fa2f77f703eb6b1927c75f1e",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "!image",
"type": "Title"
},
{
"element_id": "ab5fe61f63375b93e729660db18202dc",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "20230823",
"type": "UncategorizedText"
},
{
"element_id": "8fa3dc3c6d3fdfe4593bfaced6b185d0",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "143650.png|width=83.33333333333333%!",
"type": "UncategorizedText"
},
{
"element_id": "1363bb7c51c2d1648c261b14ebef3a71",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Test todo 1",
"type": "Title"
},
{
"element_id": "2c5711aa2233e731c9f9738150422556",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. [Nam quid possumus facere melius?|http://loripsum.net/] Ita relinquet duas, de quibus etiam atque etiam consideret. Quo modo autem philosophus loquitur? Quid est enim aliud esse versutum? His enim rebus detractis negat se reperire in asotorum vita quod reprehendat. Non est ista, inquam, Piso, magna dissensio. Duo Reges: constructio interrete. In eo enim positum est id, quod dicimus esse expetendum. Traditur, inquit, ab Epicuro ratio neglegendi doloris. Negat enim summo bono afferre incrementum diem. Aberat omnis dolor, qui si adesset, nec molliter ferret et tamen medicis plus quam philosophis uteretur.",
"type": "NarrativeText"
},
{
"element_id": "8f1fa05896c5a3572a64b3d27e35c430",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "Sedulo, inquam, faciam. Ergo, si semel tristior effectus est, hilara vita amissa est? Quamquam tu hanc copiosiorem etiam soles dicere. An eum locum libenter invisit, ubi Demosthenes et Aeschines inter se decertare soliti sunt? _Quippe: habes enim a rhetoribus;_ Non minor, inquit, voluptas percipitur ex vilissimis rebus quam ex pretiosissimis. Ut in geometria, prima si dederis, danda sunt omnia. Negat enim summo bono afferre incrementum diem.",
"type": "NarrativeText"
},
{
"element_id": "7861822edc33ade3d98c486c80666616",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "[Et nemo nimium beatus est;|http://loripsum.net/] Nam et complectitur verbis, quod vult, et dicit plane, quod intellegam; Ab his oratores, ab his imperatores ac rerum publicarum principes extiterunt. Ergo adhuc, quantum equidem intellego, causa non videtur fuisse mutandi nominis. Quis enim redargueret? Ita fit cum gravior, tum etiam splendidior oratio. Sed ut iis bonis erigimur, quae expectamus, sic laetamur iis, quae recordamur. _Bork_ Tubulum fuisse, qua illum, cuius is condemnatus est rogatione, P. [Eiuro, inquit adridens, iniquum, hac quidem de re;|http://loripsum.net/] Si quae forte-possumus.",
"type": "NarrativeText"
},
{
"element_id": "0ce97695971b0a37dc3a95c2f42debab",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "https://unstructured",
"type": "NarrativeText"
},
{
"element_id": "f5c47725c8fdccd883bde6f48513191b",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "jira",
"type": "Title"
},
{
"element_id": "f0033dc63737a3cfb0df923acdd79a35",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "connector",
"type": "Title"
},
{
"element_id": "b6ad345669dcdd6b7523d5fd87fdb0d1",
"metadata": {
"data_source": {
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": "test.atlassian.net/rest/api/2/attachment/10000",
"type": "Title"
}
]