mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-12 03:25:54 +00:00

### Summary Remove the `page_number` metadata fields until page counting is supported for all kinds of HTML files (not just news articles with multiple `<article>` tags). ### Test The unit tests `test_add_chunking_strategy_on_partition_html_respects_multipage` and `test_add_chunking_strategy_title_on_partition_auto_respects_multipage` were removed because they rely on the `page_number` fields from the SEC HTML file; that coverage has moved to a mock test for `chunk_by_title` -> revisit those tests once a suitable test file is found. The element ids in the partition outputs for HTML files also changed, because the page number is an input to element id hashing -> todo ticket: update the other deterministic element id tests per crag's comment. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
341 lines
10 KiB
JSON
341 lines
10 KiB
JSON
[
|
||
{
|
||
"element_id": "87d54efb69679f52b8c22f98f5ee6008",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"text_as_html": "<table><tr><td>Driver</td><td></td></tr><tr><td>Approver</td><td></td></tr><tr><td>Contributors</td><td></td></tr><tr><td>Informed</td><td></td></tr><tr><td>Objective</td><td></td></tr><tr><td>Due date</td><td></td></tr><tr><td>Key outcomes</td><td></td></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
|
||
},
|
||
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "c5528cff9dc4266b252f172314e2221c",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "\\uD83E\\uDD14 Problem Statement",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "7745f8f335c0d63ac895f39209fa50bf",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "🎯 Scope",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "a67a84caf3e93f9d3c6ee9462f6ac7bb",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"text_as_html": "<table><tr><td>Must have:</td><td></td></tr><tr><td>Nice to have:</td><td></td></tr><tr><td>Not in scope:</td><td></td></tr></table>"
|
||
},
|
||
"text": "Must have: Nice to have: Not in scope:",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "eb8b47b67216ffd84479631af63b656a",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "\\uD83D\\uDDD3 Timeline",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "7aa58f6123e145d68b491d3e735060f8",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Lane 1",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "21354bac4c070eaa9722a971e6bdbfea",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Lane 2",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "2fac077cc411e658746e76d86ea1ec37",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Feature 1",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "ff8497516144be25a4c0922f14c6ee28",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Feature 2",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "0f701ee7f075b9b83ca75e844ab8184a",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Feature 3",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "edf428e92bdb9e94ac17f876cdf7c058",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Feature 4",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "4d846dbdfa5783f976e41e1852ffb179",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "iOS app",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "4fb022f234174c8dc5df55dd0c677833",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "Android app",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "a70dd893a3b6bd5c641812c25763f01a",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "\\uD83D\\uDEA9 Milestones and deadlines",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "15b3d2fa95017389c5c47d1c5fc64b4d",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
|
||
},
|
||
"text": "Milestone Owner Deadline Status",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "edd10b45a03a9b9dac237354faa55251",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
]
|
||
},
|
||
"text": "\\uD83D\\uDD17 Reference materials",
|
||
"type": "Title"
|
||
}
|
||
] |