Yuming Long 542d442699
chore CORE-4775: remove html page number metadata field (#2942)
### Summary

Remove the page_number metadata field until we have page counting for all
kinds of HTML files (not just news articles with multiple
`<article>` tags).

### Test
The unit tests
`test_add_chunking_strategy_on_partition_html_respects_multipage` and
`test_add_chunking_strategy_title_on_partition_auto_respects_multipage`
were removed since they rely on the `page_number` fields from the SEC html
file. That coverage has moved to a mock test for chunk_by_title; revisit those
tests when we find a suitable test file for this.

Also changed the element ids in the partition outputs for html files:
the ids change because the page number is part of the element id hashing.
There is a todo ticket to update the other deterministic element id tests per
crag's comment.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
2024-04-30 15:20:26 +00:00

341 lines
10 KiB
JSON
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

[
{
"element_id": "87d54efb69679f52b8c22f98f5ee6008",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Driver</td><td></td></tr><tr><td>Approver</td><td></td></tr><tr><td>Contributors</td><td></td></tr><tr><td>Informed</td><td></td></tr><tr><td>Objective</td><td></td></tr><tr><td>Due date</td><td></td></tr><tr><td>Key outcomes</td><td></td></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
},
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
"type": "Table"
},
{
"element_id": "c5528cff9dc4266b252f172314e2221c",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "\\uD83E\\uDD14 Problem Statement",
"type": "Title"
},
{
"element_id": "7745f8f335c0d63ac895f39209fa50bf",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "🎯 Scope",
"type": "Title"
},
{
"element_id": "a67a84caf3e93f9d3c6ee9462f6ac7bb",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Must have:</td><td></td></tr><tr><td>Nice to have:</td><td></td></tr><tr><td>Not in scope:</td><td></td></tr></table>"
},
"text": "Must have: Nice to have: Not in scope:",
"type": "Table"
},
{
"element_id": "eb8b47b67216ffd84479631af63b656a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "\\uD83D\\uDDD3 Timeline",
"type": "Title"
},
{
"element_id": "7aa58f6123e145d68b491d3e735060f8",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Lane 1",
"type": "Title"
},
{
"element_id": "21354bac4c070eaa9722a971e6bdbfea",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Lane 2",
"type": "Title"
},
{
"element_id": "2fac077cc411e658746e76d86ea1ec37",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Feature 1",
"type": "Title"
},
{
"element_id": "ff8497516144be25a4c0922f14c6ee28",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Feature 2",
"type": "Title"
},
{
"element_id": "0f701ee7f075b9b83ca75e844ab8184a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Feature 3",
"type": "Title"
},
{
"element_id": "edf428e92bdb9e94ac17f876cdf7c058",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Feature 4",
"type": "Title"
},
{
"element_id": "4d846dbdfa5783f976e41e1852ffb179",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "iOS app",
"type": "Title"
},
{
"element_id": "4fb022f234174c8dc5df55dd0c677833",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Android app",
"type": "Title"
},
{
"element_id": "a70dd893a3b6bd5c641812c25763f01a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "\\uD83D\\uDEA9 Milestones and deadlines",
"type": "Title"
},
{
"element_id": "15b3d2fa95017389c5c47d1c5fc64b4d",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
},
"text": "Milestone Owner Deadline Status",
"type": "Table"
},
{
"element_id": "edd10b45a03a9b9dac237354faa55251",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "\\uD83D\\uDD17 Reference materials",
"type": "Title"
}
]