Steve Canny ee9be2a3b2
fix: assorted partition_html() bugs (#2113)
Addresses a cluster of HTML-related bugs:
- empty table is identified as bulleted-table
- `partition_html()` emits empty (no text) tables (#1928)
- `.text_as_html` contains inappropriate `<br>` elements in invalid
locations.
- cells enclosed in `<thead>` and `<tfoot>` elements are dropped (#1928)
- `.text_as_html` contains whitespace padding

Each of these is addressed in a separate commit below.

Fixes #1928.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
2023-11-20 16:29:32 +00:00

357 lines
11 KiB
JSON
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

[
{
"element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><tr><td>Driver</td></tr><tr><td>Approver</td></tr><tr><td>Contributors</td></tr><tr><td>Informed</td></tr><tr><td>Objective</td></tr><tr><td>Due date</td></tr><tr><td>Key outcomes</td></tr><tr><td>Status</td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr></table>"
},
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
"type": "Table"
},
{
"element_id": "4e2022d4483a407d85060675f64fbe17",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83E\\uDD14 Problem Statement",
"type": "Title"
},
{
"element_id": "81163675915a75217e4116686fdca412",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "🎯 Scope",
"type": "Title"
},
{
"element_id": "f1f364fbde77afa0e99e8ea7ab4f7c3f",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><tr><td>Must have:</td></tr><tr><td>Nice to have:</td></tr><tr><td>Not in scope:</td></tr></table>"
},
"text": "Must have: Nice to have: Not in scope:",
"type": "Table"
},
{
"element_id": "e8b61a28d07e977379b42df455a1cde4",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83D\\uDDD3 Timeline",
"type": "Title"
},
{
"element_id": "5043f71fbc70e35c0be413d4135be99f",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Lane 1",
"type": "Title"
},
{
"element_id": "d5a2e177c588bf0c4f914baa4fae85b6",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Lane 2",
"type": "Title"
},
{
"element_id": "c98ba1acbd22a15ddddfc244cbd8a2db",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 1",
"type": "Title"
},
{
"element_id": "e04620c8b3b611b3fefecef89baa63a9",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 2",
"type": "Title"
},
{
"element_id": "82e522a86692cc50ee5c020c8e6ce6a0",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 3",
"type": "Title"
},
{
"element_id": "822f7c45ea725c535970aab819a8ff10",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 4",
"type": "Title"
},
{
"element_id": "6e0f6eca4ff17d3377c1c3e8e1f73457",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "iOS app",
"type": "Title"
},
{
"element_id": "0b60fe04b3c5c3c76371b6eca8b19c8e",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Android app",
"type": "Title"
},
{
"element_id": "e1cc184f345d146586fb12527c4fa696",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83D\\uDEA9 Milestones and deadlines",
"type": "Title"
},
{
"element_id": "3f4ea3840d79521680c89a91dcd883cf",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr></table>"
},
"text": "Milestone Owner Deadline Status",
"type": "Table"
},
{
"element_id": "890c9b6d8d69ca1de5fd7a8b83fe78ff",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000",
"record_locator": {
"page_id": "1540126",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83D\\uDD17 Reference materials",
"type": "Title"
}
]