mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-23 09:00:40 +00:00

Fixes #2339 Fixes to HTML partitioning introduced with v0.11.0 removed the use of `tabulate` for forming the HTML placed in `HTMLTable.text_as_html`. This had several benefits, but part of `tabulate`'s behavior was to make row-length (cell-count) uniform across the rows of the table. Lacking this prior uniformity produced a downstream problem reported in #2339. On closer inspection, the method used to "harvest" cell-text was producing more text-nodes than there were cells and was sensitive to where whitespace was used to format the HTML. It also "moved" text to different columns in certain rows. Refine the cell-text gathering mechanism to get exactly one text string for each row cell, eliminating whitespace formatting nodes and producing strict correspondence between the number of cells in the original HTML table row and the number placed in `HTMLTable.text_as_html`. HTML tables that are uniform (every row has the same number of cells) will produce a uniform table in `.text_as_html`. Merged cells may still produce a non-uniform table in `.text_as_html` (because the source table is non-uniform).
357 lines
11 KiB
JSON
357 lines
11 KiB
JSON
[
|
||
{
|
||
"element_id": "597883fce258148ee227842378ce55c3",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1,
|
||
"text_as_html": "<table><tr><td>Driver</td><td></td></tr><tr><td>Approver</td><td></td></tr><tr><td>Contributors</td><td></td></tr><tr><td>Informed</td><td></td></tr><tr><td>Objective</td><td></td></tr><tr><td>Due date</td><td></td></tr><tr><td>Key outcomes</td><td></td></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
|
||
},
|
||
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "4e2022d4483a407d85060675f64fbe17",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83E\\uDD14 Problem Statement",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "81163675915a75217e4116686fdca412",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "🎯 Scope",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "f1f364fbde77afa0e99e8ea7ab4f7c3f",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1,
|
||
"text_as_html": "<table><tr><td>Must have:</td><td></td></tr><tr><td>Nice to have:</td><td></td></tr><tr><td>Not in scope:</td><td></td></tr></table>"
|
||
},
|
||
"text": "Must have: Nice to have: Not in scope:",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "e8b61a28d07e977379b42df455a1cde4",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDD3 Timeline",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "5043f71fbc70e35c0be413d4135be99f",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Lane 1",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "d5a2e177c588bf0c4f914baa4fae85b6",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Lane 2",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "c98ba1acbd22a15ddddfc244cbd8a2db",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Feature 1",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "e04620c8b3b611b3fefecef89baa63a9",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Feature 2",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "82e522a86692cc50ee5c020c8e6ce6a0",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Feature 3",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "822f7c45ea725c535970aab819a8ff10",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Feature 4",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "6e0f6eca4ff17d3377c1c3e8e1f73457",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "iOS app",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "0b60fe04b3c5c3c76371b6eca8b19c8e",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "Android app",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "e1cc184f345d146586fb12527c4fa696",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDEA9 Milestones and deadlines",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "3f4ea3840d79521680c89a91dcd883cf",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1,
|
||
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
|
||
},
|
||
"text": "Milestone Owner Deadline Status",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "890c9b6d8d69ca1de5fd7a8b83fe78ff",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:55:50.911000",
|
||
"date_modified": "2023-07-09T12:56:10.564000",
|
||
"record_locator": {
|
||
"page_id": "1540126",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDD17 Reference materials",
|
||
"type": "Title"
|
||
}
|
||
] |