mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 21:57:58 +00:00

Addresses a cluster of HTML-related bugs: - empty table is identified as bulleted-table - `partition_html()` emits empty (no text) tables (#1928) - `.text_as_html` contains inappropriate `<br>` elements in invalid locations. - cells enclosed in `<thead>` and `<tfoot>` elements are dropped (#1928) - `.text_as_html` contains whitespace padding Each of these is addressed in a separate commit below. Fixes #1928. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com> Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
245 lines
6.9 KiB
JSON
245 lines
6.9 KiB
JSON
[
|
||
{
|
||
"element_id": "35054d4d1455c734e83a868656b4ad16",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDD3 Date",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "0126c1353ddd7c8dfdb29f252a64a344",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDC65 Participants",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "",
|
||
"type": "ListItem"
|
||
},
|
||
{
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "",
|
||
"type": "ListItem"
|
||
},
|
||
{
|
||
"element_id": "fa64ff027cbc0c6929bc75d3c78c94c3",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83E\\uDD45 Goals",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "",
|
||
"type": "ListItem"
|
||
},
|
||
{
|
||
"element_id": "537ea1b14dcba1742bdbd4a5fbfb488c",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDE3 Discussion topics",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "37af06e8e75d96a448a00026754b7942",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1,
|
||
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr></table>"
|
||
},
|
||
"text": "Time Item Presenter Notes",
|
||
"type": "Table"
|
||
},
|
||
{
|
||
"element_id": "f158a8eaf72c7e9511d5e8ee03692652",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "✅ Action items",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "",
|
||
"type": "ListItem"
|
||
},
|
||
{
|
||
"element_id": "addb0aa08f77b69fa754ba55c6600c8a",
|
||
"metadata": {
|
||
"data_source": {
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000",
|
||
"record_locator": {
|
||
"page_id": "1605928",
|
||
"url": "https://unstructured-ingest-test.atlassian.net"
|
||
},
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "⤴ Decisions",
|
||
"type": "Title"
|
||
}
|
||
] |