mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-11 02:08:13 +00:00

**Summary** Use more sophisticated algorithm for splitting oversized `Table` elements into `TableChunk` elements during chunking to ensure element text and HTML are "synchronized" and HTML is always parseable. **Additional Context** Table splitting now has the following characteristics: - `TableChunk.metadata.text_as_html` is always a parseable HTML `<table>` subtree. - `TableChunk.text` is always the text in the HTML version of the table fragment in `.metadata.text_as_html`. Text and HTML are "synchronized". - The table is divided at a whole-row boundary whenever possible. - A row is broken at an even-cell boundary when a single row is larger than the chunking window. - A cell is broken at an even-word boundary when a single cell is larger than the chunking window. - `.text_as_html` is "minified", removing all extraneous whitespace and unneeded elements or attributes. This maximizes the semantic "density" of each chunk.
150 lines
4.4 KiB
JSON
150 lines
4.4 KiB
JSON
[
|
|
{
|
|
"element_id": "7eb1052d0c2b8213a59b09e75f7e9d48",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "\\uD83D\\uDDD3 Date",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "5a0edb19b78ef68c152fee5450438050",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "\\uD83D\\uDC65 Participants",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "257a7e51634feee63a0ca125f89242a4",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "\\uD83E\\uDD45 Goals",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "e33a78e2fe5182a34770b6fc5a8b68f4",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "\\uD83D\\uDDE3 Discussion topics",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "c1a65256cf16868d5197438b4a02f2cd",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr></table>"
|
|
},
|
|
"text": "Time Item Presenter Notes",
|
|
"type": "Table"
|
|
},
|
|
{
|
|
"element_id": "600cb65b484fffe887e320cb3a1c7b9c",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "✅ Action items",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "cc53207305306eef194a7d957dd9f6d6",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_created": "2023-07-09T12:54:45.162000",
|
|
"date_modified": "2023-07-09T12:54:45.162000",
|
|
"record_locator": {
|
|
"page_id": "1605928",
|
|
"url": "https://unstructured-ingest-test.atlassian.net"
|
|
},
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
|
"version": "1"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "⤴ Decisions",
|
|
"type": "Title"
|
|
}
|
|
] |