mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-30 19:56:41 +00:00

Adding table extraction to HTML partitioning. This PR utilizes 'table' HTML elements to extract and parse HTML tables and return them in partitioning. ``` # checkout this branch, go into ipython shell In [1]: from unstructured.partition.html import partition_html In [2]: path_to_html = "{html sample file with table}" In [3]: elements = partition_html(path_to_html) ``` you should see the table in the elements list!
113 lines
2.6 KiB
JSON
113 lines
2.6 KiB
JSON
[
|
||
{
|
||
"type": "Title",
|
||
"element_id": "35054d4d1455c734e83a868656b4ad16",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDD3 Date"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "0126c1353ddd7c8dfdb29f252a64a344",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDC65 Participants"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "fa64ff027cbc0c6929bc75d3c78c94c3",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83E\\uDD45 Goals"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "537ea1b14dcba1742bdbd4a5fbfb488c",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDE3 Discussion topics"
|
||
},
|
||
{
|
||
"type": "Table",
|
||
"element_id": "37af06e8e75d96a448a00026754b7942",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1,
|
||
"text_as_html": "<table><br><tbody><br><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
|
||
},
|
||
"text": "Time Item Presenter Notes"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "f158a8eaf72c7e9511d5e8ee03692652",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "✅ Action items"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "addb0aa08f77b69fa754ba55c6600c8a",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "⤴ Decisions"
|
||
}
|
||
] |