mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 03:23:03 +00:00
## Summary
This PR fixes an issue where header/footer content in HTML is not
partitioned as the `unstructured` `Header` or `Footer` element types. Instead,
it is either `UncategorizedText` or takes on the type of the nested
structure inside the header/footer. E.g., `<header class="Header"><h1
class="Title">Header Title</h1></header>` would be partitioned as a
`Title` instead of `Header`.
## Bug description
This behavior occurs because we treat header and footer as layout, i.e.,
containers, in the ontology definition. As a result, during parsing we
[unwrap](https://github.com/Unstructured-IO/unstructured/blob/ec209c6b5f/unstructured/partition/html/transformations.py#L361-L378)
the container and parse its contents as if they were part of the main text,
even though they are still part of the header/footer.
The fix is to treat header/footer as text instead of layout in the ontology
so that all content inside them is properly gathered under the
`Header`/`Footer` element types.
146 lines
5.3 KiB
JSON
146 lines
5.3 KiB
JSON
[
|
|
{
|
|
"element_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
|
"metadata": {
|
|
"category_depth": 0,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "037b418b76eb4ac1bd40326ff67e67b0",
|
|
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "97eb491421584ad892074d039779fbfa",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
|
"text_as_html": "<header class=\"Header\"><h1 class=\"Title\">Header</h1><time class=\"CalendarDate\">Date: October 30, 2023</time></header>"
|
|
},
|
|
"text": "Header Date: October 30, 2023",
|
|
"type": "Header"
|
|
},
|
|
{
|
|
"element_id": "4afb6e4a90e14835b958dadb77cd8331",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
|
"text_as_html": "<form class=\"Form\"><label class=\"FormField\" for=\"company-name\">From field name</label><input class=\"FormFieldValue\" value=\"Example value\" /></form>"
|
|
},
|
|
"text": "From field name Example value",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "d8f996f2bc9a49f4979aac58a2a9ee93",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
|
"text_as_html": "<section class=\"Section\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "d2c12f995ab248808900f66aec479e9d",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "d8f996f2bc9a49f4979aac58a2a9ee93",
|
|
"text_as_html": "<table class=\"Table\"><thead><tr><th>Description</th><th>Row header</th></tr></thead><tbody><tr><td>Value description</td><td><span>50 $</span><span>(1.32 %)</span></td></tr></tbody></table>"
|
|
},
|
|
"text": "Description Row header Value description 50 $ (1.32 %)",
|
|
"type": "Table"
|
|
},
|
|
{
|
|
"element_id": "8e3f0d85329343008593f43afcad3327",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
|
"text_as_html": "<section class=\"Section\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "5deaad75854741ccb69767881ef399db",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "8e3f0d85329343008593f43afcad3327",
|
|
"text_as_html": "<h2 class=\"Subtitle\">2. Subtitle</h2>"
|
|
},
|
|
"text": "2. Subtitle",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "9e61f29755bc4b6dbb41ea575d41edb6",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"file_directory": "test_unstructured/documents/html_files",
|
|
"filename": "example.html",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"last_modified": "2025-06-12T11:12:20",
|
|
"page_number": 1,
|
|
"parent_id": "8e3f0d85329343008593f43afcad3327",
|
|
"text_as_html": "<p class=\"NarrativeText\">Paragraph text</p>"
|
|
},
|
|
"text": "Paragraph text",
|
|
"type": "NarrativeText"
|
|
}
|
|
] |