mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
fix: fix header and footer not parsed as Header/Footer types (#4041)
## Summary
This PR fixes an issue where header/footer content in HTML is not
partitioned as `unstructured` `Header` or `Footer` element types. Instead,
it is either partitioned as `UncategorizedText` or takes on the type of the
nested structure inside the header/footer. E.g., `<header class="Header"><h1
class="Title">Header Title</h1></header>` would be partitioned as a
`Title` instead of a `Header`.
## Bug description
This behavior occurs because we treat header and footer as layout, i.e.,
containers, in the ontology definition. As a result, during parsing we
[unwrap](https://github.com/Unstructured-IO/unstructured/blob/ec209c6b5f/unstructured/partition/html/transformations.py#L361-L378)
the container and parse its contents as if they were from the main text
even though they are still part of the header/footer.
The fix is to treat header/footer as text instead of layout in the ontology
so that all content inside them is properly gathered under
`Header`/`Footer` element types.
This commit is contained in:
parent
45c3b63dcc
commit
aa332101ab
@ -1,4 +1,4 @@
|
||||
## 0.18.2-dev3
|
||||
## 0.18.2-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
- **Failproof docx malformed or merged tables** This fix prevents docx files with complex or vertical merges or malformed tables from failing at `tc_at_grid_offset` and raising `ValueError: no tc element at grid_offset=X`.
|
||||
- **partition_md can read special characters on non-UTF-8 files** Previously, `partition_md` read the file as UTF-8. Now it uses `read_txt_file`, which reads the file with the detected encoding.
|
||||
- xml code not getting escaped in a code block in a markdown file when in partition
|
||||
- **Fixes parsing HTML header and footer** Previously, header and footer text was partitioned as `UncategorizedText` or as the type of the nested structure, such as `Title`. Now it is properly partitioned as `Header` and `Footer` element types.
|
||||
|
||||
## 0.18.1
|
||||
|
||||
|
||||
@ -1,159 +1,143 @@
|
||||
[
|
||||
{
|
||||
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||
"element_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"parent_id": "037b418b76eb4ac1bd40326ff67e67b0",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"element_id": "97eb491421584ad892074d039779fbfa",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" />"
|
||||
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
||||
"text_as_html": "<header class=\"Header\"><h1 class=\"Title\">Header</h1><time class=\"CalendarDate\">Date: October 30, 2023</time></header>"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
"text": "Header Date: October 30, 2023",
|
||||
"type": "Header"
|
||||
},
|
||||
{
|
||||
"element_id": "c95473e8a3704fc2b418697f9fddb27b",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<h1 class=\"Title\">Header</h1>"
|
||||
},
|
||||
"text": "Header",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "379cbfdc16d44bd6a59e6cfabe6438d5",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<time class=\"CalendarDate\">Date: October 30, 2023</time>"
|
||||
},
|
||||
"text": "Date: October 30, 2023",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "637c2f6935fb4353a5f73025ce04619d",
|
||||
"element_id": "4afb6e4a90e14835b958dadb77cd8331",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
||||
"text_as_html": "<form class=\"Form\"><label class=\"FormField\" for=\"company-name\">From field name</label><input class=\"FormFieldValue\" value=\"Example value\" /></form>"
|
||||
},
|
||||
"text": "From field name Example value",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "592422373ed741b68a077e2003f8ed81",
|
||||
"element_id": "d8f996f2bc9a49f4979aac58a2a9ee93",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
||||
"text_as_html": "<section class=\"Section\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "dc3792d4422e444f90876b56d0cfb20d",
|
||||
"element_id": "d2c12f995ab248808900f66aec479e9d",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "592422373ed741b68a077e2003f8ed81",
|
||||
"parent_id": "d8f996f2bc9a49f4979aac58a2a9ee93",
|
||||
"text_as_html": "<table class=\"Table\"><thead><tr><th>Description</th><th>Row header</th></tr></thead><tbody><tr><td>Value description</td><td><span>50 $</span><span>(1.32 %)</span></td></tr></tbody></table>"
|
||||
},
|
||||
"text": "Description Row header Value description 50 $ (1.32 %)",
|
||||
"type": "Table"
|
||||
},
|
||||
{
|
||||
"element_id": "1032242af75c4b37984ea7fea9aac74c",
|
||||
"element_id": "8e3f0d85329343008593f43afcad3327",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"parent_id": "eda37931eb954fcc8dec8804c7e8fa4c",
|
||||
"text_as_html": "<section class=\"Section\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "2a4e2c4a689f4f9a8c180b6b521e45c3",
|
||||
"element_id": "5deaad75854741ccb69767881ef399db",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
|
||||
"parent_id": "8e3f0d85329343008593f43afcad3327",
|
||||
"text_as_html": "<h2 class=\"Subtitle\">2. Subtitle</h2>"
|
||||
},
|
||||
"text": "2. Subtitle",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "5591f7a4df01447e82515ce45f686fbe",
|
||||
"element_id": "9e61f29755bc4b6dbb41ea575d41edb6",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
|
||||
"parent_id": "8e3f0d85329343008593f43afcad3327",
|
||||
"text_as_html": "<p class=\"NarrativeText\">Paragraph text</p>"
|
||||
},
|
||||
"text": "Paragraph text",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,62 +1,38 @@
|
||||
[
|
||||
{
|
||||
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||
"element_id": "8157292897eb45e68229b5816da6e46c",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example_with_alternative_text.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"parent_id": "6f5f66a6a23642a7958aeff927e343cd",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||
"element_id": "70c89a734fe1497293a2e01b2f35b887",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example_with_alternative_text.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" />"
|
||||
"parent_id": "8157292897eb45e68229b5816da6e46c",
|
||||
"text_as_html": "<header class=\"Header\"><img class=\"Logo\" alt=\"New York logo\" /><img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" /></header>"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" />"
|
||||
},
|
||||
"text": "New York logo",
|
||||
"type": "Image"
|
||||
},
|
||||
{
|
||||
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
|
||||
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" />"
|
||||
},
|
||||
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
|
||||
"type": "Image"
|
||||
"text": "New York logo A line graph showing the comparison of 5 year cumulative total return for stocks",
|
||||
"type": "Header"
|
||||
}
|
||||
]
|
||||
@ -1,62 +1,38 @@
|
||||
[
|
||||
{
|
||||
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||
"element_id": "7aeac07bad7f44359c3b2403d2ca3d2a",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example_with_inline_fields.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"parent_id": "8bf6d9316275493499a373771a6e46d0",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"element_id": "f8f71f7ce10748f08bee0f9922a04406",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"file_directory": "test_unstructured/documents/html_files",
|
||||
"filename": "example_with_inline_fields.html",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2025-06-12T11:12:20",
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" />"
|
||||
"parent_id": "7aeac07bad7f44359c3b2403d2ca3d2a",
|
||||
"text_as_html": "<header class=\"Header\"><p class=\"NarrativeText\">Table of Contents</p><address class=\"Address\">68 Prince Street Palmdale, CA 93550</address><a class=\"Hyperlink\">www.google.com</a><span class=\"UncategorizedText\">More text</span></header>"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "6cd3c1ba79654abb9c86162b6d1dae46",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<p class=\"NarrativeText\">Table of Contents</p><address class=\"Address\">68 Prince Street Palmdale, CA 93550</address><a class=\"Hyperlink\">www.google.com</a>"
|
||||
},
|
||||
"text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "cb0d6675109241428778c7b996e0b21c",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<span class=\"UncategorizedText\">More text</span>"
|
||||
},
|
||||
"text": "More text",
|
||||
"type": "UncategorizedText"
|
||||
"text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com More text",
|
||||
"type": "Header"
|
||||
}
|
||||
]
|
||||
@ -17,6 +17,26 @@ def remove_all_ids(html_str):
|
||||
return str(soup)
|
||||
|
||||
|
||||
def test_parsing_header_and_footer_into_correct_ontologyelement():
|
||||
input_html = """
|
||||
<div class="Page">
|
||||
<header class="Header">
|
||||
this is a header
|
||||
</header>
|
||||
<footer class="Footer">
|
||||
this is a footer
|
||||
</footer>
|
||||
</div>
|
||||
"""
|
||||
page = parse_html_to_ontology(input_html)
|
||||
assert len(page.children) == 2
|
||||
header, footer = page.children
|
||||
assert header.text == "this is a header"
|
||||
assert header.html_tag_name == "header"
|
||||
assert footer.text == "this is a footer"
|
||||
assert footer.html_tag_name == "footer"
|
||||
|
||||
|
||||
def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div():
|
||||
# This test would fail if html5lib parser would be applied on the input HTML.
|
||||
# It would result in Page: <p></p> <address></address>
|
||||
|
||||
@ -3,10 +3,10 @@
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Header,
|
||||
NarrativeText,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.documents.ontology import Address, Paragraph
|
||||
from unstructured.partition.html.html_utils import indent_html
|
||||
@ -303,6 +303,9 @@ def test_very_nested_structure_is_preserved():
|
||||
</div>
|
||||
</section>
|
||||
<div class='Column'>
|
||||
<header class='Header'>
|
||||
Page 1
|
||||
</header>
|
||||
<blockquote class="Quote">
|
||||
<p class="Paragraph">
|
||||
Clever Quote
|
||||
@ -333,18 +336,20 @@ def test_very_nested_structure_is_preserved():
|
||||
text="",
|
||||
metadata=ElementMetadata(text_as_html='<div class="Column" />'),
|
||||
),
|
||||
Text(
|
||||
text="",
|
||||
metadata=ElementMetadata(text_as_html='<header class="Header" />'),
|
||||
),
|
||||
Title(
|
||||
Header(
|
||||
text="Title",
|
||||
metadata=ElementMetadata(text_as_html='<h1 class="Title">Title</h1>'),
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<header class="Header"><h1 class="Title">Title</h1></header>'
|
||||
),
|
||||
),
|
||||
Text(
|
||||
text="",
|
||||
metadata=ElementMetadata(text_as_html='<div class="Column" />'),
|
||||
),
|
||||
Header(
|
||||
text="Page 1",
|
||||
metadata=ElementMetadata(text_as_html='<header class="Header">Page 1</header>'),
|
||||
),
|
||||
NarrativeText(
|
||||
text="Clever Quote",
|
||||
metadata=ElementMetadata(
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.18.2-dev3" # pragma: no cover
|
||||
__version__ = "0.18.2-dev4" # pragma: no cover
|
||||
|
||||
@ -189,13 +189,13 @@ class Paragraph(OntologyElement):
|
||||
|
||||
class Header(OntologyElement):
|
||||
description: str = Field("The top section of a page", frozen=True)
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True)
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True)
|
||||
allowed_tags: List[str] = Field(["header"], frozen=True)
|
||||
|
||||
|
||||
class Footer(OntologyElement):
|
||||
description: str = Field("The bottom section of a page", frozen=True)
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True)
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True)
|
||||
allowed_tags: List[str] = Field(["footer"], frozen=True)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user