mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Fix layout parsing (#3754)
This commit is contained in:
parent
2417f8ed84
commit
5a91f0cda9
@ -1,4 +1,4 @@
|
|||||||
## 0.16.3-dev1
|
## 0.16.3-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -7,6 +7,7 @@
|
|||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* **V2 elements without first parent ID can be parsed**
|
* **V2 elements without first parent ID can be parsed**
|
||||||
|
* **Fix missing elements when a layout element is parsed in the V2 ontology**
|
||||||
|
|
||||||
|
|
||||||
## 0.16.2
|
## 0.16.2
|
||||||
|
@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type():
|
|||||||
expected_html = _wrap_with_body(
|
expected_html = _wrap_with_body(
|
||||||
"""
|
"""
|
||||||
<div class="Page">
|
<div class="Page">
|
||||||
<aside class='Sidebar'>Some text</aside>
|
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
@ -87,7 +87,7 @@ def test_when_class_is_wrong_tag_name_is_overwritten():
|
|||||||
expected_html = _wrap_with_body(
|
expected_html = _wrap_with_body(
|
||||||
"""
|
"""
|
||||||
<div class="Page">
|
<div class="Page">
|
||||||
<aside class='Sidebar'>Some text</aside>
|
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
@ -535,6 +535,8 @@ def test_malformed_html():
|
|||||||
# language=HTML
|
# language=HTML
|
||||||
expected_html = """
|
expected_html = """
|
||||||
<body class="Document">
|
<body class="Document">
|
||||||
|
|
||||||
|
<p class="Paragraph">
|
||||||
Unclosed comment
|
Unclosed comment
|
||||||
<div class="">
|
<div class="">
|
||||||
<p>
|
<p>
|
||||||
@ -554,6 +556,7 @@ def test_malformed_html():
|
|||||||
<p>
|
<p>
|
||||||
Paragraph with invalid characters: <EFBFBD> <EFBFBD> <EFBFBD>
|
Paragraph with invalid characters: <EFBFBD> <EFBFBD> <EFBFBD>
|
||||||
</p>
|
</p>
|
||||||
|
</p>
|
||||||
</body>
|
</body>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -563,3 +566,31 @@ def test_malformed_html():
|
|||||||
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
|
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
|
||||||
|
|
||||||
assert parsed_ontology == expected_html
|
assert parsed_ontology == expected_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_is_wrapped_inside_layout_element():
|
||||||
|
# language=HTML
|
||||||
|
base_html = _wrap_with_body(
|
||||||
|
"""
|
||||||
|
<div class="Page">
|
||||||
|
Text
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
base_html = indent_html(base_html)
|
||||||
|
|
||||||
|
# language=HTML
|
||||||
|
expected_html = _wrap_with_body(
|
||||||
|
"""
|
||||||
|
<div class="Page">
|
||||||
|
<p class='Paragraph'>Text</p>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
expected_html = indent_html(expected_html)
|
||||||
|
|
||||||
|
ontology: OntologyElement = parse_html_to_ontology(base_html)
|
||||||
|
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
|
||||||
|
|
||||||
|
assert parsed_ontology == expected_html
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.16.3-dev1" # pragma: no cover
|
__version__ = "0.16.3-dev2" # pragma: no cover
|
||||||
|
@ -79,15 +79,17 @@ def ontology_to_unstructured_elements(
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
childreen = []
|
||||||
for child in ontology_element.children:
|
for child in ontology_element.children:
|
||||||
elements_to_return += ontology_to_unstructured_elements(
|
childreen += ontology_to_unstructured_elements(
|
||||||
child,
|
child,
|
||||||
parent_id=ontology_element.id,
|
parent_id=ontology_element.id,
|
||||||
page_number=page_number,
|
page_number=page_number,
|
||||||
depth=0 if isinstance(ontology_element, Document) else depth + 1,
|
depth=0 if isinstance(ontology_element, Document) else depth + 1,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elements_to_return += childreen
|
||||||
else:
|
else:
|
||||||
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
|
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
|
||||||
ontology_element.__class__.__name__
|
ontology_element.__class__.__name__
|
||||||
@ -98,7 +100,6 @@ def ontology_to_unstructured_elements(
|
|||||||
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
|
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
|
||||||
)
|
)
|
||||||
# TODO value attribute from form input should be added to the text
|
# TODO value attribute from form input should be added to the text
|
||||||
|
|
||||||
unstructured_element = element_class(
|
unstructured_element = element_class(
|
||||||
text=element_text,
|
text=element_text,
|
||||||
element_id=ontology_element.id,
|
element_id=ontology_element.id,
|
||||||
@ -255,8 +256,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
|||||||
additional_attributes=escaped_attrs,
|
additional_attributes=escaped_attrs,
|
||||||
)
|
)
|
||||||
|
|
||||||
has_children = (ontology_class != UncategorizedText) and any(
|
has_children = (
|
||||||
isinstance(content, Tag) for content in soup.contents
|
(ontology_class != UncategorizedText)
|
||||||
|
and any(isinstance(content, Tag) for content in soup.contents)
|
||||||
|
or ontology_class().elementType == ElementTypeEnum.layout
|
||||||
)
|
)
|
||||||
|
|
||||||
if has_children:
|
if has_children:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user