Add parsing HTML to unstructured elements (#3732)
> This is a POC change; not everything is working correctly and code
quality could be improved significantly
This ticket adds parsing HTML to unstructured elements and back. How does it
work?
HTML has a tree structure, Unstructured Elements is a list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list. So the reading order from HTML is preserved. To be able to
compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged
How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.
Pros of solution:
- Nothing had to be changed in element types
Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)
Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`
Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages
How to test. Generate HTML:
```python3
from pathlib import Path
from vlm_partitioner.src.partition import partition
if __name__ == "__main__":
doc_dir = Path("out_dir")
file_path = Path("example_doc.pdf")
partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```
Then parse to unstructured elements and back to html
```python3
from pathlib import Path
from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json
if __name__ == "__main__":
output_dir = Path("out_dir/")
output_dir.mkdir(exist_ok=True, parents=True)
doc_path = Path("out_dir/example_doc.html")
html_content = doc_path.read_text()
ontology = parse_html_to_ontology(html_content)
unstructured_elements = ontology_to_unstructured_elements(ontology)
elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))
parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
html_to_save = indent_html(parsed_ontology.to_html())
Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```
I attached example doc before and after running these scripts
[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
2024-10-23 14:28:07 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from unstructured.chunking.basic import chunk_elements
|
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2024-10-31 13:17:25 +01:00
|
|
|
from unstructured.documents.ontology import (
|
|
|
|
Column,
|
|
|
|
Document,
|
|
|
|
Hyperlink,
|
|
|
|
Image,
|
|
|
|
Page,
|
|
|
|
Paragraph,
|
|
|
|
Section,
|
|
|
|
Table,
|
|
|
|
)
|
Add parsing HTML to unstructured elements (#3732)
> This is a POC change; not everything is working correctly and code
quality could be improved significantly
This ticket adds parsing HTML to unstructured elements and back. How does it
work?
HTML has a tree structure, Unstructured Elements is a list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list. So the reading order from HTML is preserved. To be able to
compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged
How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.
Pros of solution:
- Nothing had to be changed in element types
Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)
Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`
Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages
How to test. Generate HTML:
```python3
from pathlib import Path
from vlm_partitioner.src.partition import partition
if __name__ == "__main__":
doc_dir = Path("out_dir")
file_path = Path("example_doc.pdf")
partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```
Then parse to unstructured elements and back to html
```python3
from pathlib import Path
from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json
if __name__ == "__main__":
output_dir = Path("out_dir/")
output_dir.mkdir(exist_ok=True, parents=True)
doc_path = Path("out_dir/example_doc.html")
html_content = doc_path.read_text()
ontology = parse_html_to_ontology(html_content)
unstructured_elements = ontology_to_unstructured_elements(ontology)
elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))
parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
html_to_save = indent_html(parsed_ontology.to_html())
Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```
I attached example doc before and after running these scripts
[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
2024-10-23 14:28:07 +02:00
|
|
|
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
|
|
from unstructured.partition.html import partition_html
|
|
|
|
from unstructured.partition.html.transformations import (
|
|
|
|
ontology_to_unstructured_elements,
|
|
|
|
parse_html_to_ontology,
|
|
|
|
)
|
|
|
|
from unstructured.partition.json import partition_json
|
|
|
|
from unstructured.staging.base import elements_from_json
|
|
|
|
|
|
|
|
|
|
|
|
def test_page_number_is_passed_correctly():
    """Check that each paragraph reports the number declared on its enclosing page."""
    first_page = Page(
        children=[Paragraph(text="Paragraph1")],
        additional_attributes={"data-page-number": "1"},
    )
    second_page = Page(
        children=[Paragraph(text="Paragraph2")],
        additional_attributes={"data-page-number": "2"},
    )
    document = Document(children=[first_page, second_page])

    # Exactly four elements are expected: page, paragraph, page, paragraph.
    _, first_paragraph, _, second_paragraph = ontology_to_unstructured_elements(document)

    assert first_paragraph.metadata.page_number == 1
    assert second_paragraph.metadata.page_number == 2
|
|
|
|
|
|
|
|
|
|
|
|
def test_invalid_page_number_is_not_passed():
    """Check that a non-numeric ``data-page-number`` yields a falsy page number."""
    broken_page = Page(
        children=[Paragraph(text="Paragraph1")],
        additional_attributes={"data-page-number": "invalid"},
    )
    document = Document(children=[broken_page])

    # Exactly two elements are expected: the page and its paragraph.
    _, paragraph = ontology_to_unstructured_elements(document)

    assert not paragraph.metadata.page_number
|
|
|
|
|
|
|
|
|
|
|
|
def test_depth_is_passed_correctly():
    """Check ``category_depth`` for pages, columns and paragraphs at several nesting levels."""
    single_paragraph_page = Page(children=[Paragraph(text="Paragraph1")])
    two_column_page = Page(
        children=[
            Column(children=[Paragraph(text="Paragraph2")]),
            Column(children=[Paragraph(text="Paragraph3")]),
        ]
    )
    document = Document(children=[single_paragraph_page, two_column_page])

    page1, p1, page2, c1, p2, c2, p3 = ontology_to_unstructured_elements(document)

    # (element, expected depth) pairs, checked in order.
    expected_depths = [
        (page1, 0),
        (page2, 0),
        (p1, 1),
        (c2, 1),
        (c1, 1),
        (p2, 2),
        (p3, 2),
    ]
    for element, depth in expected_depths:
        assert element.metadata.category_depth == depth
|
|
|
|
|
|
|
|
|
|
|
|
def test_chunking_is_applied_on_elements():
    """Check that basic and by-title chunking both merge the three paragraphs into one chunk."""
    pages = [
        Page(children=[Paragraph(text="Paragraph1")]),
        Page(
            children=[
                Column(children=[Paragraph(text="Paragraph2")]),
                Column(children=[Paragraph(text="Paragraph3")]),
            ]
        ),
    ]
    elements = ontology_to_unstructured_elements(Document(children=pages))

    merged_text = "Paragraph1\n\nParagraph2\n\nParagraph3"

    basic_chunks = chunk_elements(elements)
    assert str(basic_chunks[0]) == merged_text

    title_chunks = chunk_by_title(elements)
    assert str(title_chunks[0]) == merged_text
|
|
|
|
|
|
|
|
|
|
|
|
def test_embeddings_are_applied_on_elements(mocker):
    """Check that each element receives the mocked embedding found at its own position."""
    single_paragraph_page = Page(children=[Paragraph(text="Paragraph1")])
    two_column_page = Page(
        children=[
            Column(children=[Paragraph(text="Paragraph2")]),
            Column(children=[Paragraph(text="Paragraph3")]),
        ]
    )
    document = Document(children=[single_paragraph_page, two_column_page])
    source_elements = ontology_to_unstructured_elements(document)

    # Stand-in client: one fake "embedding" per expected element.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2, 3, 4, 5, 6, 7]

    # Make the config hand out the stand-in instead of a real OpenAI client.
    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=fake_client)

    config = OpenAIEmbeddingConfig(api_key="api_key")
    encoder = OpenAIEmbeddingEncoder(config=config)
    embedded = encoder.embed_documents(elements=source_elements)

    assert len(embedded) == 7

    _, p1, _, _, p2, _, p3 = embedded

    assert p1.embeddings == 2
    assert p2.embeddings == 5
    assert p3.embeddings == 7
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_ingest(html_file_path, json_file_path):
    """Check that parsing the HTML fixture yields the elements stored in the JSON fixture.

    Both paths are relative to this test module's directory.
    """
    fixtures_root = Path(__file__).parent
    html_source = (fixtures_root / html_file_path).read_text()
    expected_elements = elements_from_json(str(fixtures_root / json_file_path))

    parsed_ontology = parse_html_to_ontology(html_source)
    produced_elements = ontology_to_unstructured_elements(parsed_ontology)

    assert produced_elements == expected_elements
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("json_file_path", ["unstructured_json_output/example.json"])
def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    """Check that ``partition_json`` reproduces the elements stored in the JSON fixture.

    ``json_file_path`` is relative to this test module's directory.
    """
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))

    json_elements_text = json_file_path.read_text()
    elements = partition_json(text=json_elements_text)

    # Guard the pairwise loop below: zip() would silently stop at the shorter list.
    assert len(elements) == len(expected_json_elements)
    for index, (element, expected) in enumerate(zip(elements, expected_json_elements)):
        assert element == expected, f"element {index} differs from the fixture"
        # The partitioning output comes from PDF file, so only stem is compared
        # as the suffix is different .pdf != .json
        assert Path(element.metadata.filename).stem == json_file_path.stem
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
        ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
        (
            "html_files/example_with_alternative_text.html",
            "unstructured_json_output/example_with_alternative_text.json",
        ),
        ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
        (
            "html_files/example_with_inline_fields.html",
            "unstructured_json_output/example_with_inline_fields.json",
        ),
    ],
)
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    """Check that ``partition_html`` (v2 parser) reproduces the elements in the JSON fixture.

    Both paths are relative to this test module's directory.
    """
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path
    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()

    predicted_elements = partition_html(
        text=html_code, html_parser_version="v2", unique_element_ids=True
    )

    # Guard the pairwise loop below: zip() would silently stop at the shorter list.
    assert len(expected_json_elements) == len(predicted_elements)

    for index, (expected, predicted) in enumerate(
        zip(expected_json_elements, predicted_elements)
    ):
        assert expected == predicted, f"element {index} differs from the fixture"
        # text_as_html is asserted separately, in addition to the element comparison.
        assert (
            expected.metadata.text_as_html == predicted.metadata.text_as_html
        ), f"text_as_html of element {index} differs from the fixture"
|
2024-10-31 13:17:25 +01:00
|
|
|
|
|
|
|
|
|
|
|
def test_inline_elements_are_squeezed():
    """Check that three sibling hyperlinks collapse into a single text element."""
    links_page = Page(
        children=[
            Hyperlink(text="Hyperlink1"),
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
        ],
    )
    elements = ontology_to_unstructured_elements(Document(children=[links_page]))

    assert len(elements) == 2

    # First element is the page; the second carries the combined link text.
    _, merged = elements
    assert merged.text == "Hyperlink1 Hyperlink2 Hyperlink3"
|
|
|
|
|
|
|
|
|
|
|
|
def test_text_elements_are_squeezed():
    """Check that two sibling paragraphs collapse into a single text element."""
    paragraphs_page = Page(
        children=[
            Paragraph(text="Paragraph1"),
            Paragraph(text="Paragraph2"),
        ],
    )
    elements = ontology_to_unstructured_elements(Document(children=[paragraphs_page]))

    assert len(elements) == 2

    # First element is the page; the second carries the combined paragraph text.
    _, merged = elements
    assert merged.text == "Paragraph1 Paragraph2"
|
|
|
|
|
|
|
|
|
|
|
|
def test_inline_elements_are_squeezed_when_image():
    """Check that an image splits the surrounding text into two squeezed elements."""
    mixed_page = Page(
        children=[
            Paragraph(text="Paragraph1"),
            Hyperlink(text="Hyperlink1"),
            Image(text="Image1"),
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
            Paragraph(text="Paragraph2"),
            Paragraph(text="Paragraph3"),
        ],
    )
    elements = ontology_to_unstructured_elements(Document(children=[mixed_page]))

    assert len(elements) == 4

    _, before_image, _, after_image = elements
    assert before_image.text == "Paragraph1 Hyperlink1"
    assert after_image.text == "Hyperlink2 Hyperlink3 Paragraph2 Paragraph3"

    # Each squeezed element's HTML must still contain both original tag kinds.
    for squeezed in (before_image, after_image):
        assert '<a class="Hyperlink"' in squeezed.metadata.text_as_html
        assert '<p class="Paragraph"' in squeezed.metadata.text_as_html
|
|
|
|
|
|
|
|
|
|
|
|
def test_inline_elements_are_squeezed_when_table():
    """Check that a table splits the surrounding text into two squeezed elements."""
    mixed_page = Page(
        children=[
            Hyperlink(text="Hyperlink1"),
            Paragraph(text="Paragraph1"),
            Paragraph(text="Paragraph2"),
            Table(text="Table1"),
            Paragraph(text="Paragraph2"),
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
        ],
    )
    elements = ontology_to_unstructured_elements(Document(children=[mixed_page]))

    assert len(elements) == 4

    _, before_table, table, after_table = elements
    assert before_table.text == "Hyperlink1 Paragraph1 Paragraph2"
    assert table.text == "Table1"
    assert after_table.text == "Paragraph2 Hyperlink2 Hyperlink3"
|
|
|
|
|
|
|
|
|
|
|
|
def test_inline_elements_are_on_many_depths():
    """Check that text is squeezed separately at each nesting level of sections."""
    inner_section = Section(
        children=[
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
        ]
    )
    outer_section = Section(
        children=[
            inner_section,
            Paragraph(text="Paragraph2"),
            Hyperlink(text="Hyperlink4"),
        ]
    )
    nested_page = Page(
        children=[
            Hyperlink(text="Hyperlink1"),
            Paragraph(text="Paragraph1"),
            outer_section,
        ],
    )
    elements = ontology_to_unstructured_elements(Document(children=[nested_page]))

    assert len(elements) == 6

    # Expected layout: page, text, outer section, inner section, text, text.
    _, page_text, _, _, inner_text, outer_text = elements

    assert page_text.text == "Hyperlink1 Paragraph1"
    assert inner_text.text == "Hyperlink2 Hyperlink3"
    assert outer_text.text == "Paragraph2 Hyperlink4"
|