unstructured/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
2024-11-26 16:20:23 +00:00

339 lines
11 KiB
Python

from pathlib import Path
import pytest
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.ontology import (
Column,
Document,
Hyperlink,
Image,
Page,
Paragraph,
Section,
Table,
)
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured.partition.html import partition_html
from unstructured.partition.html.transformations import (
ontology_to_unstructured_elements,
parse_html_to_ontology,
)
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_from_json
def test_page_number_is_passed_correctly():
    """Check that paragraphs receive the integer page number of their page.

    Each ``Page`` carries a ``data-page-number`` attribute given as a string;
    the paragraph elements are expected to expose it as an ``int`` in
    ``metadata.page_number``.
    """
    first_page = Page(
        children=[Paragraph(text="Paragraph1")],
        additional_attributes={"data-page-number": "1"},
    )
    second_page = Page(
        children=[Paragraph(text="Paragraph2")],
        additional_attributes={"data-page-number": "2"},
    )
    document = Document(children=[first_page, second_page])

    elements = ontology_to_unstructured_elements(document)

    # Unpacking doubles as a check that exactly four elements were produced.
    _page1, paragraph1, _page2, paragraph2 = elements
    assert paragraph1.metadata.page_number == 1
    assert paragraph2.metadata.page_number == 2
def test_invalid_page_number_is_not_passed():
    """Check that a non-numeric ``data-page-number`` leaves the page number falsy."""
    page = Page(
        children=[Paragraph(text="Paragraph1")],
        additional_attributes={"data-page-number": "invalid"},
    )
    document = Document(children=[page])

    elements = ontology_to_unstructured_elements(document)

    # Unpacking doubles as a check that exactly two elements were produced.
    _page_element, paragraph = elements
    assert not paragraph.metadata.page_number
def test_depth_is_passed_correctly():
    """Check that ``category_depth`` reflects each element's nesting level.

    Pages are expected at depth 0, their direct children (a paragraph or a
    column) at depth 1, and paragraphs nested inside columns at depth 2.
    """
    single_paragraph_page = Page(children=[Paragraph(text="Paragraph1")])
    two_column_page = Page(
        children=[
            Column(children=[Paragraph(text="Paragraph2")]),
            Column(children=[Paragraph(text="Paragraph3")]),
        ]
    )
    document = Document(children=[single_paragraph_page, two_column_page])

    elements = ontology_to_unstructured_elements(document)

    # Unpacking doubles as a check that exactly seven elements were produced.
    page1, para1, page2, col1, para2, col2, para3 = elements
    expected_depths = [
        (page1, 0),
        (page2, 0),
        (para1, 1),
        (col2, 1),
        (col1, 1),
        (para2, 2),
        (para3, 2),
    ]
    for element, expected_depth in expected_depths:
        assert element.metadata.category_depth == expected_depth
def test_chunking_is_applied_on_elements():
    """Check that converted elements can be chunked by both chunking strategies.

    For basic and by-title chunking alike, the first chunk's string form is
    expected to be the three paragraph texts separated by blank lines.
    """
    single_paragraph_page = Page(children=[Paragraph(text="Paragraph1")])
    two_column_page = Page(
        children=[
            Column(children=[Paragraph(text="Paragraph2")]),
            Column(children=[Paragraph(text="Paragraph3")]),
        ]
    )
    document = Document(children=[single_paragraph_page, two_column_page])
    elements = ontology_to_unstructured_elements(document)
    expected_text = "Paragraph1\n\nParagraph2\n\nParagraph3"

    basic_chunks = chunk_elements(elements)
    assert str(basic_chunks[0]) == expected_text

    title_chunks = chunk_by_title(elements)
    assert str(title_chunks[0]) == expected_text
def test_embeddings_are_applied_on_elements(mocker):
    """Check that the OpenAI encoder attaches embeddings to converted elements.

    The OpenAI client is replaced with a mock returning one value per
    element, so each element's ``embeddings`` should equal the value at its
    position in that list.
    """
    single_paragraph_page = Page(children=[Paragraph(text="Paragraph1")])
    two_column_page = Page(
        children=[
            Column(children=[Paragraph(text="Paragraph2")]),
            Column(children=[Paragraph(text="Paragraph3")]),
        ]
    )
    document = Document(children=[single_paragraph_page, two_column_page])
    source_elements = ontology_to_unstructured_elements(document)

    # Stand-in client whose embed_documents yields seven fixed values.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2, 3, 4, 5, 6, 7]
    # Make the config hand out the stand-in instead of building a real client.
    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=fake_client)

    config = OpenAIEmbeddingConfig(api_key="api_key")
    encoder = OpenAIEmbeddingEncoder(config=config)
    embedded = encoder.embed_documents(elements=source_elements)

    assert len(embedded) == 7
    _page1, para1, _page2, _col1, para2, _col2, para3 = embedded
    assert para1.embeddings == 2
    assert para2.embeddings == 5
    assert para3.embeddings == 7
@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_ingest(html_file_path, json_file_path):
    """Check that parsing an HTML fixture reproduces the stored JSON elements.

    Both paths are given relative to the directory containing this test
    module.
    """
    fixtures_dir = Path(__file__).parent
    html_source = (fixtures_dir / html_file_path).read_text()
    expected_elements = elements_from_json(str(fixtures_dir / json_file_path))

    parsed_ontology = parse_html_to_ontology(html_source)
    actual_elements = ontology_to_unstructured_elements(parsed_ontology)

    assert actual_elements == expected_elements
@pytest.mark.parametrize("json_file_path", ["unstructured_json_output/example.json"])
def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    """Check that ``partition_json`` reproduces the elements stored in a JSON fixture.

    The fixture path is given relative to the directory containing this test
    module. The same file is loaded twice: once through ``elements_from_json``
    as the expected result, and once as raw text fed to ``partition_json``.
    """
    json_file_path = Path(__file__).parent / json_file_path
    expected_json_elements = elements_from_json(str(json_file_path))
    json_elements_text = json_file_path.read_text()

    elements = partition_json(text=json_elements_text)

    # The length check comes first so the pairwise zip below cannot silently
    # skip trailing elements of the longer sequence.
    assert len(elements) == len(expected_json_elements)
    for element, expected_element in zip(elements, expected_json_elements):
        assert element == expected_element
        # The partitioning output comes from PDF file, so only stem is compared
        # as the suffix is different .pdf != .json
        assert Path(element.metadata.filename).stem == json_file_path.stem
@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
        ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
        (
            "html_files/example_with_alternative_text.html",
            "unstructured_json_output/example_with_alternative_text.json",
        ),
        ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
        (
            "html_files/example_with_inline_fields.html",
            "unstructured_json_output/example_with_inline_fields.json",
        ),
    ],
)
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    """Check that ``partition_html`` (parser ``v2``) matches the stored JSON elements.

    Both paths are given relative to the directory containing this test
    module. Besides element equality, ``metadata.text_as_html`` is compared
    explicitly for every pair.
    """
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()

    predicted_elements = partition_html(
        text=html_code, html_parser_version="v2", unique_element_ids=True
    )

    # The length check comes first so the pairwise zip below cannot silently
    # skip trailing elements of the longer sequence.
    assert len(expected_json_elements) == len(predicted_elements)
    for expected_element, predicted_element in zip(expected_json_elements, predicted_elements):
        assert expected_element == predicted_element
        assert expected_element.metadata.text_as_html == predicted_element.metadata.text_as_html
def test_inline_elements_are_squeezed():
    """Check that consecutive hyperlinks on a page merge into one text element.

    The output should contain the page plus a single element whose text is
    the hyperlink texts joined by spaces.
    """
    link_texts = ("Hyperlink1", "Hyperlink2", "Hyperlink3")
    page = Page(children=[Hyperlink(text=link_text) for link_text in link_texts])
    document = Document(children=[page])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 2
    _page_element, merged_text = elements
    assert merged_text.text == "Hyperlink1 Hyperlink2 Hyperlink3"
def test_text_elements_are_squeezed():
    """Check that consecutive paragraphs on a page merge into one text element.

    The output should contain the page plus a single element whose text is
    the paragraph texts joined by a space.
    """
    paragraph_texts = ("Paragraph1", "Paragraph2")
    page = Page(children=[Paragraph(text=paragraph_text) for paragraph_text in paragraph_texts])
    document = Document(children=[page])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 2
    _page_element, merged_text = elements
    assert merged_text.text == "Paragraph1 Paragraph2"
def test_inline_elements_are_squeezed_when_image():
    """Check that an image splits the surrounding text into two merged runs.

    The text before and after the image should each collapse into a single
    element, and the ``text_as_html`` of both merged elements should still
    contain the hyperlink and paragraph markup.
    """
    page_children = [
        Paragraph(text="Paragraph1"),
        Hyperlink(text="Hyperlink1"),
        Image(text="Image1"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
        Paragraph(text="Paragraph2"),
        Paragraph(text="Paragraph3"),
    ]
    document = Document(children=[Page(children=page_children)])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 4
    _page_element, before_image, _image_element, after_image = elements
    assert before_image.text == "Paragraph1 Hyperlink1"
    assert after_image.text == "Hyperlink2 Hyperlink3 Paragraph2 Paragraph3"

    # Both merged elements must keep the HTML of every element they absorbed.
    expected_fragments = ('<a class="Hyperlink"', '<p class="Paragraph"')
    for merged_element in (before_image, after_image):
        for fragment in expected_fragments:
            assert fragment in merged_element.metadata.text_as_html
def test_inline_elements_are_squeezed_when_table():
    """Check that a table splits the surrounding text into two merged runs.

    The text before and after the table should each collapse into a single
    element, while the table stays an element of its own with its text
    unchanged.
    """
    page_children = [
        Hyperlink(text="Hyperlink1"),
        Paragraph(text="Paragraph1"),
        Paragraph(text="Paragraph2"),
        Table(text="Table1"),
        Paragraph(text="Paragraph2"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
    ]
    document = Document(children=[Page(children=page_children)])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 4
    _page_element, before_table, table_element, after_table = elements
    assert before_table.text == "Hyperlink1 Paragraph1 Paragraph2"
    assert table_element.text == "Table1"
    assert after_table.text == "Paragraph2 Hyperlink2 Hyperlink3"
def test_inline_elements_are_on_many_depths():
    """Check that inline elements are merged separately at each nesting level.

    Six elements are expected: the page, the merged text at page level, the
    two nested sections, and one merged text for each section's own inline
    content.
    """
    page_level_inline = [
        Hyperlink(text="Hyperlink1"),
        Paragraph(text="Paragraph1"),
    ]
    inner_section = Section(
        children=[
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
        ]
    )
    outer_section = Section(
        children=[
            inner_section,
            Paragraph(text="Paragraph2"),
            Hyperlink(text="Hyperlink4"),
        ]
    )
    page = Page(children=[*page_level_inline, outer_section])
    document = Document(children=[page])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 6
    _page_element, page_text, _outer, _inner, inner_text, outer_text = elements
    assert page_text.text == "Hyperlink1 Paragraph1"
    assert inner_text.text == "Hyperlink2 Hyperlink3"
    assert outer_text.text == "Paragraph2 Hyperlink4"