unstructured/test_unstructured/documents/test_ontology_to_unstructured_parsing.py

from pathlib import Path

import pytest

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.ontology import Column, Document, Page, Paragraph
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured.partition.html import partition_html
from unstructured.partition.html.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
)
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_from_json


def test_page_number_is_passed_correctly():
    ontology = Document(
        children=[
            Page(
                children=[Paragraph(text="Paragraph1")],
                additional_attributes={"data-page-number": "1"},
            ),
            Page(
                children=[Paragraph(text="Paragraph2")],
                additional_attributes={"data-page-number": "2"},
            ),
        ]
    )
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1, page2, p2 = unstructured_elements
    assert p1.metadata.page_number == 1
    assert p2.metadata.page_number == 2


def test_invalid_page_number_is_not_passed():
    ontology = Document(
        children=[
            Page(
                children=[Paragraph(text="Paragraph1")],
                additional_attributes={"data-page-number": "invalid"},
            )
        ]
    )
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1 = unstructured_elements
    assert not p1.metadata.page_number


def test_depth_is_passed_correctly():
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1, page2, c1, p2, c2, p3 = unstructured_elements

    assert page1.metadata.category_depth == 0
    assert page2.metadata.category_depth == 0

    assert p1.metadata.category_depth == 1

    assert c2.metadata.category_depth == 1
    assert c1.metadata.category_depth == 1

    assert p2.metadata.category_depth == 2
    assert p3.metadata.category_depth == 2


def test_chunking_is_applied_on_elements():
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)

    chunked_basic = chunk_elements(unstructured_elements)
    assert str(chunked_basic[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3"
    chunked_by_title = chunk_by_title(unstructured_elements)
    assert str(chunked_by_title[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3"


def test_embeddings_are_applied_on_elements(mocker):
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)
    # Mocked client with the desired behavior for embed_documents
    mock_client = mocker.MagicMock()
    mock_client.embed_documents.return_value = [1, 2, 3, 4, 5, 6, 7]

    # Mock get_client to return our mock_client
    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)

    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
    elements = encoder.embed_documents(
        elements=unstructured_elements,
    )

    assert len(elements) == 7

    page1, p1, page2, c1, p2, c2, p3 = elements

    assert p1.embeddings == 2
    assert p2.embeddings == 5
    assert p3.embeddings == 7


@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_ingest(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    html_code = html_file_path.read_text()
    expected_json_elements = elements_from_json(str(json_file_path))

    ontology = parse_html_to_ontology(html_code)
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    assert unstructured_elements == expected_json_elements


@pytest.mark.parametrize("json_file_path", ["unstructured_json_output/example.json"])
def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))

    json_elements_text = json_file_path.read_text()
    elements = partition_json(text=json_elements_text)

    assert len(elements) == len(expected_json_elements)
    for i in range(len(elements)):
        assert elements[i] == expected_json_elements[i]
        # The partitioning output comes from PDF file, so only stem is compared
        # as the suffix is different .pdf != .json
        assert Path(elements[i].metadata.filename).stem == json_file_path.stem


@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()

    predicted_elements = partition_html(text=html_code, html_parser_version="v2")
    assert len(expected_json_elements) == len(predicted_elements)

    for i in range(len(expected_json_elements)):
        assert expected_json_elements[i] == expected_json_elements[i]