from pathlib import Path import pytest from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title from unstructured.documents.ontology import ( Column, Document, Hyperlink, Image, Page, Paragraph, Section, Table, ) from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder from unstructured.partition.html import partition_html from unstructured.partition.html.transformations import ( ontology_to_unstructured_elements, parse_html_to_ontology, ) from unstructured.partition.json import partition_json from unstructured.staging.base import elements_from_json def test_page_number_is_passed_correctly(): ontology = Document( children=[ Page( children=[Paragraph(text="Paragraph1")], additional_attributes={"data-page-number": "1"}, ), Page( children=[Paragraph(text="Paragraph2")], additional_attributes={"data-page-number": "2"}, ), ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) page1, p1, page2, p2 = unstructured_elements assert p1.metadata.page_number == 1 assert p2.metadata.page_number == 2 def test_invalid_page_number_is_not_passed(): ontology = Document( children=[ Page( children=[Paragraph(text="Paragraph1")], additional_attributes={"data-page-number": "invalid"}, ) ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) page1, p1 = unstructured_elements assert not p1.metadata.page_number def test_depth_is_passed_correctly(): ontology = Document( children=[ Page(children=[Paragraph(text="Paragraph1")]), Page( children=[ Column(children=[Paragraph(text="Paragraph2")]), Column(children=[Paragraph(text="Paragraph3")]), ] ), ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) page1, p1, page2, c1, p2, c2, p3 = unstructured_elements assert page1.metadata.category_depth == 0 assert page2.metadata.category_depth == 0 assert p1.metadata.category_depth == 1 assert c2.metadata.category_depth == 1 assert c1.metadata.category_depth == 1 assert p2.metadata.category_depth == 2 assert p3.metadata.category_depth == 2 def test_chunking_is_applied_on_elements(): ontology = Document( children=[ Page(children=[Paragraph(text="Paragraph1")]), Page( children=[ Column(children=[Paragraph(text="Paragraph2")]), Column(children=[Paragraph(text="Paragraph3")]), ] ), ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) chunked_basic = chunk_elements(unstructured_elements) assert str(chunked_basic[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3" chunked_by_title = chunk_by_title(unstructured_elements) assert str(chunked_by_title[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3" def test_embeddings_are_applied_on_elements(mocker): ontology = Document( children=[ Page(children=[Paragraph(text="Paragraph1")]), Page( children=[ Column(children=[Paragraph(text="Paragraph2")]), Column(children=[Paragraph(text="Paragraph3")]), ] ), ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) # Mocked client with the desired behavior for embed_documents mock_client = mocker.MagicMock() mock_client.embed_documents.return_value = [1, 2, 3, 4, 5, 6, 7] # Mock get_client to return our mock_client mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client) encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key")) elements = encoder.embed_documents( elements=unstructured_elements, ) assert len(elements) == 7 page1, p1, page2, c1, p2, c2, p3 = elements assert p1.embeddings == 2 assert p2.embeddings == 5 assert p3.embeddings == 7 @pytest.mark.parametrize( ("html_file_path", "json_file_path"), [ ("html_files/example.html", "unstructured_json_output/example.json"), ], ) def test_ingest(html_file_path, json_file_path): html_file_path = Path(__file__).parent / html_file_path json_file_path = Path(__file__).parent / json_file_path html_code = html_file_path.read_text() expected_json_elements = elements_from_json(str(json_file_path)) ontology = parse_html_to_ontology(html_code) unstructured_elements = ontology_to_unstructured_elements(ontology) assert unstructured_elements == expected_json_elements @pytest.mark.parametrize("json_file_path", ["unstructured_json_output/example.json"]) def test_parsed_ontology_can_be_serialized_from_json(json_file_path): json_file_path = Path(__file__).parent / json_file_path expected_json_elements = elements_from_json(str(json_file_path)) json_elements_text = json_file_path.read_text() elements = partition_json(text=json_elements_text) assert len(elements) == len(expected_json_elements) for i in range(len(elements)): assert elements[i] == expected_json_elements[i] # The partitioning output comes from PDF file, so only stem is compared # as the suffix is different .pdf != .json assert Path(elements[i].metadata.filename).stem == json_file_path.stem @pytest.mark.parametrize( ("html_file_path", "json_file_path"), [ ("html_files/example.html", "unstructured_json_output/example.json"), ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), ( "html_files/example_with_alternative_text.html", "unstructured_json_output/example_with_alternative_text.json", ), ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), ( "html_files/example_with_inline_fields.html", "unstructured_json_output/example_with_inline_fields.json", ), ], ) def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): html_file_path = Path(__file__).parent / html_file_path json_file_path = Path(__file__).parent / json_file_path expected_json_elements = elements_from_json(str(json_file_path)) html_code = html_file_path.read_text() predicted_elements = partition_html( text=html_code, html_parser_version="v2", unique_element_ids=True ) assert len(expected_json_elements) == len(predicted_elements) for i in range(len(expected_json_elements)): assert expected_json_elements[i] == predicted_elements[i] assert ( expected_json_elements[i].metadata.text_as_html == predicted_elements[i].metadata.text_as_html ) def test_inline_elements_are_squeezed(): ontology = Document( children=[ Page( children=[ Hyperlink(text="Hyperlink1"), Hyperlink(text="Hyperlink2"), Hyperlink(text="Hyperlink3"), ], ) ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) assert len(unstructured_elements) == 2 page, text1 = unstructured_elements assert text1.text == "Hyperlink1 Hyperlink2 Hyperlink3" def test_text_elements_are_squeezed(): ontology = Document( children=[ Page( children=[ Paragraph(text="Paragraph1"), Paragraph(text="Paragraph2"), ], ) ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) assert len(unstructured_elements) == 2 page, text1 = unstructured_elements assert text1.text == "Paragraph1 Paragraph2" def test_inline_elements_are_squeezed_when_image(): ontology = Document( children=[ Page( children=[ Paragraph(text="Paragraph1"), Hyperlink(text="Hyperlink1"), Image(text="Image1"), Hyperlink(text="Hyperlink2"), Hyperlink(text="Hyperlink3"), Paragraph(text="Paragraph2"), Paragraph(text="Paragraph3"), ], ) ] ) unstructured_elements = ontology_to_unstructured_elements(ontology) assert len(unstructured_elements) == 4 page, text1, image, text2 = unstructured_elements assert text1.text == "Paragraph1 Hyperlink1" assert text2.text == "Hyperlink2 Hyperlink3 Paragraph2 Paragraph3" assert '