unstructured/test_unstructured/partition/test_xml.py

"""Test-suite for `unstructured.partition.xml` module."""

from __future__ import annotations

import pytest
from pytest_mock import MockerFixture

from test_unstructured.unit_utils import example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import NarrativeText, Title
from unstructured.partition.json import partition_json
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import elements_to_json


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename(filename: str):
    """Partitioning from a file-path gives the expected first element and metadata filename."""
    elements = partition_xml(filename=example_doc_path(filename), xml_keep_tags=False)
    first = elements[0]

    assert first.text == "United States"
    assert first.metadata.filename == filename
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        # -- only checked when debug metadata is enabled; every element must report "xml" --
        origins = {e.metadata.detection_origin for e in elements}
        assert origins == {"xml"}


def test_partition_xml_from_filename_with_metadata_filename():
    """A `metadata_filename` argument is what ends up in `.metadata.filename`."""
    file_path = example_doc_path("factbook.xml")

    elements = partition_xml(file_path, xml_keep_tags=False, metadata_filename="test")

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == "test"


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file(filename: str):
    """Partitioning from a binary file-object works for both example documents."""
    path = example_doc_path(filename)
    with open(path, "rb") as xml_file:
        elements = partition_xml(file=xml_file, xml_keep_tags=False, metadata_filename=path)

    first = elements[0]
    assert first.text == "United States"
    # -- the full path was passed as `metadata_filename`; only the bare name is expected back --
    assert first.metadata.filename == filename


def test_partition_xml_from_file_with_metadata_filename():
    """`metadata_filename` is reported in metadata when partitioning a file-object."""
    path = example_doc_path("factbook.xml")
    with open(path, "rb") as xml_file:
        elements = partition_xml(file=xml_file, xml_keep_tags=False, metadata_filename="test")

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == "test"


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb(filename: str):
    """Partitioning a file opened in "rb" mode yields the expected first element."""
    # NOTE(review): the body matches `test_partition_xml_from_file` -- confirm whether both
    # tests are still needed.
    source_path = example_doc_path(filename)
    with open(source_path, "rb") as binary_file:
        elements = partition_xml(
            file=binary_file, xml_keep_tags=False, metadata_filename=source_path
        )

    leading_element = elements[0]
    assert leading_element.text == "United States"
    assert leading_element.metadata.filename == filename


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename_with_tags_default_encoding(filename: str):
    """With `xml_keep_tags=True` and no `encoding` argument, tags appear in the element text."""
    elements = partition_xml(filename=example_doc_path(filename), xml_keep_tags=True)
    first = elements[0]

    assert "<leader>Joe Biden</leader>" in first.text
    assert first.metadata.filename == filename


def test_partition_xml_from_text_with_tags():
    """Partitioning from a `text` string with `xml_keep_tags=True` retains the XML tags."""
    # -- Read with an explicit encoding so the test does not depend on the platform's locale
    # -- default. NOTE(review): assumes factbook.xml is UTF-8 (or plain ASCII) -- confirm.
    with open(example_doc_path("factbook.xml"), encoding="utf-8") as f:
        text = f.read()

    elements = partition_xml(text=text, xml_keep_tags=True)

    assert "<leader>Joe Biden</leader>" in elements[0].text


def test_partition_xml_from_filename_with_tags_raises_encoding_error():
    """Reading the UTF-16 example with `encoding="utf-8"` raises `UnicodeDecodeError`."""
    utf_16_path = example_doc_path("factbook-utf-16.xml")

    with pytest.raises(UnicodeDecodeError):
        partition_xml(utf_16_path, xml_keep_tags=True, encoding="utf-8")


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_with_tags_default_encoding(filename: str):
    """A file-object partitioned with `xml_keep_tags=True` keeps tags in the element text."""
    path = example_doc_path(filename)
    with open(path, "rb") as xml_file:
        elements = partition_xml(file=xml_file, xml_keep_tags=True, metadata_filename=path)

    first = elements[0]
    assert "<leader>Joe Biden</leader>" in first.text
    assert first.metadata.filename == filename


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb_with_tags_default_encoding(filename: str):
    """A file opened in "rb" mode and partitioned with tags kept retains the XML markup."""
    # NOTE(review): the body matches `test_partition_xml_from_file_with_tags_default_encoding`
    # -- confirm whether both tests are still needed.
    source_path = example_doc_path(filename)
    with open(source_path, "rb") as binary_file:
        elements = partition_xml(
            file=binary_file, xml_keep_tags=True, metadata_filename=source_path
        )

    leading_element = elements[0]
    assert "<leader>Joe Biden</leader>" in leading_element.text
    assert leading_element.metadata.filename == filename


def test_partition_xml_from_file_rb_with_tags_raises_encoding_error():
    """A UTF-16 file-object partitioned with `encoding="utf-8"` raises `UnicodeDecodeError`."""
    utf_16_path = example_doc_path("factbook-utf-16.xml")

    with pytest.raises(UnicodeDecodeError), open(utf_16_path, "rb") as xml_file:
        partition_xml(file=xml_file, xml_keep_tags=True, encoding="utf-8")


# -- .metadata.filetype --------------------------------------------------------------------------


def test_partition_xml_gets_the_XML_mime_type_in_metadata_filetype():
    """Every element from `partition_xml()` has the XML MIME-type as `.metadata.filetype`."""
    XML_MIME_TYPE = "application/xml"
    elements = partition_xml(example_doc_path("factbook.xml"))

    # -- Gather the values that are actually wrong so a failure reports them. The first
    # -- element's filetype can be correct even when a later element's is not.
    unexpected = {e.metadata.filetype for e in elements if e.metadata.filetype != XML_MIME_TYPE}
    assert not unexpected, (
        f"Expected all elements to have '{XML_MIME_TYPE}' as their filetype, but got:"
        f" {sorted(map(repr, unexpected))}"
    )


# -- .metadata.last_modified ---------------------------------------------------------------------


def test_partition_xml_from_file_path_gets_last_modified_from_filesystem(mocker: MockerFixture):
    """With a file-path and no override, `last_modified` is what `get_last_modified_date` returns.

    `get_last_modified_date` is patched in `unstructured.partition.xml`, so the value asserted is
    the mocked one and not the real modification time of the example file.
    """
    filesystem_last_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.xml.get_last_modified_date",
        return_value=filesystem_last_modified,
    )

    # -- resolve via `example_doc_path()` so the test does not depend on the working directory --
    elements = partition_xml(filename=example_doc_path("factbook.xml"))

    assert elements[0].metadata.last_modified == filesystem_last_modified


def test_partition_xml_from_file_gets_last_modified_None():
    """Partitioning a file-object without `metadata_last_modified` leaves `last_modified` None."""
    # -- resolve via `example_doc_path()` so the test does not depend on the working directory --
    with open(example_doc_path("factbook.xml"), "rb") as f:
        elements = partition_xml(file=f)

    assert elements[0].metadata.last_modified is None


def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: MockerFixture):
    """An explicit `metadata_last_modified` wins over the (patched) filesystem value."""
    filesystem_last_modified = "2029-07-05T09:24:28"
    metadata_last_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.xml.get_last_modified_date", return_value=filesystem_last_modified
    )

    # -- resolve via `example_doc_path()` so the test does not depend on the working directory --
    elements = partition_xml(
        filename=example_doc_path("factbook.xml"),
        metadata_last_modified=metadata_last_modified,
    )

    assert elements[0].metadata.last_modified == metadata_last_modified


def test_partition_xml_from_file_prefers_metadata_last_modified():
    """A `metadata_last_modified` argument is reported when partitioning a file-object."""
    metadata_last_modified = "2029-07-05T09:24:28"

    # -- resolve via `example_doc_path()` so the test does not depend on the working directory --
    with open(example_doc_path("factbook.xml"), "rb") as f:
        elements = partition_xml(file=f, metadata_last_modified=metadata_last_modified)

    assert elements[0].metadata.last_modified == metadata_last_modified


# ------------------------------------------------------------------------------------------------


@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_with_json(filename: str):
    """Elements serialized with `elements_to_json()` round-trip through `partition_json()`."""
    file_path = example_doc_path(filename)
    elements = partition_xml(filename=file_path, xml_keep_tags=False)
    test_elements = partition_json(text=elements_to_json(elements))

    # -- the length check comes first because `zip()` below stops at the shorter sequence --
    assert len(elements) == len(test_elements)
    assert elements[0].metadata.page_number == test_elements[0].metadata.page_number
    assert elements[0].metadata.filename == test_elements[0].metadata.filename

    for element, test_element in zip(elements, test_elements):
        assert element == test_element


def test_partition_xml_with_narrative_line_breaks():
    """A description spanning a line break is partitioned as a single `NarrativeText`."""
    xml_text = """<xml>
        <parrot>
            <name>Conure</name>
            <description>A conure is a very friendly bird.
            Conures are feathery and like to dance.
            </description>
        </parrot>
    </xml>"""

    elements = partition_xml(text=xml_text)

    assert elements[0] == Title("Conure")

    description = elements[1]
    assert isinstance(description, NarrativeText)
    description_text = str(description)
    assert description_text.startswith("A conure is a very friendly bird.")
    assert description_text.strip().endswith("Conures are feathery and like to dance.")


def test_add_chunking_strategy_on_partition_xml():
    """`chunking_strategy="by_title"` gives the same result as `chunk_by_title()` applied after."""
    path = example_doc_path("factbook.xml")

    unchunked = partition_xml(path)
    chunked_by_partitioner = partition_xml(path, chunking_strategy="by_title")
    expected_chunks = chunk_by_title(unchunked)

    # -- chunking must change the output, and must match the stand-alone chunker --
    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == expected_chunks


def test_partition_xml_element_metadata_has_languages():
    """The first element of the factbook example reports `["eng"]` as its languages."""
    elements = partition_xml(example_doc_path("factbook.xml"))

    first = elements[0]
    assert first.metadata.languages == ["eng"]


def test_partition_xml_respects_detect_language_per_element():
    """With `detect_language_per_element=True` each element gets its own language list."""
    path = example_doc_path("language-docs/eng_spa_mult.xml")
    expected_languages = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition_xml(path, detect_language_per_element=True)

    assert [e.metadata.languages for e in elements] == expected_languages
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) Summary In preparation for pluggable auto-partitioners, simplify metadata as discussed. Additional Context - Pluggable auto-partitioners require partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid passing the proxy partitioner the arguments that the metadata decorators are sensitive to. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata. 2024-09-23 15:23:10 -07:00			"""Test-suite for `unstructured.partition.xml` module."""

			`from __future__ import annotations`

			`import pytest`
			`from pytest_mock import MockerFixture`

			`from test_unstructured.unit_utils import example_doc_path`
			`from unstructured.chunking.title import chunk_by_title`
			`from unstructured.documents.elements import NarrativeText, Title`
			`from unstructured.partition.json import partition_json`
			`from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA`
			`from unstructured.partition.xml import partition_xml`
			`from unstructured.staging.base import elements_to_json`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_from_filename(filename: str):`
			`file_path = example_doc_path(filename)`
			`elements = partition_xml(filename=file_path, xml_keep_tags=False)`

			`assert elements[0].text == "United States"`
			`assert elements[0].metadata.filename == filename`
			`if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:`
			`assert {element.metadata.detection_origin for element in elements} == {"xml"}`


			`def test_partition_xml_from_filename_with_metadata_filename():`
			`elements = partition_xml(`
			`example_doc_path("factbook.xml"), xml_keep_tags=False, metadata_filename="test"`
			`)`

			`assert elements[0].text == "United States"`
			`assert elements[0].metadata.filename == "test"`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_from_file(filename: str):`
			`file_path = example_doc_path(filename)`
			`with open(file_path, "rb") as f:`
			`elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path)`

			`assert elements[0].text == "United States"`
			`assert elements[0].metadata.filename == filename`


			`def test_partition_xml_from_file_with_metadata_filename():`
			`with open(example_doc_path("factbook.xml"), "rb") as f:`
			`elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename="test")`

			`assert elements[0].text == "United States"`
			`assert elements[0].metadata.filename == "test"`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_from_file_rb(filename: str):`
			`file_path = example_doc_path(filename)`
			`with open(file_path, "rb") as f:`
			`elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path)`

			`assert elements[0].text == "United States"`
			`assert elements[0].metadata.filename == filename`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_from_filename_with_tags_default_encoding(filename: str):`
			`file_path = example_doc_path(filename)`
			`elements = partition_xml(filename=file_path, xml_keep_tags=True)`

			`assert "<leader>Joe Biden</leader>" in elements[0].text`
			`assert elements[0].metadata.filename == filename`


			`def test_partition_xml_from_text_with_tags():`
			`with open(example_doc_path("factbook.xml")) as f:`
			`text = f.read()`
			`elements = partition_xml(text=text, xml_keep_tags=True)`

			`assert "<leader>Joe Biden</leader>" in elements[0].text`


			`def test_partition_xml_from_filename_with_tags_raises_encoding_error():`
			`with pytest.raises(UnicodeDecodeError):`
			`partition_xml(example_doc_path("factbook-utf-16.xml"), xml_keep_tags=True, encoding="utf-8")`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_from_file_with_tags_default_encoding(filename: str):`
			`file_path = example_doc_path(filename)`
			`with open(file_path, "rb") as f:`
			`elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)`

			`assert "<leader>Joe Biden</leader>" in elements[0].text`
			`assert elements[0].metadata.filename == filename`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_from_file_rb_with_tags_default_encoding(filename: str):`
			`file_path = example_doc_path(filename)`
			`with open(file_path, "rb") as f:`
			`elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)`

			`assert "<leader>Joe Biden</leader>" in elements[0].text`
			`assert elements[0].metadata.filename == filename`


			`def test_partition_xml_from_file_rb_with_tags_raises_encoding_error():`
			`with pytest.raises(UnicodeDecodeError):`
			`with open(example_doc_path("factbook-utf-16.xml"), "rb") as f:`
			`partition_xml(`
			`file=f,`
			`xml_keep_tags=True,`
			`encoding="utf-8",`
			`)`


rfctr(part): remove double-decoration 2 (#3686) Summary Install new `@apply_metadata()` on PPTX, TSV, XLSX, and XML and remove decoration from PPT. Additional Context - Alphabetical order turns out to be hard, so this is the remaining "easy" delegating partitioner and the remaining principal partitioners. - Replace use of `@process_metadata()` and `@add_metadata_with_filetype()` decorators with `@apply_metadata()` on principal partitioners (those that do not delegate to other partitioners). - Remove all decorators from delegating partitioners (PPT in this case); this removes the "double-decorating". 2024-10-02 11:52:59 -07:00			`# -- .metadata.filetype --------------------------------------------------------------------------`


			`def test_partition_xml_gets_the_XML_mime_type_in_metadata_filetype():`
			`XML_MIME_TYPE = "application/xml"`
			`elements = partition_xml(example_doc_path("factbook.xml"))`
			`assert all(e.metadata.filetype == XML_MIME_TYPE for e in elements), (`
			`f"Expected all elements to have '{XML_MIME_TYPE}' as their filetype, but got:"`
			`f" {repr(elements[0].metadata.filetype)}"`
			`)`


rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) Summary In preparation for pluggable auto-partitioners, simplify metadata as discussed. Additional Context - Pluggable auto-partitioners require partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid passing the proxy partitioner the arguments that the metadata decorators are sensitive to. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00			`# -- .metadata.last_modified ---------------------------------------------------------------------`


			`def test_partition_xml_from_file_path_gets_last_modified_from_filesystem(mocker: MockerFixture):`
			`mocked_last_modification_date = "2029-07-05T09:24:28"`

			`mocker.patch(`
			`"unstructured.partition.xml.get_last_modified_date",`
			`return_value=mocked_last_modification_date,`
			`)`

			`elements = partition_xml(filename="example-docs/factbook.xml")`

			`assert elements[0].metadata.last_modified == mocked_last_modification_date`


			`def test_partition_xml_from_file_gets_last_modified_None():`
			`with open("example-docs/factbook.xml", "rb") as f:`
			`elements = partition_xml(file=f)`

			`assert elements[0].metadata.last_modified is None`


			`def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: MockerFixture):`
			`filesystem_last_modified = "2029-07-05T09:24:28"`
			`metadata_last_modified = "2020-07-05T09:24:28"`

			`mocker.patch(`
			`"unstructured.partition.xml.get_last_modified_date", return_value=filesystem_last_modified`
			`)`

			`elements = partition_xml(`
			`filename="example-docs/factbook.xml",`
			`metadata_last_modified=metadata_last_modified,`
			`)`

			`assert elements[0].metadata.last_modified == metadata_last_modified`


			`def test_partition_xml_from_file_prefers_metadata_last_modified():`
			`with open("example-docs/factbook.xml", "rb") as f:`
			`elements = partition_xml(file=f, metadata_last_modified="2029-07-05T09:24:28")`

			`assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"`


			`# ------------------------------------------------------------------------------------------------`


			`@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])`
			`def test_partition_xml_with_json(filename: str):`
			`file_path = example_doc_path(filename)`
			`elements = partition_xml(filename=file_path, xml_keep_tags=False)`
			`test_elements = partition_json(text=elements_to_json(elements))`

			`assert len(elements) == len(test_elements)`
			`assert elements[0].metadata.page_number == test_elements[0].metadata.page_number`
			`assert elements[0].metadata.filename == test_elements[0].metadata.filename`

			`for i in range(len(elements)):`
			`assert elements[i] == test_elements[i]`


			`def test_partition_xml_with_narrative_line_breaks():`
			`xml_text = """<xml>`
			`<parrot>`
			`<name>Conure</name>`
			`<description>A conure is a very friendly bird.`
			`Conures are feathery and like to dance.`
			`</description>`
			`</parrot>`
			`</xml>"""`

			`elements = partition_xml(text=xml_text)`
			`assert elements[0] == Title("Conure")`
			`assert isinstance(elements[1], NarrativeText)`
			`assert str(elements[1]).startswith("A conure is a very friendly bird.")`
			`assert str(elements[1]).strip().endswith("Conures are feathery and like to dance.")`


			`def test_add_chunking_strategy_on_partition_xml():`
			`file_path = example_doc_path("factbook.xml")`
			`elements = partition_xml(file_path)`
			`chunk_elements = partition_xml(file_path, chunking_strategy="by_title")`
			`chunks = chunk_by_title(elements)`
			`assert chunk_elements != elements`
			`assert chunk_elements == chunks`


			`def test_partition_xml_element_metadata_has_languages():`
			`file_path = example_doc_path("factbook.xml")`
			`elements = partition_xml(file_path)`
			`assert elements[0].metadata.languages == ["eng"]`


			`def test_partition_xml_respects_detect_language_per_element():`
			`elements = partition_xml(`
			`example_doc_path("language-docs/eng_spa_mult.xml"), detect_language_per_element=True`
			`)`
			`langs = [element.metadata.languages for element in elements]`
			`assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]`