unstructured/test_unstructured/partition/test_xml_partition.py
Matt Robinson c49df62967
feat: partition_xml infers element type on each leaf node (#1249)
### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_xml`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

One `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

One the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
2023-08-30 17:07:10 -04:00

280 lines
9.1 KiB
Python

import os
import pathlib
import pytest
from unstructured.documents.elements import NarrativeText, Title
from unstructured.partition.json import partition_json
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import elements_to_json
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_filename(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
elements = partition_xml(filename=file_path, xml_keep_tags=False)
assert elements[0].text == "United States"
assert elements[0].metadata.filename == filename
def test_partition_xml_from_filename_with_metadata_filename():
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "factbook.xml")
elements = partition_xml(filename=file_path, xml_keep_tags=False, metadata_filename="test")
assert elements[0].text == "United States"
assert elements[0].metadata.filename == "test"
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_file(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path)
assert elements[0].text == "United States"
assert elements[0].metadata.filename == filename
def test_partition_xml_from_file_with_metadata_filename():
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "factbook.xml")
with open(file_path) as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename="test")
assert elements[0].text == "United States"
assert elements[0].metadata.filename == "test"
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_file_rb(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path)
assert elements[0].text == "United States"
assert elements[0].metadata.filename == filename
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_filename_with_tags_default_encoding(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
elements = partition_xml(filename=file_path, xml_keep_tags=True)
assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[0].metadata.filename == filename
def test_partition_xml_from_text_with_tags(filename="example-docs/factbook.xml"):
with open(filename) as f:
text = f.read()
elements = partition_xml(text=text, xml_keep_tags=True, metadata_filename=filename)
assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[0].metadata.filename == "factbook.xml"
@pytest.mark.parametrize(
("filename", "encoding", "error"),
[("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_filename_with_tags_raises_encoding_error(filename, encoding, error):
with pytest.raises(error):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
partition_xml(filename=file_path, xml_keep_tags=True, encoding=encoding)
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_file_with_tags_default_encoding(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[0].metadata.filename == filename
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[0].metadata.filename == filename
@pytest.mark.parametrize(
("filename", "encoding", "error"),
[("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_file_rb_with_tags_raises_encoding_error(filename, encoding, error):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with pytest.raises(error), open(file_path, "rb") as f:
partition_xml(
file=f,
xml_keep_tags=True,
metadata_filename=file_path,
encoding=encoding,
)
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_filename_exclude_metadata(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
elements = partition_xml(filename=file_path, xml_keep_tags=False, include_metadata=False)
assert elements[0].text == "United States"
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_from_file_exclude_metadata(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
elements = partition_xml(
file=f,
xml_keep_tags=False,
metadata_filename=file_path,
include_metadata=False,
)
assert elements[0].text == "United States"
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_xml_metadata_date(
mocker,
filename="example-docs/factbook.xml",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.xml.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_xml(
filename=filename,
)
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_xml_with_custom_metadata_date(
mocker,
filename="example-docs/factbook.xml",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.xml.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_xml(
filename=filename,
metadata_last_modified=expected_last_modification_date,
)
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_xml_from_file_metadata_date(
mocker,
filename="example-docs/factbook.xml",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.xml.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
elements = partition_xml(
file=f,
)
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_xml_from_file_with_custom_metadata_date(
mocker,
filename="example-docs/factbook.xml",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.xml.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
elements = partition_xml(file=f, metadata_last_modified=expected_last_modification_date)
assert elements[0].metadata.last_modified == expected_last_modification_date
@pytest.mark.parametrize(
"filename",
["factbook.xml", "factbook-utf-16.xml"],
)
def test_partition_xml_with_json(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
elements = partition_xml(filename=file_path, xml_keep_tags=False)
test_elements = partition_json(text=elements_to_json(elements))
assert len(elements) == len(test_elements)
assert elements[0].metadata.page_number == test_elements[0].metadata.page_number
assert elements[0].metadata.filename == test_elements[0].metadata.filename
for i in range(len(elements)):
assert elements[i] == test_elements[i]
def test_partition_xml_with_narrative_line_breaks():
xml_text = """<xml>
<parrot>
<name>Conure</name>
<description>A conure is a very friendly bird.
Conures are feathery and like to dance.
</description>
</parrot>
</xml>"""
elements = partition_xml(text=xml_text)
assert elements[0] == Title("Conure")
assert isinstance(elements[1], NarrativeText)
assert str(elements[1]).startswith("A conure is a very friendly bird.")
assert str(elements[1]).strip().endswith("Conures are feathery and like to dance.")