feat: partition_xml infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_text`. If
`xml_keep_tags=True`, the file is still treated like a text file and
its raw contents are returned as a single `Text` element.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

On `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

On the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
This commit is contained in:
Matt Robinson 2023-08-30 17:07:10 -04:00 committed by GitHub
parent de855bb4ed
commit c49df62967
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 96 additions and 49 deletions

View File

@ -1,6 +1,11 @@
## 0.10.10-dev0 ## 0.10.10-dev2
### Enhancements ### Enhancements
* Adds `text` as an input parameter to `partition_xml`.
* `partition_xml` no longer runs through `partition_text`, avoiding incorrect splitting
on carriage returns in the XML. Since `partition_xml` no longer calls `partition_text`,
`min_partition` and `max_partition` are no longer supported in `partition_xml`.
* Bump `unstructured-inference==0.5.18`, change non-default detectron2 classification threshold * Bump `unstructured-inference==0.5.18`, change non-default detectron2 classification threshold
### Features ### Features

View File

@ -877,9 +877,4 @@ If ``xml_keep_tags=True``, the function returns tag information in addition to t
elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False) elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False)
``partition_xml`` includes a ``max_partition`` parameter that indicates the maximum character length for a document element.
The default value is ``1500``, which roughly corresponds to
the average character length for a paragraph.
You can disable ``max_partition`` by setting it to ``None``.
For more information about the ``partition_xml`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/xml.py>`_. For more information about the ``partition_xml`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/xml.py>`_.

View File

@ -644,7 +644,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"): def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
elements = partition(filename=filename, xml_keep_tags=False) elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)
assert elements[0].text == "United States" assert elements[0].text == "United States"
assert elements[0].metadata.filename == "factbook.xml" assert elements[0].metadata.filename == "factbook.xml"
@ -660,15 +660,15 @@ def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"): def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
elements = partition(filename=filename, xml_keep_tags=True) elements = partition(filename=filename, xml_keep_tags=True)
assert elements[5].text == "<leader>Joe Biden</leader>" assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[5].metadata.filename == "factbook.xml" assert elements[0].metadata.filename == "factbook.xml"
def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"): def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition(file=f, xml_keep_tags=True) elements = partition(file=f, xml_keep_tags=True)
assert elements[5].text == "<leader>Joe Biden</leader>" assert "<leader>Joe Biden</leader>" in elements[0].text
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

View File

@ -3,6 +3,7 @@ import pathlib
import pytest import pytest
from unstructured.documents.elements import NarrativeText, Title
from unstructured.partition.json import partition_json from unstructured.partition.json import partition_json
from unstructured.partition.xml import partition_xml from unstructured.partition.xml import partition_xml
from unstructured.staging.base import elements_to_json from unstructured.staging.base import elements_to_json
@ -73,8 +74,17 @@ def test_partition_xml_from_filename_with_tags_default_encoding(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
elements = partition_xml(filename=file_path, xml_keep_tags=True) elements = partition_xml(filename=file_path, xml_keep_tags=True)
assert elements[5].text == "<leader>Joe Biden</leader>" assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[5].metadata.filename == filename assert elements[0].metadata.filename == filename
def test_partition_xml_from_text_with_tags(filename="example-docs/factbook.xml"):
with open(filename) as f:
text = f.read()
elements = partition_xml(text=text, xml_keep_tags=True, metadata_filename=filename)
assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[0].metadata.filename == "factbook.xml"
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -96,8 +106,8 @@ def test_partition_xml_from_file_with_tags_default_encoding(filename):
with open(file_path) as f: with open(file_path) as f:
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path) elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
assert elements[5].text == "<leader>Joe Biden</leader>" assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[5].metadata.filename == filename assert elements[0].metadata.filename == filename
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -109,8 +119,8 @@ def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
with open(file_path, "rb") as f: with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path) elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
assert elements[5].text == "<leader>Joe Biden</leader>" assert "<leader>Joe Biden</leader>" in elements[0].text
assert elements[5].metadata.filename == filename assert elements[0].metadata.filename == filename
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -250,3 +260,20 @@ def test_partition_xml_with_json(filename):
for i in range(len(elements)): for i in range(len(elements)):
assert elements[i] == test_elements[i] assert elements[i] == test_elements[i]
def test_partition_xml_with_narrative_line_breaks():
xml_text = """<xml>
<parrot>
<name>Conure</name>
<description>A conure is a very friendly bird.
Conures are feathery and like to dance.
</description>
</parrot>
</xml>"""
elements = partition_xml(text=xml_text)
assert elements[0] == Title("Conure")
assert isinstance(elements[1], NarrativeText)
assert str(elements[1]).startswith("A conure is a very friendly bird.")
assert str(elements[1]).strip().endswith("Conures are feathery and like to dance.")

View File

@ -1 +1 @@
__version__ = "0.10.10-dev0" # pragma: no cover __version__ = "0.10.10-dev2" # pragma: no cover

View File

@ -2,7 +2,12 @@ import xml.etree.ElementTree as ET
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast from typing import IO, BinaryIO, List, Optional, Union, cast
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import (
Element,
ElementMetadata,
Text,
process_metadata,
)
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import ( from unstructured.partition.common import (
@ -11,7 +16,7 @@ from unstructured.partition.common import (
get_last_modified_date_from_file, get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed, spooled_to_bytes_io_if_needed,
) )
from unstructured.partition.text import partition_text from unstructured.partition.text import element_from_text
def is_leaf(elem): def is_leaf(elem):
@ -25,8 +30,11 @@ def is_string(elem):
def get_leaf_elements( def get_leaf_elements(
filename: Optional[str] = None, filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
text: Optional[str] = None,
xml_path: str = ".", xml_path: str = ".",
): xml_keep_tags: bool = False,
) -> List[Optional[str]]:
exactly_one(filename=filename, file=file, text=text)
if filename: if filename:
_, raw_text = read_txt_file(filename=filename) _, raw_text = read_txt_file(filename=filename)
elif file: elif file:
@ -34,8 +42,8 @@ def get_leaf_elements(
cast(Union[BinaryIO, SpooledTemporaryFile], file), cast(Union[BinaryIO, SpooledTemporaryFile], file),
) )
_, raw_text = read_txt_file(file=f) _, raw_text = read_txt_file(file=f)
else: elif text:
raise ValueError("Either 'filename' or 'file' must be provided.") raw_text = text
root = ET.fromstring(raw_text) root = ET.fromstring(raw_text)
leaf_elements = [] leaf_elements = []
@ -45,7 +53,7 @@ def get_leaf_elements(
if is_leaf(subelem) and is_string(subelem.text): if is_leaf(subelem) and is_string(subelem.text):
leaf_elements.append(subelem.text) leaf_elements.append(subelem.text)
return "\n".join(leaf_elements) # type: ignore return leaf_elements
@process_metadata() @process_metadata()
@ -53,13 +61,12 @@ def get_leaf_elements(
def partition_xml( def partition_xml(
filename: Optional[str] = None, filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
text: Optional[str] = None,
xml_keep_tags: bool = False, xml_keep_tags: bool = False,
xml_path: str = ".", xml_path: str = ".",
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
include_metadata: bool = True, include_metadata: bool = True,
encoding: Optional[str] = None, encoding: Optional[str] = None,
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
@ -71,6 +78,8 @@ def partition_xml(
A string defining the target filename path. A string defining the target filename path.
file file
A file-like object using "rb" mode --> open(filename, "rb"). A file-like object using "rb" mode --> open(filename, "rb").
text
The text of the XML file
xml_keep_tags xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. the text from within the tags.
@ -81,15 +90,26 @@ def partition_xml(
include_metadata include_metadata
Determines whether or not metadata is included in the metadata attribute on the Determines whether or not metadata is included in the metadata attribute on the
elements in the output. elements in the output.
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
min_partition
The minimum number of characters to include in a partition.
metadata_last_modified metadata_last_modified
The day of the last modification The day of the last modification
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file, text=text)
elements: List[Element] = []
last_modification_date = None
if filename:
last_modification_date = get_last_modified_date(filename)
elif file:
last_modification_date = get_last_modified_date_from_file(file)
metadata = (
ElementMetadata(
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
)
if include_metadata
else ElementMetadata()
)
if xml_keep_tags: if xml_keep_tags:
if filename: if filename:
@ -99,24 +119,24 @@ def partition_xml(
cast(Union[BinaryIO, SpooledTemporaryFile], file), cast(Union[BinaryIO, SpooledTemporaryFile], file),
) )
_, raw_text = read_txt_file(file=f, encoding=encoding) _, raw_text = read_txt_file(file=f, encoding=encoding)
else: elif text:
raise ValueError("Either 'filename' or 'file' must be provided.") raw_text = text
else:
raw_text = get_leaf_elements(filename=filename, file=file, xml_path=xml_path)
last_modification_date = None elements = [
if filename: Text(text=raw_text, metadata=metadata),
last_modification_date = get_last_modified_date(filename) ]
elif file:
last_modification_date = get_last_modified_date_from_file(file)
elements = partition_text( else:
text=raw_text, leaf_elements = get_leaf_elements(
metadata_filename=metadata_filename, filename=filename,
include_metadata=include_metadata, file=file,
max_partition=max_partition, text=text,
min_partition=min_partition, xml_path=xml_path,
metadata_last_modified=metadata_last_modified or last_modification_date,
) )
for leaf_element in leaf_elements:
if leaf_element:
element = element_from_text(leaf_element)
element.metadata = metadata
elements.append(element)
return elements return elements