From c49df62967036e8a7922b0cf0104df6768a27ced Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructured.io>
Date: Wed, 30 Aug 2023 17:07:10 -0400
Subject: [PATCH] feat: `partition_xml` infers element type on each leaf node
 (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_text`. If
`xml_keep_tags=True`, the file is still treated like a text file and
the raw XML, tags included, is returned as a single `Text` element.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

On `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

On the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
---
 CHANGELOG.md                                  |  7 +-
 docs/source/bricks/partition.rst              |  5 --
 test_unstructured/partition/test_auto.py      |  8 +-
 .../partition/test_xml_partition.py           | 39 +++++++--
 unstructured/__version__.py                   |  2 +-
 unstructured/partition/xml.py                 | 84 ++++++++++++-------
 6 files changed, 96 insertions(+), 49 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ca4e34a2..d353a463f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,11 @@
-## 0.10.10-dev0
+## 0.10.10-dev2
 
 ### Enhancements
+
+* Adds `text` as an input parameter to `partition_xml`.
+* `partition_xml` no longer runs through `partition_text`, avoiding incorrect splitting
+  on carriage returns in the XML. Since `partition_xml` no longer calls `partition_text`,
+  `min_partition` and `max_partition` are no longer supported in `partition_xml`.
 * Bump `unstructured-inference==0.5.18`, change non-default detectron2 classification threshold
 
 ### Features
diff --git a/docs/source/bricks/partition.rst b/docs/source/bricks/partition.rst
index fa991bb4c..22245f202 100644
--- a/docs/source/bricks/partition.rst
+++ b/docs/source/bricks/partition.rst
@@ -877,9 +877,4 @@ If ``xml_keep_tags=True``, the function returns tag information in addition to t
 
   elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False)
 
-``partition_xml`` includes a ``max_partition`` parameter that indicates the maximum character length for a document element.
-The default value is ``1500``, which roughly corresponds to
-the average character length for a paragraph.
-You can disable ``max_partition`` by setting it to ``None``.
-
 For more information about the ``partition_xml`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/xml.py>`_.
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 031a94968..a5dbc94ea 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -644,7 +644,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
 
 
 def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
-    elements = partition(filename=filename, xml_keep_tags=False)
+    elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)
 
     assert elements[0].text == "United States"
     assert elements[0].metadata.filename == "factbook.xml"
@@ -660,15 +660,15 @@ def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
 def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
     elements = partition(filename=filename, xml_keep_tags=True)
 
-    assert elements[5].text == "<leader>Joe Biden</leader>"
-    assert elements[5].metadata.filename == "factbook.xml"
+    assert "<leader>Joe Biden</leader>" in elements[0].text
+    assert elements[0].metadata.filename == "factbook.xml"
 
 
 def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
     with open(filename, "rb") as f:
         elements = partition(file=f, xml_keep_tags=True)
 
-    assert elements[5].text == "<leader>Joe Biden</leader>"
+    assert "<leader>Joe Biden</leader>" in elements[0].text
 
 
 EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
diff --git a/test_unstructured/partition/test_xml_partition.py b/test_unstructured/partition/test_xml_partition.py
index 366c5a1fc..8cf71f707 100644
--- a/test_unstructured/partition/test_xml_partition.py
+++ b/test_unstructured/partition/test_xml_partition.py
@@ -3,6 +3,7 @@ import pathlib
 
 import pytest
 
+from unstructured.documents.elements import NarrativeText, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.xml import partition_xml
 from unstructured.staging.base import elements_to_json
@@ -73,8 +74,17 @@ def test_partition_xml_from_filename_with_tags_default_encoding(filename):
     file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
     elements = partition_xml(filename=file_path, xml_keep_tags=True)
 
-    assert elements[5].text == "<leader>Joe Biden</leader>"
-    assert elements[5].metadata.filename == filename
+    assert "<leader>Joe Biden</leader>" in elements[0].text
+    assert elements[0].metadata.filename == filename
+
+
+def test_partition_xml_from_text_with_tags(filename="example-docs/factbook.xml"):
+    with open(filename) as f:
+        text = f.read()
+    elements = partition_xml(text=text, xml_keep_tags=True, metadata_filename=filename)
+
+    assert "<leader>Joe Biden</leader>" in elements[0].text
+    assert elements[0].metadata.filename == "factbook.xml"
 
 
 @pytest.mark.parametrize(
@@ -96,8 +106,8 @@ def test_partition_xml_from_file_with_tags_default_encoding(filename):
     with open(file_path) as f:
         elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
 
-    assert elements[5].text == "<leader>Joe Biden</leader>"
-    assert elements[5].metadata.filename == filename
+    assert "<leader>Joe Biden</leader>" in elements[0].text
+    assert elements[0].metadata.filename == filename
 
 
 @pytest.mark.parametrize(
@@ -109,8 +119,8 @@ def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
     with open(file_path, "rb") as f:
         elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
 
-    assert elements[5].text == "<leader>Joe Biden</leader>"
-    assert elements[5].metadata.filename == filename
+    assert "<leader>Joe Biden</leader>" in elements[0].text
+    assert elements[0].metadata.filename == filename
 
 
 @pytest.mark.parametrize(
@@ -250,3 +260,20 @@ def test_partition_xml_with_json(filename):
 
     for i in range(len(elements)):
         assert elements[i] == test_elements[i]
+
+
+def test_partition_xml_with_narrative_line_breaks():
+    xml_text = """<xml>
+        <parrot>
+            <name>Conure</name>
+            <description>A conure is a very friendly bird.
+            Conures are feathery and like to dance.
+            </description>
+        </parrot>
+    </xml>"""
+
+    elements = partition_xml(text=xml_text)
+    assert elements[0] == Title("Conure")
+    assert isinstance(elements[1], NarrativeText)
+    assert str(elements[1]).startswith("A conure is a very friendly bird.")
+    assert str(elements[1]).strip().endswith("Conures are feathery and like to dance.")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 9b66a2e0e..14b39f43e 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.10-dev0"  # pragma: no cover
+__version__ = "0.10.10-dev2"  # pragma: no cover
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index dc4a1536e..dd09d02fa 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -2,7 +2,12 @@ import xml.etree.ElementTree as ET
 from tempfile import SpooledTemporaryFile
 from typing import IO, BinaryIO, List, Optional, Union, cast
 
-from unstructured.documents.elements import Element, process_metadata
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Text,
+    process_metadata,
+)
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
@@ -11,7 +16,7 @@ from unstructured.partition.common import (
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import element_from_text
 
 
 def is_leaf(elem):
@@ -25,8 +30,11 @@ def is_string(elem):
 def get_leaf_elements(
     filename: Optional[str] = None,
     file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
+    text: Optional[str] = None,
     xml_path: str = ".",
-):
+    xml_keep_tags: bool = False,
+) -> List[Optional[str]]:
+    exactly_one(filename=filename, file=file, text=text)
     if filename:
         _, raw_text = read_txt_file(filename=filename)
     elif file:
@@ -34,8 +42,8 @@ def get_leaf_elements(
             cast(Union[BinaryIO, SpooledTemporaryFile], file),
         )
         _, raw_text = read_txt_file(file=f)
-    else:
-        raise ValueError("Either 'filename' or 'file' must be provided.")
+    elif text:
+        raw_text = text
 
     root = ET.fromstring(raw_text)
     leaf_elements = []
@@ -45,7 +53,7 @@ def get_leaf_elements(
             if is_leaf(subelem) and is_string(subelem.text):
                 leaf_elements.append(subelem.text)
 
-    return "\n".join(leaf_elements)  # type: ignore
+    return leaf_elements
 
 
 @process_metadata()
@@ -53,13 +61,12 @@ def get_leaf_elements(
 def partition_xml(
     filename: Optional[str] = None,
     file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
+    text: Optional[str] = None,
     xml_keep_tags: bool = False,
     xml_path: str = ".",
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
     encoding: Optional[str] = None,
-    max_partition: Optional[int] = 1500,
-    min_partition: Optional[int] = 0,
     metadata_last_modified: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
@@ -71,6 +78,8 @@ def partition_xml(
         A string defining the target filename path.
     file
         A file-like object using "rb" mode --> open(filename, "rb").
+    text
+        The text of the XML file
     xml_keep_tags
         If True, will retain the XML tags in the output. Otherwise it will simply extract
         the text from within the tags.
@@ -81,15 +90,26 @@ def partition_xml(
     include_metadata
         Determines whether or not metadata is included in the metadata attribute on the
         elements in the output.
-    max_partition
-        The maximum number of characters to include in a partition. If None is passed,
-        no maximum is applied.
-    min_partition
-        The minimum number of characters to include in a partition.
     metadata_last_modified
         The day of the last modification
     """
-    exactly_one(filename=filename, file=file)
+    exactly_one(filename=filename, file=file, text=text)
+    elements: List[Element] = []
+
+    last_modification_date = None
+    if filename:
+        last_modification_date = get_last_modified_date(filename)
+    elif file:
+        last_modification_date = get_last_modified_date_from_file(file)
+
+    metadata = (
+        ElementMetadata(
+            filename=metadata_filename or filename,
+            last_modified=metadata_last_modified or last_modification_date,
+        )
+        if include_metadata
+        else ElementMetadata()
+    )
 
     if xml_keep_tags:
         if filename:
@@ -99,24 +119,24 @@ def partition_xml(
                 cast(Union[BinaryIO, SpooledTemporaryFile], file),
             )
             _, raw_text = read_txt_file(file=f, encoding=encoding)
-        else:
-            raise ValueError("Either 'filename' or 'file' must be provided.")
+        elif text:
+            raw_text = text
+
+        elements = [
+            Text(text=raw_text, metadata=metadata),
+        ]
+
     else:
-        raw_text = get_leaf_elements(filename=filename, file=file, xml_path=xml_path)
-
-    last_modification_date = None
-    if filename:
-        last_modification_date = get_last_modified_date(filename)
-    elif file:
-        last_modification_date = get_last_modified_date_from_file(file)
-
-    elements = partition_text(
-        text=raw_text,
-        metadata_filename=metadata_filename,
-        include_metadata=include_metadata,
-        max_partition=max_partition,
-        min_partition=min_partition,
-        metadata_last_modified=metadata_last_modified or last_modification_date,
-    )
+        leaf_elements = get_leaf_elements(
+            filename=filename,
+            file=file,
+            text=text,
+            xml_path=xml_path,
+        )
+        for leaf_element in leaf_elements:
+            if leaf_element:
+                element = element_from_text(leaf_element)
+                element.metadata = metadata
+                elements.append(element)
 
     return elements