From c49df62967036e8a7922b0cf0104df6768a27ced Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 30 Aug 2023 17:07:10 -0400 Subject: [PATCH] feat: `partition_xml` infers element type on each leaf node (#1249) ### Summary Closes #1229. Updates `partition_xml` so that the element type is inferred on each leaf node when `xml_keep_tags=False` instead of delegating splitting and partitioning to `partition_text`. If `xml_keep_tags=True`, the file is still treated like a text file and its raw text is returned as a single `Text` element. Also adds the option to pass `text` as an input to `partition_xml`. ### Testing Create a `parrots.xml` file that looks like: ```xml ConureA conure is a very friendly bird. Conures are feathery and like to dance. ``` Run: ```python from unstructured.partition.xml import partition_xml from unstructured.staging.base import convert_to_dict elements = partition_xml(filename="parrots.xml") convert_to_dict(elements) ``` On `main`, the output is the following. Notice how the text of the first tag (`Conure`) incorrectly gets merged with the text of the tag that follows it in the first element. ```python [{'element_id': '7ae4074435df8dfcefcf24a4e6c52026', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'Conure A conure is a very friendly bird.', 'type': 'NarrativeText'}, {'element_id': '859ecb332da6961acd2fb6a0185d1549', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'Conures are feathery and like to dance.', 'type': 'NarrativeText'}] ``` On the feature branch, the output is the following, and the tags are correctly separated. 
```python [{'element_id': '5512218914e4eeacf71a9cd42c373710', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'Conure', 'type': 'Title'}, {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'A conure is a very friendly bird.\n' '\n' 'Conures are feathery and like to dance.', 'type': 'NarrativeText'}] ``` --- CHANGELOG.md | 7 +- docs/source/bricks/partition.rst | 5 -- test_unstructured/partition/test_auto.py | 8 +- .../partition/test_xml_partition.py | 39 +++++++-- unstructured/__version__.py | 2 +- unstructured/partition/xml.py | 84 ++++++++++++------- 6 files changed, 96 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ca4e34a2..d353a463f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ -## 0.10.10-dev0 +## 0.10.10-dev2 ### Enhancements + +* Adds `text` as an input parameter to `partition_xml`. +* `partition_xml` no longer runs through `partition_text`, avoiding incorrect splitting + on carriage returns in the XML. Since `partition_xml` no longer calls `partition_text`, + `min_partition` and `max_partition` are no longer supported in `partition_xml`. * Bump `unstructured-inference==0.5.18`, change non-default detectron2 classification threshold ### Features diff --git a/docs/source/bricks/partition.rst b/docs/source/bricks/partition.rst index fa991bb4c..22245f202 100644 --- a/docs/source/bricks/partition.rst +++ b/docs/source/bricks/partition.rst @@ -877,9 +877,4 @@ If ``xml_keep_tags=True``, the function returns tag information in addition to t elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False) -``partition_xml`` includes a ``max_partition`` parameter that indicates the maximum character length for a document element. 
-The default value is ``1500``, which roughly corresponds to -the average character length for a paragraph. -You can disable ``max_partition`` by setting it to ``None``. - For more information about the ``partition_xml`` brick, you can check the `source code here `_. diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 031a94968..a5dbc94ea 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -644,7 +644,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType): def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"): - elements = partition(filename=filename, xml_keep_tags=False) + elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename) assert elements[0].text == "United States" assert elements[0].metadata.filename == "factbook.xml" @@ -660,15 +660,15 @@ def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"): def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"): elements = partition(filename=filename, xml_keep_tags=True) - assert elements[5].text == "Joe Biden" - assert elements[5].metadata.filename == "factbook.xml" + assert "Joe Biden" in elements[0].text + assert elements[0].metadata.filename == "factbook.xml" def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"): with open(filename, "rb") as f: elements = partition(file=f, xml_keep_tags=True) - assert elements[5].text == "Joe Biden" + assert "Joe Biden" in elements[0].text EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" diff --git a/test_unstructured/partition/test_xml_partition.py b/test_unstructured/partition/test_xml_partition.py index 366c5a1fc..8cf71f707 100644 --- a/test_unstructured/partition/test_xml_partition.py +++ b/test_unstructured/partition/test_xml_partition.py @@ -3,6 +3,7 @@ import pathlib 
import pytest +from unstructured.documents.elements import NarrativeText, Title from unstructured.partition.json import partition_json from unstructured.partition.xml import partition_xml from unstructured.staging.base import elements_to_json @@ -73,8 +74,17 @@ def test_partition_xml_from_filename_with_tags_default_encoding(filename): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) elements = partition_xml(filename=file_path, xml_keep_tags=True) - assert elements[5].text == "Joe Biden" - assert elements[5].metadata.filename == filename + assert "Joe Biden" in elements[0].text + assert elements[0].metadata.filename == filename + + +def test_partition_xml_from_text_with_tags(filename="example-docs/factbook.xml"): + with open(filename) as f: + text = f.read() + elements = partition_xml(text=text, xml_keep_tags=True, metadata_filename=filename) + + assert "Joe Biden" in elements[0].text + assert elements[0].metadata.filename == "factbook.xml" @pytest.mark.parametrize( @@ -96,8 +106,8 @@ def test_partition_xml_from_file_with_tags_default_encoding(filename): with open(file_path) as f: elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path) - assert elements[5].text == "Joe Biden" - assert elements[5].metadata.filename == filename + assert "Joe Biden" in elements[0].text + assert elements[0].metadata.filename == filename @pytest.mark.parametrize( @@ -109,8 +119,8 @@ def test_partition_xml_from_file_rb_with_tags_default_encoding(filename): with open(file_path, "rb") as f: elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path) - assert elements[5].text == "Joe Biden" - assert elements[5].metadata.filename == filename + assert "Joe Biden" in elements[0].text + assert elements[0].metadata.filename == filename @pytest.mark.parametrize( @@ -250,3 +260,20 @@ def test_partition_xml_with_json(filename): for i in range(len(elements)): assert elements[i] == test_elements[i] + + +def 
test_partition_xml_with_narrative_line_breaks(): + xml_text = """ + + Conure + A conure is a very friendly bird. + Conures are feathery and like to dance. + + + """ + + elements = partition_xml(text=xml_text) + assert elements[0] == Title("Conure") + assert isinstance(elements[1], NarrativeText) + assert str(elements[1]).startswith("A conure is a very friendly bird.") + assert str(elements[1]).strip().endswith("Conures are feathery and like to dance.") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9b66a2e0e..14b39f43e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.10-dev0" # pragma: no cover +__version__ = "0.10.10-dev2" # pragma: no cover diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index dc4a1536e..dd09d02fa 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -2,7 +2,12 @@ import xml.etree.ElementTree as ET from tempfile import SpooledTemporaryFile from typing import IO, BinaryIO, List, Optional, Union, cast -from unstructured.documents.elements import Element, process_metadata +from unstructured.documents.elements import ( + Element, + ElementMetadata, + Text, + process_metadata, +) from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( @@ -11,7 +16,7 @@ from unstructured.partition.common import ( get_last_modified_date_from_file, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.text import partition_text +from unstructured.partition.text import element_from_text def is_leaf(elem): @@ -25,8 +30,11 @@ def is_string(elem): def get_leaf_elements( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, + text: Optional[str] = None, xml_path: str = ".", -): + xml_keep_tags: bool = False, +) -> List[Optional[str]]: + 
exactly_one(filename=filename, file=file, text=text) if filename: _, raw_text = read_txt_file(filename=filename) elif file: @@ -34,8 +42,8 @@ def get_leaf_elements( cast(Union[BinaryIO, SpooledTemporaryFile], file), ) _, raw_text = read_txt_file(file=f) - else: - raise ValueError("Either 'filename' or 'file' must be provided.") + elif text: + raw_text = text root = ET.fromstring(raw_text) leaf_elements = [] @@ -45,7 +53,7 @@ def get_leaf_elements( if is_leaf(subelem) and is_string(subelem.text): leaf_elements.append(subelem.text) - return "\n".join(leaf_elements) # type: ignore + return leaf_elements @process_metadata() @@ -53,13 +61,12 @@ def get_leaf_elements( def partition_xml( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, + text: Optional[str] = None, xml_keep_tags: bool = False, xml_path: str = ".", metadata_filename: Optional[str] = None, include_metadata: bool = True, encoding: Optional[str] = None, - max_partition: Optional[int] = 1500, - min_partition: Optional[int] = 0, metadata_last_modified: Optional[str] = None, **kwargs, ) -> List[Element]: @@ -71,6 +78,8 @@ def partition_xml( A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + text + The text of the XML file xml_keep_tags If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. @@ -81,15 +90,26 @@ def partition_xml( include_metadata Determines whether or not metadata is included in the metadata attribute on the elements in the output. - max_partition - The maximum number of characters to include in a partition. If None is passed, - no maximum is applied. - min_partition - The minimum number of characters to include in a partition. 
metadata_last_modified The day of the last modification """ - exactly_one(filename=filename, file=file) + exactly_one(filename=filename, file=file, text=text) + elements: List[Element] = [] + + last_modification_date = None + if filename: + last_modification_date = get_last_modified_date(filename) + elif file: + last_modification_date = get_last_modified_date_from_file(file) + + metadata = ( + ElementMetadata( + filename=metadata_filename or filename, + last_modified=metadata_last_modified or last_modification_date, + ) + if include_metadata + else ElementMetadata() + ) if xml_keep_tags: if filename: @@ -99,24 +119,24 @@ def partition_xml( cast(Union[BinaryIO, SpooledTemporaryFile], file), ) _, raw_text = read_txt_file(file=f, encoding=encoding) - else: - raise ValueError("Either 'filename' or 'file' must be provided.") + elif text: + raw_text = text + + elements = [ + Text(text=raw_text, metadata=metadata), + ] + else: - raw_text = get_leaf_elements(filename=filename, file=file, xml_path=xml_path) - - last_modification_date = None - if filename: - last_modification_date = get_last_modified_date(filename) - elif file: - last_modification_date = get_last_modified_date_from_file(file) - - elements = partition_text( - text=raw_text, - metadata_filename=metadata_filename, - include_metadata=include_metadata, - max_partition=max_partition, - min_partition=min_partition, - metadata_last_modified=metadata_last_modified or last_modification_date, - ) + leaf_elements = get_leaf_elements( + filename=filename, + file=file, + text=text, + xml_path=xml_path, + ) + for leaf_element in leaf_elements: + if leaf_element: + element = element_from_text(leaf_element) + element.metadata = metadata + elements.append(element) return elements