mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 12:49:12 +00:00
feat: partition_xml
infers element type on each leaf node (#1249)
### Summary Closes #1229. Updates `partition_xml` so that the element type is inferred on each leaf node when `xml_keep_tags=False` instead of delegating splitting and partitioning to `partition_text`. If `xml_keep_tags=True`, the file is still treated like a text file and partitioning is still delegated to `partition_text`. Also adds the option to pass `text` as an input to `partition_xml`. ### Testing Create a `parrots.xml` file that looks like: ```xml <xml><parrot><name>Conure</name><description>A conure is a very friendly bird. Conures are feathery and like to dance.</description></parrot></xml> ``` Run: ```python from unstructured.partition.xml import partition_xml from unstructured.staging.base import convert_to_dict elements = partition_xml(filename="parrots.xml") convert_to_dict(elements) ``` On `main`, the output is the following. Notice how the `<name>` tag incorrectly gets merged into `<description>` in the first element. ```python [{'element_id': '7ae4074435df8dfcefcf24a4e6c52026', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'Conure A conure is a very friendly bird.', 'type': 'NarrativeText'}, {'element_id': '859ecb332da6961acd2fb6a0185d1549', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'Conures are feathery and like to dance.', 'type': 'NarrativeText'}] ``` On the feature branch, the output is the following, and the tags are correctly separated. 
```python [{'element_id': '5512218914e4eeacf71a9cd42c373710', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'Conure', 'type': 'Title'}, {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85', 'metadata': {'file_directory': '/home/matt/tmp', 'filename': 'parrots.xml', 'filetype': 'application/xml', 'last_modified': '2023-08-30T14:21:38'}, 'text': 'A conure is a very friendly bird.\n' '\n' 'Conures are feathery and like to dance.', 'type': 'NarrativeText'}] ```
This commit is contained in:
parent
de855bb4ed
commit
c49df62967
@ -1,6 +1,11 @@
|
|||||||
## 0.10.10-dev0
|
## 0.10.10-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* Adds `text` as an input parameter to `partition_xml`.
|
||||||
|
* `partition_xml` no longer runs through `partition_text`, avoiding incorrect splitting
|
||||||
|
on carriage returns in the XML. Since `partition_xml` no longer calls `partition_text`,
|
||||||
|
`min_partition` and `max_partition` are no longer supported in `partition_xml`.
|
||||||
* Bump `unstructured-inference==0.5.18`, change non-default detectron2 classification threshold
|
* Bump `unstructured-inference==0.5.18`, change non-default detectron2 classification threshold
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
@ -877,9 +877,4 @@ If ``xml_keep_tags=True``, the function returns tag information in addition to t
|
|||||||
|
|
||||||
elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False)
|
elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False)
|
||||||
|
|
||||||
``partition_xml`` includes a ``max_partition`` parameter that indicates the maximum character length for a document element.
|
|
||||||
The default value is ``1500``, which roughly corresponds to
|
|
||||||
the average character length for a paragraph.
|
|
||||||
You can disable ``max_partition`` by setting it to ``None``.
|
|
||||||
|
|
||||||
For more information about the ``partition_xml`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/xml.py>`_.
|
For more information about the ``partition_xml`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/xml.py>`_.
|
||||||
|
@ -644,7 +644,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
|
|||||||
|
|
||||||
|
|
||||||
def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
|
def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
|
||||||
elements = partition(filename=filename, xml_keep_tags=False)
|
elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)
|
||||||
|
|
||||||
assert elements[0].text == "United States"
|
assert elements[0].text == "United States"
|
||||||
assert elements[0].metadata.filename == "factbook.xml"
|
assert elements[0].metadata.filename == "factbook.xml"
|
||||||
@ -660,15 +660,15 @@ def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
|
|||||||
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
|
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
|
||||||
elements = partition(filename=filename, xml_keep_tags=True)
|
elements = partition(filename=filename, xml_keep_tags=True)
|
||||||
|
|
||||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
||||||
assert elements[5].metadata.filename == "factbook.xml"
|
assert elements[0].metadata.filename == "factbook.xml"
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
|
def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition(file=f, xml_keep_tags=True)
|
elements = partition(file=f, xml_keep_tags=True)
|
||||||
|
|
||||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
||||||
|
|
||||||
|
|
||||||
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
@ -3,6 +3,7 @@ import pathlib
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.documents.elements import NarrativeText, Title
|
||||||
from unstructured.partition.json import partition_json
|
from unstructured.partition.json import partition_json
|
||||||
from unstructured.partition.xml import partition_xml
|
from unstructured.partition.xml import partition_xml
|
||||||
from unstructured.staging.base import elements_to_json
|
from unstructured.staging.base import elements_to_json
|
||||||
@ -73,8 +74,17 @@ def test_partition_xml_from_filename_with_tags_default_encoding(filename):
|
|||||||
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||||
elements = partition_xml(filename=file_path, xml_keep_tags=True)
|
elements = partition_xml(filename=file_path, xml_keep_tags=True)
|
||||||
|
|
||||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
||||||
assert elements[5].metadata.filename == filename
|
assert elements[0].metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_xml_from_text_with_tags(filename="example-docs/factbook.xml"):
|
||||||
|
with open(filename) as f:
|
||||||
|
text = f.read()
|
||||||
|
elements = partition_xml(text=text, xml_keep_tags=True, metadata_filename=filename)
|
||||||
|
|
||||||
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
||||||
|
assert elements[0].metadata.filename == "factbook.xml"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -96,8 +106,8 @@ def test_partition_xml_from_file_with_tags_default_encoding(filename):
|
|||||||
with open(file_path) as f:
|
with open(file_path) as f:
|
||||||
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
|
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
|
||||||
|
|
||||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
||||||
assert elements[5].metadata.filename == filename
|
assert elements[0].metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -109,8 +119,8 @@ def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
|
|||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as f:
|
||||||
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
|
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
|
||||||
|
|
||||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
||||||
assert elements[5].metadata.filename == filename
|
assert elements[0].metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -250,3 +260,20 @@ def test_partition_xml_with_json(filename):
|
|||||||
|
|
||||||
for i in range(len(elements)):
|
for i in range(len(elements)):
|
||||||
assert elements[i] == test_elements[i]
|
assert elements[i] == test_elements[i]
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_xml_with_narrative_line_breaks():
|
||||||
|
xml_text = """<xml>
|
||||||
|
<parrot>
|
||||||
|
<name>Conure</name>
|
||||||
|
<description>A conure is a very friendly bird.
|
||||||
|
Conures are feathery and like to dance.
|
||||||
|
</description>
|
||||||
|
</parrot>
|
||||||
|
</xml>"""
|
||||||
|
|
||||||
|
elements = partition_xml(text=xml_text)
|
||||||
|
assert elements[0] == Title("Conure")
|
||||||
|
assert isinstance(elements[1], NarrativeText)
|
||||||
|
assert str(elements[1]).startswith("A conure is a very friendly bird.")
|
||||||
|
assert str(elements[1]).strip().endswith("Conures are feathery and like to dance.")
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.10.10-dev0" # pragma: no cover
|
__version__ = "0.10.10-dev2" # pragma: no cover
|
||||||
|
@ -2,7 +2,12 @@ import xml.etree.ElementTree as ET
|
|||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||||
|
|
||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import (
|
||||||
|
Element,
|
||||||
|
ElementMetadata,
|
||||||
|
Text,
|
||||||
|
process_metadata,
|
||||||
|
)
|
||||||
from unstructured.file_utils.encoding import read_txt_file
|
from unstructured.file_utils.encoding import read_txt_file
|
||||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common import (
|
||||||
@ -11,7 +16,7 @@ from unstructured.partition.common import (
|
|||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
spooled_to_bytes_io_if_needed,
|
spooled_to_bytes_io_if_needed,
|
||||||
)
|
)
|
||||||
from unstructured.partition.text import partition_text
|
from unstructured.partition.text import element_from_text
|
||||||
|
|
||||||
|
|
||||||
def is_leaf(elem):
|
def is_leaf(elem):
|
||||||
@ -25,8 +30,11 @@ def is_string(elem):
|
|||||||
def get_leaf_elements(
|
def get_leaf_elements(
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
||||||
|
text: Optional[str] = None,
|
||||||
xml_path: str = ".",
|
xml_path: str = ".",
|
||||||
):
|
xml_keep_tags: bool = False,
|
||||||
|
) -> List[Optional[str]]:
|
||||||
|
exactly_one(filename=filename, file=file, text=text)
|
||||||
if filename:
|
if filename:
|
||||||
_, raw_text = read_txt_file(filename=filename)
|
_, raw_text = read_txt_file(filename=filename)
|
||||||
elif file:
|
elif file:
|
||||||
@ -34,8 +42,8 @@ def get_leaf_elements(
|
|||||||
cast(Union[BinaryIO, SpooledTemporaryFile], file),
|
cast(Union[BinaryIO, SpooledTemporaryFile], file),
|
||||||
)
|
)
|
||||||
_, raw_text = read_txt_file(file=f)
|
_, raw_text = read_txt_file(file=f)
|
||||||
else:
|
elif text:
|
||||||
raise ValueError("Either 'filename' or 'file' must be provided.")
|
raw_text = text
|
||||||
|
|
||||||
root = ET.fromstring(raw_text)
|
root = ET.fromstring(raw_text)
|
||||||
leaf_elements = []
|
leaf_elements = []
|
||||||
@ -45,7 +53,7 @@ def get_leaf_elements(
|
|||||||
if is_leaf(subelem) and is_string(subelem.text):
|
if is_leaf(subelem) and is_string(subelem.text):
|
||||||
leaf_elements.append(subelem.text)
|
leaf_elements.append(subelem.text)
|
||||||
|
|
||||||
return "\n".join(leaf_elements) # type: ignore
|
return leaf_elements
|
||||||
|
|
||||||
|
|
||||||
@process_metadata()
|
@process_metadata()
|
||||||
@ -53,13 +61,12 @@ def get_leaf_elements(
|
|||||||
def partition_xml(
|
def partition_xml(
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
||||||
|
text: Optional[str] = None,
|
||||||
xml_keep_tags: bool = False,
|
xml_keep_tags: bool = False,
|
||||||
xml_path: str = ".",
|
xml_path: str = ".",
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
encoding: Optional[str] = None,
|
encoding: Optional[str] = None,
|
||||||
max_partition: Optional[int] = 1500,
|
|
||||||
min_partition: Optional[int] = 0,
|
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
@ -71,6 +78,8 @@ def partition_xml(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
|
text
|
||||||
|
The text of the XML file
|
||||||
xml_keep_tags
|
xml_keep_tags
|
||||||
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
||||||
the text from within the tags.
|
the text from within the tags.
|
||||||
@ -81,15 +90,26 @@ def partition_xml(
|
|||||||
include_metadata
|
include_metadata
|
||||||
Determines whether or not metadata is included in the metadata attribute on the
|
Determines whether or not metadata is included in the metadata attribute on the
|
||||||
elements in the output.
|
elements in the output.
|
||||||
max_partition
|
|
||||||
The maximum number of characters to include in a partition. If None is passed,
|
|
||||||
no maximum is applied.
|
|
||||||
min_partition
|
|
||||||
The minimum number of characters to include in a partition.
|
|
||||||
metadata_last_modified
|
metadata_last_modified
|
||||||
The day of the last modification
|
The day of the last modification
|
||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file, text=text)
|
||||||
|
elements: List[Element] = []
|
||||||
|
|
||||||
|
last_modification_date = None
|
||||||
|
if filename:
|
||||||
|
last_modification_date = get_last_modified_date(filename)
|
||||||
|
elif file:
|
||||||
|
last_modification_date = get_last_modified_date_from_file(file)
|
||||||
|
|
||||||
|
metadata = (
|
||||||
|
ElementMetadata(
|
||||||
|
filename=metadata_filename or filename,
|
||||||
|
last_modified=metadata_last_modified or last_modification_date,
|
||||||
|
)
|
||||||
|
if include_metadata
|
||||||
|
else ElementMetadata()
|
||||||
|
)
|
||||||
|
|
||||||
if xml_keep_tags:
|
if xml_keep_tags:
|
||||||
if filename:
|
if filename:
|
||||||
@ -99,24 +119,24 @@ def partition_xml(
|
|||||||
cast(Union[BinaryIO, SpooledTemporaryFile], file),
|
cast(Union[BinaryIO, SpooledTemporaryFile], file),
|
||||||
)
|
)
|
||||||
_, raw_text = read_txt_file(file=f, encoding=encoding)
|
_, raw_text = read_txt_file(file=f, encoding=encoding)
|
||||||
else:
|
elif text:
|
||||||
raise ValueError("Either 'filename' or 'file' must be provided.")
|
raw_text = text
|
||||||
else:
|
|
||||||
raw_text = get_leaf_elements(filename=filename, file=file, xml_path=xml_path)
|
|
||||||
|
|
||||||
last_modification_date = None
|
elements = [
|
||||||
if filename:
|
Text(text=raw_text, metadata=metadata),
|
||||||
last_modification_date = get_last_modified_date(filename)
|
]
|
||||||
elif file:
|
|
||||||
last_modification_date = get_last_modified_date_from_file(file)
|
|
||||||
|
|
||||||
elements = partition_text(
|
else:
|
||||||
text=raw_text,
|
leaf_elements = get_leaf_elements(
|
||||||
metadata_filename=metadata_filename,
|
filename=filename,
|
||||||
include_metadata=include_metadata,
|
file=file,
|
||||||
max_partition=max_partition,
|
text=text,
|
||||||
min_partition=min_partition,
|
xml_path=xml_path,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
|
||||||
)
|
)
|
||||||
|
for leaf_element in leaf_elements:
|
||||||
|
if leaf_element:
|
||||||
|
element = element_from_text(leaf_element)
|
||||||
|
element.metadata = metadata
|
||||||
|
elements.append(element)
|
||||||
|
|
||||||
return elements
|
return elements
|
||||||
|
Loading…
x
Reference in New Issue
Block a user