unstructured/test_unstructured/partition/test_xml_partition.py
Matt Robinson 23ff32cc42
feat: add partition_xml for XML files (#596)
* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesn't need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
2023-05-18 15:40:12 +00:00

32 lines
1.2 KiB
Python

from unstructured.partition.xml import partition_xml
def test_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    """Partition the sample XML document by path with tags stripped.

    Checks that the first element's text is "United States" and that its
    metadata filename is "factbook.xml" (no "example-docs/" prefix).
    """
    elements = partition_xml(filename=filename, xml_keep_tags=False)

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == "factbook.xml"
def test_partition_xml_from_file(filename="example-docs/factbook.xml"):
    """Partition the sample XML document from an open binary file handle.

    The path is passed separately as ``metadata_filename``. Checks that the
    first element's text is "United States" and that its metadata filename
    is "factbook.xml" (no "example-docs/" prefix).
    """
    # Only the partition call needs the handle; the context manager closes
    # the file before the assertions run.
    with open(filename, "rb") as f:
        elements = partition_xml(
            file=f,
            xml_keep_tags=False,
            metadata_filename=filename,
        )

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == "factbook.xml"
def test_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    """Partition the sample XML document by path with ``xml_keep_tags=True``.

    Checks that the element at index 5 has the text
    "<name>United States</name>" (markup retained) and that its metadata
    filename is "factbook.xml" (no "example-docs/" prefix).
    """
    elements = partition_xml(filename=filename, xml_keep_tags=True)

    tagged = elements[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == "factbook.xml"
def test_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
    """Partition the sample XML from an open file with ``xml_keep_tags=True``.

    The path is passed separately as ``metadata_filename``. Checks that the
    element at index 5 has the text "<name>United States</name>" (markup
    retained) and that its metadata filename is "factbook.xml".
    """
    # Only the partition call needs the handle; the context manager closes
    # the file before the assertions run.
    with open(filename, "rb") as f:
        elements = partition_xml(
            file=f,
            xml_keep_tags=True,
            metadata_filename=filename,
        )

    tagged = elements[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == "factbook.xml"