mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 17:41:15 +00:00

* first pass on partition_xml * add option to keep xml tags * added tests for xml * fix filename * update filenames * remove outdated readme * add xml to auto * version and changelog * update readme and docs * pass through include_metadata * update include_metadata description * add README back in * linting, linting, linting * more linting * spooled to bytes doesn't need to be a tuple * Add tests for newly supported filetypes * Correct metadata filetype * doc typo Co-authored-by: qued <64741807+qued@users.noreply.github.com> * typo fix Co-authored-by: qued <64741807+qued@users.noreply.github.com> * typo fix Co-authored-by: qued <64741807+qued@users.noreply.github.com> * keep_xml_tags -> xml_keep_tags --------- Co-authored-by: Alan Bertl <alan@unstructured.io> Co-authored-by: qued <64741807+qued@users.noreply.github.com>
32 lines
1.2 KiB
Python
32 lines
1.2 KiB
Python
from unstructured.partition.xml import partition_xml
|
|
|
|
|
|
def test_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    """Call partition_xml with a path and xml_keep_tags=False, then check element 0.

    The first returned element must carry the text "United States" and its
    metadata must report the filename "factbook.xml".
    """
    parsed = partition_xml(filename=filename, xml_keep_tags=False)
    first = parsed[0]

    assert first.text == "United States"
    assert first.metadata.filename == "factbook.xml"
|
|
|
|
|
|
def test_partition_xml_from_file(filename="example-docs/factbook.xml"):
    """Call partition_xml with an open binary file and xml_keep_tags=False.

    The path is passed separately as metadata_filename; the first returned
    element must carry the text "United States" and its metadata must report
    the filename "factbook.xml".
    """
    with open(filename, "rb") as xml_stream:
        parsed = partition_xml(
            file=xml_stream,
            xml_keep_tags=False,
            metadata_filename=filename,
        )
    first = parsed[0]

    assert first.text == "United States"
    assert first.metadata.filename == "factbook.xml"
|
|
|
|
|
|
def test_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    """Call partition_xml with a path and xml_keep_tags=True, then check element 5.

    The element at index 5 must carry the text "<name>United States</name>"
    and its metadata must report the filename "factbook.xml".
    """
    parsed = partition_xml(filename=filename, xml_keep_tags=True)
    name_element = parsed[5]

    assert name_element.text == "<name>United States</name>"
    assert name_element.metadata.filename == "factbook.xml"
|
|
|
|
|
|
def test_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
    """Call partition_xml with an open binary file and xml_keep_tags=True.

    The path is passed separately as metadata_filename; the element at index 5
    must carry the text "<name>United States</name>" and its metadata must
    report the filename "factbook.xml".
    """
    with open(filename, "rb") as xml_stream:
        parsed = partition_xml(
            file=xml_stream,
            xml_keep_tags=True,
            metadata_filename=filename,
        )
    name_element = parsed[5]

    assert name_element.text == "<name>United States</name>"
    assert name_element.metadata.filename == "factbook.xml"
|