mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-14 04:25:54 +00:00

Add functionality to try other common encodings for HTML and XML files if an encoding-related error is raised and the user has not specified an encoding. Change auto.py to have a None default for encoding. Remove the unused parameter encoding from partition_pdf. Add functionality to the read_txt_file utility function to handle a file-like object from a URL.
110 lines
3.7 KiB
Python
110 lines
3.7 KiB
Python
import os
|
|
import pathlib
|
|
|
|
import pytest
|
|
|
|
from unstructured.partition.xml import partition_xml
|
|
|
|
# Resolved directory that contains this test module; the tests below build
# their example-docs paths relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename(filename):
    """Partition an example XML document by path with tags stripped.

    For each parametrized example file, the first element's text must be
    "United States" and its metadata must carry the bare file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    partitioned = partition_xml(xml_keep_tags=False, filename=xml_path)
    first_element = partitioned[0]

    assert first_element.text == "United States"
    assert first_element.metadata.filename == filename
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file(filename):
    """Partition an example XML document from a text-mode file object.

    The file is opened with ``open``'s default mode and encoding. The full
    path is passed as ``metadata_filename``, and the test asserts that the
    first element's metadata reports only the bare file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path) as text_stream:
        partitioned = partition_xml(
            file=text_stream,
            metadata_filename=xml_path,
            xml_keep_tags=False,
        )

    first_element = partitioned[0]
    assert first_element.text == "United States"
    assert first_element.metadata.filename == filename
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb(filename):
    """Partition an example XML document from a binary-mode file object.

    The full path is passed as ``metadata_filename``, and the test asserts
    that the first element's text is "United States" and that its metadata
    reports only the bare file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path, "rb") as binary_stream:
        partitioned = partition_xml(
            file=binary_stream,
            metadata_filename=xml_path,
            xml_keep_tags=False,
        )

    first_element = partitioned[0]
    assert first_element.text == "United States"
    assert first_element.metadata.filename == filename
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename_with_tags_default_encoding(filename):
    """Partition by path with tags kept and no explicit encoding.

    For each parametrized example file, the element at index 5 must be the
    ``<name>`` tag wrapping "United States", with the bare file name in its
    metadata.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    partitioned = partition_xml(xml_keep_tags=True, filename=xml_path)
    name_element = partitioned[5]

    assert name_element.text == "<name>United States</name>"
    assert name_element.metadata.filename == filename
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_filename_with_tags_raises_encoding_error(filename, encoding, error):
    """Partitioning by path with an explicit, mismatched encoding must raise.

    Args:
        filename: Name of the example document under ``example-docs``.
        encoding: Encoding passed explicitly to ``partition_xml``.
        error: Exception type the call is expected to raise.
    """
    # Build the path outside the ``pytest.raises`` block so that only the
    # ``partition_xml`` call itself can satisfy the expected exception.
    file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        partition_xml(filename=file_path, xml_keep_tags=True, encoding=encoding)
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_with_tags_default_encoding(filename):
    """Partition a text-mode file object with tags kept and no explicit encoding.

    The file is opened with ``open``'s default mode and encoding. The element
    at index 5 must be the ``<name>`` tag wrapping "United States", with the
    bare file name in its metadata.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path) as text_stream:
        partitioned = partition_xml(
            file=text_stream,
            metadata_filename=xml_path,
            xml_keep_tags=True,
        )

    name_element = partitioned[5]
    assert name_element.text == "<name>United States</name>"
    assert name_element.metadata.filename == filename
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
    """Partition a binary-mode file object with tags kept and no explicit encoding.

    The element at index 5 must be the ``<name>`` tag wrapping
    "United States", with the bare file name in its metadata.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path, "rb") as binary_stream:
        partitioned = partition_xml(
            file=binary_stream,
            metadata_filename=xml_path,
            xml_keep_tags=True,
        )

    name_element = partitioned[5]
    assert name_element.text == "<name>United States</name>"
    assert name_element.metadata.filename == filename
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_file_rb_with_tags_raises_encoding_error(filename, encoding, error):
    """Partitioning a binary file object with a mismatched explicit encoding must raise.

    The parametrized case passes ``encoding`` explicitly and expects
    ``partition_xml`` to raise ``error``.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with pytest.raises(error), open(xml_path, "rb") as binary_stream:
        partition_xml(
            file=binary_stream,
            encoding=encoding,
            metadata_filename=xml_path,
            xml_keep_tags=True,
        )
|