unstructured/test_unstructured/partition/test_xml_partition.py
Christine Straub 547bb38d86
fix: encoding/decoding error with default utf-8 encoding for html, xml, and auto (#660)
Add functionality to try other common encodings for HTML and XML files if an encoding-related error is raised and the user has not specified an encoding.

Change auto.py to have a None default for encoding

Remove the unused parameter encoding from partition_pdf

Add functionality to the read_txt_file utility function to handle a file-like object obtained from a URL
2023-06-05 11:27:12 -07:00

110 lines
3.7 KiB
Python

import os
import pathlib
import pytest
from unstructured.partition.xml import partition_xml
# Resolved path of the directory that contains this test module; the tests below
# build example-document paths relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename(filename):
    """Partition an example XML document given by path, with tags stripped.

    Checks the text of the first returned element and that its metadata
    filename equals the parametrized file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    parsed = partition_xml(filename=xml_path, xml_keep_tags=False)

    first = parsed[0]
    assert first.text == "United States"
    assert first.metadata.filename == filename
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file(filename):
    """Partition an example XML document passed as a file object.

    The file is opened with ``open()`` defaults (text mode). The full path is
    supplied as ``metadata_filename`` and the test expects the element metadata
    to report the parametrized file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path) as xml_file:
        parsed = partition_xml(
            file=xml_file,
            xml_keep_tags=False,
            metadata_filename=xml_path,
        )

    first = parsed[0]
    assert first.text == "United States"
    assert first.metadata.filename == filename
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb(filename):
    """Partition an example XML document passed as a binary file object.

    Same expectations as the text-mode variant, but the file is opened with
    mode ``"rb"`` so ``partition_xml`` receives bytes.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path, "rb") as raw_file:
        parsed = partition_xml(
            file=raw_file,
            xml_keep_tags=False,
            metadata_filename=xml_path,
        )

    first = parsed[0]
    assert first.text == "United States"
    assert first.metadata.filename == filename
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename_with_tags_default_encoding(filename):
    """Partition by path while keeping XML tags and passing no encoding.

    Checks that the element at index 5 carries the tagged country name and
    that its metadata filename equals the parametrized file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    parsed = partition_xml(filename=xml_path, xml_keep_tags=True)

    tagged = parsed[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == filename
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_filename_with_tags_raises_encoding_error(filename, encoding, error):
    """Expect an error when an explicit encoding is passed for a path-based partition.

    Args:
        filename: Example document to partition.
        encoding: Encoding passed explicitly to ``partition_xml``.
        error: Exception type the call is expected to raise.
    """
    # Build the path outside the ``pytest.raises`` block so that only the
    # ``partition_xml`` call itself can satisfy the expected-exception check.
    file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with pytest.raises(error):
        partition_xml(filename=file_path, xml_keep_tags=True, encoding=encoding)
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_with_tags_default_encoding(filename):
    """Partition a file object while keeping XML tags and passing no encoding.

    The file is opened with ``open()`` defaults (text mode). The element at
    index 5 is expected to hold the tagged country name, and its metadata
    filename is expected to equal the parametrized file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path) as xml_file:
        parsed = partition_xml(
            file=xml_file,
            xml_keep_tags=True,
            metadata_filename=xml_path,
        )

    tagged = parsed[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == filename
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
    """Partition a binary file object while keeping XML tags and passing no encoding.

    The file is opened with mode ``"rb"``. The element at index 5 is expected
    to hold the tagged country name, and its metadata filename is expected to
    equal the parametrized file name.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(xml_path, "rb") as raw_file:
        parsed = partition_xml(
            file=raw_file,
            xml_keep_tags=True,
            metadata_filename=xml_path,
        )

    tagged = parsed[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == filename
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_file_rb_with_tags_raises_encoding_error(filename, encoding, error):
    """Expect an error when an explicit encoding is passed for a binary file object.

    The example document is opened with mode ``"rb"`` and handed to
    ``partition_xml`` together with the parametrized ``encoding``; the call is
    expected to raise ``error``.
    """
    xml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # Same manager order as a combined ``with``: the raises-context is entered
    # first, then the file is opened inside it.
    with pytest.raises(error):
        with open(xml_path, "rb") as raw_file:
            partition_xml(
                file=raw_file,
                xml_keep_tags=True,
                metadata_filename=xml_path,
                encoding=encoding,
            )