# pyright: reportPrivateUsage=false
import os
from pathlib import Path
import pytest
from lxml import etree
from unstructured.documents.xml import XMLDocument
FILEPATH = Path(__file__).absolute().parent
def _example_doc(*parts: str) -> str:
    """Build a path inside the ``example-docs`` directory.

    The directory is located two levels above the directory containing this
    test module (``FILEPATH``).

    Args:
        *parts: Path components below ``example-docs``.

    Returns:
        The joined path as a string.
    """
    return os.path.join(FILEPATH, "..", "..", "example-docs", *parts)


@pytest.fixture()
def sample_document():
    """Return a small SEC-style tagged document used as parser input."""
    # NOTE(review): the element markup of this literal is reconstructed from
    # what the test below queries (".//type" with text "10-K") and from the
    # text "10-K" / "Proctor & Gamble" -- confirm against the intended fixture.
    return """<SEC-DOCUMENT>
    <TYPE>10-K
    <COMPANY>Proctor & Gamble
</SEC-DOCUMENT>"""


# NOTE(review): test name inferred from the call to XMLDocument.from_string.
def test_from_string(sample_document):
    """A document built from a string exposes its elements via the document tree."""
    xml_document = XMLDocument.from_string(sample_document)
    type_tag = xml_document.document_tree.find(".//type")
    # Fail with a clear message instead of an AttributeError on ``None.text``.
    assert type_tag is not None, "expected a <type> element in the parsed document"
    assert type_tag.text.strip() == "10-K"


def test_read_with_stylesheet():
    """Reading an XML file with an XSL stylesheet yields the transformed table."""
    filename = _example_doc("factbook.xml")
    stylesheet = _example_doc("unsupported", "factbook.xsl")

    xml_document = XMLDocument.from_file(filename=filename, stylesheet=stylesheet)
    doc_tree = xml_document.document_tree
    # NOTE(robinson) - The table heading row plus one row for each of the four data items
    assert int(doc_tree.xpath("count(//tr)")) == 5
    # NOTE(robinson) - Four data elements x four attributes for each
    assert int(doc_tree.xpath("count(//td)")) == 16


def test_read_with_stylesheet_warns_with_html_parser(caplog):
    """Combining a stylesheet with an HTML parser logs a warning."""
    filename = _example_doc("factbook.xml")
    stylesheet = _example_doc("unsupported", "factbook.xsl")

    XMLDocument.from_file(filename=filename, stylesheet=stylesheet, parser=etree.HTMLParser())
    assert "WARNING" in caplog.text