unstructured/unstructured/documents/xml.py

from typing import List, Optional, Union

from lxml import etree

from unstructured.documents.base import Document, Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
    element_from_text,
    split_by_paragraph,
)

# Parser types accepted by XMLDocument. None means "no parser supplied", in which case
# XMLDocument.__init__ picks a default parser itself.
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]


class XMLDocument(Document):
    """Class for handling .xml documents. This class uses rules based parsing to identify
    sections of interest within the document."""

    def __init__(
        self,
        stylesheet: Optional[str] = None,
        parser: VALID_PARSERS = None,
    ):
        """Class for parsing XML documents. XML documents are parsed using lxml.

        Parameters
        ----------
        stylesheet:
            An XSLT stylesheet that can be applied to transform the XML file. The value
            is handed to ``etree.parse`` when the document is read.
        parser:
            The lxml parser to use with the file. When no parser is given, the XMLParser
            is used if a stylesheet is supplied; otherwise the HTMLParser is used because
            it is more tolerant of special characters and malformed XML. Both default
            parsers are created with ``remove_comments=True``.
        """
        if not parser:
            parser = (
                etree.XMLParser(remove_comments=True)
                if stylesheet
                else etree.HTMLParser(remove_comments=True)
            )

        self.stylesheet = stylesheet
        self.parser = parser
        # Cache for the parsed tree; populated by the first call to _read_xml.
        self.document_tree = None
        super().__init__()

    def _read(self):
        """Builds the pages for the document. Subclasses must provide this."""
        raise NotImplementedError

    @property
    def pages(self) -> List[Page]:
        """Gets all elements from pages in sequential order."""
        # Pages are produced lazily: _read only runs the first time they are requested.
        if self._pages is None:
            self._pages = self._read()
        return super().pages

    def _read_xml(self, content: str):
        """Reads in an XML string and converts it to an lxml element tree object.

        The result is cached on ``self.document_tree``; once a tree has been stored,
        later calls return it unchanged and ignore ``content``.

        Parameters
        ----------
        content:
            The raw XML/HTML markup to parse.

        Returns
        -------
        The parsed tree, or the output of the XSLT transform when a stylesheet is set.
        """
        if self.document_tree is not None:
            return self.document_tree

        # NOTE(robinson) - without a newline at the beginning, the HTML parser can return
        # garbled output that looks like the following
        #   'h   3       a   l   i   g   n   =   "   c   e   n   t   e   r   "   >'
        # The correct output is returned once the leading newline is added.
        is_html_parser = isinstance(self.parser, etree.HTMLParser)
        if content and not content.startswith("\n") and is_html_parser:
            content = "\n" + content

        try:
            document_tree = etree.fromstring(content, self.parser)
            if document_tree is None:
                # Raised on purpose so that an empty result also goes through the bytes
                # fallback below.
                raise ValueError("document_tree is None")

        # NOTE(robinson) - The following ValueError occurs with unicode strings. In that
        # case, we fall back to encoding the string and passing in bytes.
        #     ValueError: Unicode strings with encoding declaration are not supported.
        #     Please use  bytes input or XML fragments without declaration.
        except ValueError:
            document_tree = etree.fromstring(content.encode(), self.parser)

        # Each membership test has to be spelled out: `"<pre>" and "</pre>" in content`
        # only ever checked for the closing tag because a non-empty literal is truthy.
        # "<pre" (no closing bracket) keeps tags with attributes, e.g. <pre class="x">.
        if "<pre" in content and "</pre>" in content:
            try:
                html_tree = etree.HTML(content)
            except ValueError:
                # NOTE(review): assumed to be the same encoding-declaration error that
                # the main parse above guards against - confirm for the lxml in use.
                html_tree = etree.HTML(content.encode())

            for pre_element in html_tree.xpath("//pre"):
                if not pre_element.text:
                    continue
                # Every paragraph inside the <pre> becomes its own <span> appended to
                # the root of the parsed document.
                for paragraph in split_by_paragraph(pre_element.text):
                    span = etree.Element("span")
                    span.text = str(element_from_text(text=paragraph))
                    document_tree.append(span)

        if self.stylesheet:
            if is_html_parser:
                logger.warning(
                    "You are using the HTML parser with an XSLT stylesheet. "
                    "Stylesheets are more commonly parsed with the "
                    "XMLParser. If your HTML does not display properly, try "
                    "`import lxml.etree as etree` and setting "
                    "`parser=etree.XMLParser()` instead.",
                )
            xslt = etree.parse(self.stylesheet)
            transform = etree.XSLT(xslt)
            document_tree = transform(document_tree)

        self.document_tree = document_tree
        return self.document_tree

    @classmethod
    def from_string(
        cls,
        text: str,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        **kwargs,
    ):
        """Supports reading in an XML file as a raw string rather than as a file.

        ``parser``, ``stylesheet`` and any extra keyword arguments are forwarded to the
        class constructor. The string is parsed immediately and the resulting document
        instance is returned.
        """
        logger.info("Reading document from string ...")
        doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
        doc._read_xml(text)
        return doc

    @classmethod
    def from_file(
        cls,
        filename,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ):
        """Reads the text of ``filename`` and builds the document via ``from_string``.

        ``encoding`` is passed to ``read_txt_file``; the remaining arguments are
        forwarded to ``from_string`` unchanged.
        """
        # read_txt_file returns a pair; only the second item (the text) is needed here.
        _, content = read_txt_file(filename=filename, encoding=encoding)
        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`from typing import List, Optional, Union`
Initial Release 2022-06-29 14:35:19 -04:00
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`from lxml import etree`
Initial Release 2022-06-29 14:35:19 -04:00
fix: move _read out of base Document class Changed where _read sits in the inheritance structure since PDFDocument doesn't really need lazy document processing 2022-11-14 13:34:42 -06:00			`from unstructured.documents.base import Document, Page`
fix: encoding/decoding error with default utf-8 encoding for html, xml, and auto (#660) Add functionality to try other common encodings for html, xml files if an error related to the encoding is raised and the user has not specified an encoding. Change auto.py to have a None default for encoding Remove the unused parameter encoding from partition_pdf Add functionality to the read_txt_file utility function to handle file-like object from URL 2023-06-05 11:27:12 -07:00			`from unstructured.file_utils.encoding import read_txt_file`
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`from unstructured.logger import logger`
feature(html partition): parse pre tag (#642) * feature(html partition): parse pre tag * chore: update CHANGELOG.md * style: black format xml.py * Added tests dor html with pre tag * remove skip test, update parse pre tag * fix style * chore: spell check * chore: update changelog & version * chore: update ingest test fixtures * chore: add exception handling if `element.text` is `None` in `_read_xml` * test: add more sanity testing on the `.text` content of the element(s) * refactor: move the conditional logic for <pre> outside of the `try/except` block --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: christinestraub <christinemstraub@gmail.com> 2023-06-27 21:52:39 +03:00			`from unstructured.partition.text import (`
			`element_from_text,`
			`split_by_paragraph,`
			`)`
Initial Release 2022-06-29 14:35:19 -04:00
			`VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]`


			`class XMLDocument(Document):`
			`"""Class for handling .xml documents. This class uses rules based parsing to identify`
			`sections of interest within the document."""`

			`def __init__(`
			`self,`
			`stylesheet: Optional[str] = None,`
			`parser: VALID_PARSERS = None,`
			`):`
			`"""Class for parsing XML documents. XML documents are parsed using lxml.`

			`Parameters`
			`----------`
			`filename:`
			`The name of the XML file to read`
			`stylesheet:`
			`An XLST stylesheet that can be applied to transform the XML file`
			`parser:`
			`The lxml parser to use with the file. The HTML parser is used by default`
			`because it is more tolerant of special characters and malformed XML. If you`
			`are using a stylesheet, you likely want the XMLParser.`
			`"""`
			`if not parser:`
fix: remove comments when parsing XML or HTML (#210) * Update xml.py remove comments while parsing * change logged in CHANGLOG and editted version * make tidy * editted version * new version 0.4.8-dev1 * editted version * Update CHANGELOG.md Co-authored-by: cragwolfe <crag@unstructuredai.io> --------- Co-authored-by: cragwolfe <crag@unstructuredai.io> 2023-02-11 02:52:13 +09:00			`parser = (`
			`etree.XMLParser(remove_comments=True)`
			`if stylesheet`
			`else etree.HTMLParser(remove_comments=True)`
			`)`
Initial Release 2022-06-29 14:35:19 -04:00
			`self.stylesheet = stylesheet`
			`self.parser = parser`
			`self.document_tree = None`
			`super().__init__()`

			`def _read(self):`
			`raise NotImplementedError`

fix: move _read out of base Document class Changed where _read sits in the inheritance structure since PDFDocument doesn't really need lazy document processing 2022-11-14 13:34:42 -06:00			`@property`
			`def pages(self) -> List[Page]:`
			`"""Gets all elements from pages in sequential order."""`
			`if self._pages is None:`
			`self._pages = self._read()`
			`return super().pages`

Initial Release 2022-06-29 14:35:19 -04:00			`def _read_xml(self, content):`
			`"""Reads in an XML file and converts it to an lxml element tree object."""`
fix: updates markdown code to process markdown with embedded html (#480) * add carriage return to html if missing * test on markdown with embedded html * changelog and version * check for html parser * linting, linting, linting 2023-04-13 12:47:45 -04:00			`# NOTE(robinson) - without the carriage return at the beginning, you get`
			`# output that looks like the following when you run partition_pdf`
			`# 'h 3 a l i g n = " c e n t e r " >'`
			`# The correct output is returned once you add the initial return.`
			`is_html_parser = isinstance(self.parser, etree.HTMLParser)`
			`if content and not content.startswith("\n") and is_html_parser:`
			`content = "\n" + content`
Initial Release 2022-06-29 14:35:19 -04:00			`if self.document_tree is None:`
fix: fix html encoding to support foreign characters (#452) * fix: fix html encoding to support foreign characters * version and changelog 2023-04-05 16:18:54 -04:00			`try:`
			`document_tree = etree.fromstring(content, self.parser)`
fix: encode xml string if document_tree is `None` in `_read_xml` (#477) * fix: encode xml string if document_tree is `None` in `_read_xml` * don't encode text in test 2023-04-13 14:09:58 +01:00			`if document_tree is None:`
			`raise ValueError("document_tree is None")`
feature(html partition): parse pre tag (#642) * feature(html partition): parse pre tag * chore: update CHANGELOG.md * style: black format xml.py * Added tests dor html with pre tag * remove skip test, update parse pre tag * fix style * chore: spell check * chore: update changelog & version * chore: update ingest test fixtures * chore: add exception handling if `element.text` is `None` in `_read_xml` * test: add more sanity testing on the `.text` content of the element(s) * refactor: move the conditional logic for <pre> outside of the `try/except` block --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: christinestraub <christinemstraub@gmail.com> 2023-06-27 21:52:39 +03:00
fix: fix html encoding to support foreign characters (#452) * fix: fix html encoding to support foreign characters * version and changelog 2023-04-05 16:18:54 -04:00			`# NOTE(robinson) - The following ValueError occurs with unicode strings. In that`
			`# case, we call back to encoding the string and passing in bytes.`
			`# ValueError: Unicode strings with encoding declaration are not supported.`
			`# Please use bytes input or XML fragments without declaration.`
			`except ValueError:`
			`document_tree = etree.fromstring(content.encode(), self.parser)`
Initial Release 2022-06-29 14:35:19 -04:00
feature(html partition): parse pre tag (#642) * feature(html partition): parse pre tag * chore: update CHANGELOG.md * style: black format xml.py * Added tests dor html with pre tag * remove skip test, update parse pre tag * fix style * chore: spell check * chore: update changelog & version * chore: update ingest test fixtures * chore: add exception handling if `element.text` is `None` in `_read_xml` * test: add more sanity testing on the `.text` content of the element(s) * refactor: move the conditional logic for <pre> outside of the `try/except` block --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: christinestraub <christinemstraub@gmail.com> 2023-06-27 21:52:39 +03:00			`if "<pre>" and "</pre>" in content:`
			`tree = etree.HTML(content)`
			`for element in tree.xpath("//pre"):`
			`if not element.text:`
			`continue`
			`text_content = split_by_paragraph(element.text)`
			`for text in text_content:`
			`element = etree.Element("span")`
			`element.text = str(element_from_text(text=text))`
			`document_tree.append(element)`

Initial Release 2022-06-29 14:35:19 -04:00			`if self.stylesheet:`
			`if isinstance(self.parser, etree.HTMLParser):`
			`logger.warning(`
			`"You are using the HTML parser with an XSLT stylesheet. "`
			`"Stylesheets are more commonly parsed with the "`
			`"XMLParser. If your HTML does not display properly, try "`
			"`import lxml.etree as etree` and setting "
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			"`parser=etree.XMLParser()` instead.",
Initial Release 2022-06-29 14:35:19 -04:00			`)`
			`xslt = etree.parse(self.stylesheet)`
			`transform = etree.XSLT(xslt)`
			`document_tree = transform(document_tree)`

			`self.document_tree = document_tree`

			`return self.document_tree`

			`@classmethod`
fix: enable `partition_html` to grab content outside of `<article>` tags (#772) * optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release 2023-06-20 13:07:30 -04:00			`def from_string(`
			`cls,`
			`text: str,`
			`parser: VALID_PARSERS = None,`
			`stylesheet: Optional[str] = None,`
			`**kwargs,`
			`):`
Initial Release 2022-06-29 14:35:19 -04:00			`"""Supports reading in an XML file as a raw string rather than as a file."""`
			`logger.info("Reading document from string ...")`
fix: enable `partition_html` to grab content outside of `<article>` tags (#772) * optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release 2023-06-20 13:07:30 -04:00			`doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)`
Initial Release 2022-06-29 14:35:19 -04:00			`doc._read_xml(text)`
			`return doc`

			`@classmethod`
Adding optional encoding arg, and text_partition tests (#339) 2023-03-06 15:07:33 -08:00			`def from_file(`
			`cls,`
			`filename,`
			`parser: VALID_PARSERS = None,`
			`stylesheet: Optional[str] = None,`
fix: encoding/decoding error with default utf-8 encoding for html, xml, and auto (#660) Add functionality to try other common encodings for html, xml files if an error related to the encoding is raised and the user has not specified an encoding. Change auto.py to have a None default for encoding Remove the unused parameter encoding from partition_pdf Add functionality to the read_txt_file utility function to handle file-like object from URL 2023-06-05 11:27:12 -07:00			`encoding: Optional[str] = None,`
fix: enable `partition_html` to grab content outside of `<article>` tags (#772) * optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release 2023-06-20 13:07:30 -04:00			`**kwargs,`
Adding optional encoding arg, and text_partition tests (#339) 2023-03-06 15:07:33 -08:00			`):`
fix: encoding/decoding error with default utf-8 encoding for html, xml, and auto (#660) Add functionality to try other common encodings for html, xml files if an error related to the encoding is raised and the user has not specified an encoding. Change auto.py to have a None default for encoding Remove the unused parameter encoding from partition_pdf Add functionality to the read_txt_file utility function to handle file-like object from URL 2023-06-05 11:27:12 -07:00			`_, content = read_txt_file(filename=filename, encoding=encoding)`
fix: enable `partition_html` to grab content outside of `<article>` tags (#772) * optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release 2023-06-20 13:07:30 -04:00			`return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)`