135 lines
5.1 KiB
Python
Raw Normal View History

from typing import List, Optional, Union
2022-06-29 14:35:19 -04:00
from lxml import etree
2022-06-29 14:35:19 -04:00
from unstructured.documents.base import Document, Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
element_from_text,
split_by_paragraph,
)
2022-06-29 14:35:19 -04:00
# Type alias for the ``parser`` argument accepted by XMLDocument: an lxml HTML parser,
# an lxml XML parser, or None (in which case XMLDocument.__init__ builds a parser itself).
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
class XMLDocument(Document):
    """Class for handling .xml documents. This class uses rules based parsing to identify
    sections of interest within the document."""

    def __init__(
        self,
        stylesheet: Optional[str] = None,
        parser: VALID_PARSERS = None,
    ):
        """Class for parsing XML documents. XML documents are parsed using lxml.

        Parameters
        ----------
        stylesheet:
            An XSLT stylesheet that can be applied to transform the XML file. It is
            handed to ``etree.parse`` when the document is read.
        parser:
            The lxml parser to use with the file. When omitted, an ``XMLParser`` is
            created if a stylesheet was given; otherwise an ``HTMLParser`` is created
            because it is more tolerant of special characters and malformed XML. Both
            default parsers are built with ``remove_comments=True``.
        """
        if not parser:
            parser = (
                etree.XMLParser(remove_comments=True)
                if stylesheet
                else etree.HTMLParser(remove_comments=True)
            )

        self.stylesheet = stylesheet
        self.parser = parser
        # Parsed lazily: filled in by the first call to _read_xml.
        self.document_tree = None
        super().__init__()

    def _read(self):
        """Build the pages for this document. Not implemented on this class; ``pages``
        calls it, so subclasses are expected to override it."""
        raise NotImplementedError

    @property
    def pages(self) -> List[Page]:
        """Gets all elements from pages in sequential order."""
        # Populate the page cache on first access, then defer to the parent property.
        if self._pages is None:
            self._pages = self._read()
        return super().pages

    def _read_xml(self, content: str):
        """Reads in an XML file and converts it to an lxml element tree object.

        The tree is parsed only once; later calls return the cached
        ``self.document_tree`` and ignore ``content``.

        Parameters
        ----------
        content:
            The raw XML/HTML text to parse.

        Returns
        -------
        The parsed tree, after the stylesheet transform if a stylesheet is configured.
        """
        # NOTE(robinson) - without the carriage return at the beginning, you get
        # output that looks like the following when you run partition_pdf
        #   'h 3 a l i g n = " c e n t e r " >'
        # The correct output is returned once you add the initial return.
        is_html_parser = isinstance(self.parser, etree.HTMLParser)
        if content and not content.startswith("\n") and is_html_parser:
            content = "\n" + content

        if self.document_tree is None:
            try:
                document_tree = etree.fromstring(content, self.parser)
                if document_tree is None:
                    # Raised on purpose so the bytes-based fallback below is attempted.
                    raise ValueError("document_tree is None")

            # NOTE(robinson) - The following ValueError occurs with unicode strings. In that
            # case, we call back to encoding the string and passing in bytes.
            #     ValueError: Unicode strings with encoding declaration are not supported.
            #     Please use bytes input or XML fragments without declaration.
            except ValueError:
                document_tree = etree.fromstring(content.encode(), self.parser)

            # Both the opening and the closing tag must be present. The previous
            # expression `"<pre>" and "</pre>" in content` only ever tested the closing
            # tag, because a non-empty string literal is always truthy. "<pre" (no ">")
            # is used so that opening tags carrying attributes still match.
            if "<pre" in content and "</pre>" in content:
                tree = etree.HTML(content)
                for pre_element in tree.xpath("//pre"):
                    if not pre_element.text:
                        continue
                    # Each paragraph of preformatted text is appended to the document
                    # root as its own <span>.
                    for text in split_by_paragraph(pre_element.text):
                        span = etree.Element("span")
                        span.text = str(element_from_text(text=text))
                        document_tree.append(span)

            if self.stylesheet:
                if isinstance(self.parser, etree.HTMLParser):
                    logger.warning(
                        "You are using the HTML parser with an XSLT stylesheet. "
                        "Stylesheets are more commonly parsed with the "
                        "XMLParser. If your HTML does not display properly, try "
                        "`import lxml.etree as etree` and setting "
                        "`parser=etree.XMLParser()` instead.",
                    )
                xslt = etree.parse(self.stylesheet)
                transform = etree.XSLT(xslt)
                document_tree = transform(document_tree)

            self.document_tree = document_tree

        return self.document_tree

    @classmethod
    def from_string(
        cls,
        text: str,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        **kwargs,
    ):
        """Supports reading in an XML file as a raw string rather than as a file.

        Parameters
        ----------
        text:
            The raw XML/HTML text of the document.
        parser:
            The lxml parser to use; forwarded to the constructor.
        stylesheet:
            An XSLT stylesheet to apply; forwarded to the constructor.
        **kwargs:
            Extra keyword arguments forwarded unchanged to the constructor of ``cls``.

        Returns
        -------
        A new instance of ``cls`` whose ``document_tree`` has already been parsed.
        """
        logger.info("Reading document from string ...")
        doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
        doc._read_xml(text)
        return doc

    @classmethod
    def from_file(
        cls,
        filename,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ):
        """Reads the text of ``filename`` and builds the document via ``from_string``.

        Parameters
        ----------
        filename:
            The file to read; passed to ``read_txt_file``.
        parser:
            The lxml parser to use; forwarded to ``from_string``.
        stylesheet:
            An XSLT stylesheet to apply; forwarded to ``from_string``.
        encoding:
            The text encoding passed to ``read_txt_file``.
        **kwargs:
            Extra keyword arguments forwarded unchanged to ``from_string``.
        """
        # read_txt_file returns a pair; only its second item (the text) is used here.
        _, content = read_txt_file(filename=filename, encoding=encoding)
        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)