mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

* feature(html partition): parse pre tag * chore: update CHANGELOG.md * style: black format xml.py * Added tests for html with pre tag * remove skip test, update parse pre tag * fix style * chore: spell check * chore: update changelog & version * chore: update ingest test fixtures * chore: add exception handling if `element.text` is `None` in `_read_xml` * test: add more sanity testing on the `.text` content of the element(s) * refactor: move the conditional logic for <pre> outside of the `try/except` block --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: christinestraub <christinemstraub@gmail.com>
135 lines
5.1 KiB
Python
135 lines
5.1 KiB
Python
from typing import List, Optional, Union
|
|
|
|
from lxml import etree
|
|
|
|
from unstructured.documents.base import Document, Page
|
|
from unstructured.file_utils.encoding import read_txt_file
|
|
from unstructured.logger import logger
|
|
from unstructured.partition.text import (
|
|
element_from_text,
|
|
split_by_paragraph,
|
|
)
|
|
|
|
# Parser types accepted by XMLDocument; None lets XMLDocument.__init__ pick a default parser.
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
|
|
|
|
|
|
class XMLDocument(Document):
    """Class for handling .xml documents. This class uses rules based parsing to identify
    sections of interest within the document."""

    def __init__(
        self,
        stylesheet: Optional[str] = None,
        parser: VALID_PARSERS = None,
    ):
        """Class for parsing XML documents. XML documents are parsed using lxml.

        Parameters
        ----------
        stylesheet:
            An XSLT stylesheet that can be applied to transform the XML file. The value
            is handed to `etree.parse` when the document is read.
        parser:
            The lxml parser to use with the file. When no parser is given, an HTMLParser
            is used unless a stylesheet is supplied, because the HTML parser is more
            tolerant of special characters and malformed XML. With a stylesheet the
            default is an XMLParser. Both defaults are created with remove_comments=True.
        """
        if not parser:
            parser = (
                etree.XMLParser(remove_comments=True)
                if stylesheet
                else etree.HTMLParser(remove_comments=True)
            )

        self.stylesheet = stylesheet
        self.parser = parser
        # Populated on the first call to _read_xml and reused afterwards.
        self.document_tree = None
        super().__init__()

    def _read(self):
        """Builds the pages of the document. Not implemented on this class."""
        # NOTE(review): presumably concrete subclasses override this - confirm.
        raise NotImplementedError

    @property
    def pages(self) -> List[Page]:
        """Gets all elements from pages in sequential order. The pages are read lazily
        through `_read` the first time they are requested."""
        if self._pages is None:
            self._pages = self._read()
        return super().pages

    def _read_xml(self, content):
        """Reads in an XML file and converts it to an lxml element tree object.

        The tree is parsed once and stored on `self.document_tree`; later calls return the
        stored tree and ignore `content`.

        Parameters
        ----------
        content:
            The raw document text.

        Returns
        -------
        The parsed tree, or the result of the XSLT transform when a stylesheet is set.
        """
        # NOTE(robinson) - without the carriage return at the beginning, you get
        # output that looks like the following when you run partition_pdf
        #   'h   3       a   l   i   g   n   =   "   c   e   n   t   e   r   "   >'
        # The correct output is returned once you add the initial return.
        is_html_parser = isinstance(self.parser, etree.HTMLParser)
        if content and not content.startswith("\n") and is_html_parser:
            content = "\n" + content

        if self.document_tree is not None:
            return self.document_tree

        try:
            document_tree = etree.fromstring(content, self.parser)
            if document_tree is None:
                raise ValueError("document_tree is None")

        # NOTE(robinson) - The following ValueError occurs with unicode strings. In that
        # case, we call back to encoding the string and passing in bytes.
        #     ValueError: Unicode strings with encoding declaration are not supported.
        #     Please use bytes input or XML fragments without declaration.
        except ValueError:
            document_tree = etree.fromstring(content.encode(), self.parser)

        # Cheap pre-filter before re-parsing the document as HTML. Each tag needs its own
        # `in` test: `"<pre>" and "</pre>" in content` only ever checked the closing tag.
        # "<pre" (no ">") also matches opening tags that carry attributes.
        if "<pre" in content and "</pre>" in content:
            # Same bytes fallback as above, so that a unicode string with an encoding
            # declaration does not fail here after the main parse has recovered from it.
            try:
                pre_tree = etree.HTML(content)
            except ValueError:
                pre_tree = etree.HTML(content.encode())

            for pre in pre_tree.xpath("//pre"):
                if not pre.text:
                    continue
                # Each paragraph of the <pre> text is appended to the root of the main
                # tree as its own <span>.
                for paragraph in split_by_paragraph(pre.text):
                    span = etree.Element("span")
                    span.text = str(element_from_text(text=paragraph))
                    document_tree.append(span)

        if self.stylesheet:
            if is_html_parser:
                logger.warning(
                    "You are using the HTML parser with an XSLT stylesheet. "
                    "Stylesheets are more commonly parsed with the "
                    "XMLParser. If your HTML does not display properly, try "
                    "`import lxml.etree as etree` and setting "
                    "`parser=etree.XMLParser()` instead.",
                )
            xslt = etree.parse(self.stylesheet)
            transform = etree.XSLT(xslt)
            document_tree = transform(document_tree)

        self.document_tree = document_tree
        return self.document_tree

    @classmethod
    def from_string(
        cls,
        text: str,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        **kwargs,
    ):
        """Supports reading in an XML file as a raw string rather than as a file.

        Extra keyword arguments are forwarded to the class constructor. The document is
        parsed immediately, so the returned instance already has `document_tree` set.
        """
        logger.info("Reading document from string ...")
        doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
        doc._read_xml(text)
        return doc

    @classmethod
    def from_file(
        cls,
        filename,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ):
        """Reads the file with `read_txt_file` and builds the document via `from_string`.

        `filename` and `encoding` are passed to `read_txt_file`; the remaining arguments
        are passed on to `from_string`.
        """
        # Only the second item returned by read_txt_file (the content) is used here.
        _, content = read_txt_file(filename=filename, encoding=encoding)
        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|