kravetsmic 58e988e110
feature(html partition): parse pre tag (#642)
* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
2023-06-27 18:52:39 +00:00

135 lines
5.1 KiB
Python

from typing import List, Optional, Union
from lxml import etree
from unstructured.documents.base import Document, Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
element_from_text,
split_by_paragraph,
)
# Accepted values for a `parser` argument: an lxml HTML parser, an lxml XML parser,
# or None when the caller does not supply one.
VALID_PARSERS = Optional[Union[etree.HTMLParser, etree.XMLParser]]
class XMLDocument(Document):
    """Class for handling .xml documents. This class uses rules based parsing to identify
    sections of interest within the document.

    The parsed lxml tree is built by `_read_xml` and cached on `document_tree`.
    Subclasses are expected to override `_read`; its result is stored in `_pages`
    the first time `pages` is accessed.
    """

    def __init__(
        self,
        stylesheet: Optional[str] = None,
        parser: VALID_PARSERS = None,
    ):
        """Class for parsing XML documents. XML documents are parsed using lxml.

        Parameters
        ----------
        stylesheet:
            An XSLT stylesheet that can be applied to transform the XML file
        parser:
            The lxml parser to use with the file. When no parser is given, an
            HTMLParser is used (it is more tolerant of special characters and
            malformed XML) unless a stylesheet is set, in which case an XMLParser
            is used. Both defaults are created with `remove_comments=True`.
        """
        if not parser:
            parser = (
                etree.XMLParser(remove_comments=True)
                if stylesheet
                else etree.HTMLParser(remove_comments=True)
            )
        self.stylesheet = stylesheet
        self.parser = parser
        # Populated lazily by `_read_xml`; None means nothing has been parsed yet.
        self.document_tree = None
        super().__init__()

    def _read(self):
        """Builds the pages for this document. Must be provided by subclasses."""
        raise NotImplementedError

    @property
    def pages(self) -> List[Page]:
        """Gets all elements from pages in sequential order."""
        if self._pages is None:
            self._pages = self._read()
        return super().pages

    def _read_xml(self, content):
        """Reads in an XML string and converts it to an lxml element tree object.

        The tree is built only on the first call and cached on `self.document_tree`;
        later calls return the cached tree and ignore `content`.

        Parameters
        ----------
        content:
            The raw XML/HTML text to parse.

        Returns
        -------
        The parsed tree, or the result of the XSLT transform when a stylesheet is set.
        """
        if self.document_tree is not None:
            return self.document_tree

        # NOTE(robinson) - without the carriage return at the beginning, you get
        # output that looks like the following when you run partition_pdf
        # 'h 3 a l i g n = " c e n t e r " >'
        # The correct output is returned once you add the initial return.
        is_html_parser = isinstance(self.parser, etree.HTMLParser)
        if content and not content.startswith("\n") and is_html_parser:
            content = "\n" + content

        try:
            document_tree = etree.fromstring(content, self.parser)
            if document_tree is None:
                raise ValueError("document_tree is None")
        # NOTE(robinson) - The following ValueError occurs with unicode strings. In that
        # case, we call back to encoding the string and passing in bytes.
        # ValueError: Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration.
        except ValueError:
            document_tree = etree.fromstring(content.encode(), self.parser)

        # Cheap pre-filter before re-parsing. Both operands must be membership tests:
        # a bare string literal is always truthy. The opening tag is matched as "<pre"
        # so that tags carrying attributes (e.g. <pre class="code">) still qualify.
        if "<pre" in content and "</pre>" in content:
            try:
                pre_tree = etree.HTML(content)
            except ValueError:
                # The re-parse may hit the same unicode/encoding-declaration ValueError
                # handled above, so fall back to bytes here as well.
                pre_tree = etree.HTML(content.encode())
            for pre in pre_tree.xpath("//pre"):
                if not pre.text:
                    continue
                # Each paragraph of the <pre> text is appended to the root of the
                # document tree as its own <span>.
                for paragraph in split_by_paragraph(pre.text):
                    span = etree.Element("span")
                    span.text = str(element_from_text(text=paragraph))
                    document_tree.append(span)

        if self.stylesheet:
            if isinstance(self.parser, etree.HTMLParser):
                logger.warning(
                    "You are using the HTML parser with an XSLT stylesheet. "
                    "Stylesheets are more commonly parsed with the "
                    "XMLParser. If your HTML does not display properly, try "
                    "`import lxml.etree as etree` and setting "
                    "`parser=etree.XMLParser()` instead.",
                )
            xslt = etree.parse(self.stylesheet)
            transform = etree.XSLT(xslt)
            document_tree = transform(document_tree)

        self.document_tree = document_tree
        return self.document_tree

    @classmethod
    def from_string(
        cls,
        text: str,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        **kwargs,
    ):
        """Supports reading in an XML file as a raw string rather than as a file."""
        logger.info("Reading document from string ...")
        doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
        # Parse eagerly so the tree is cached on the returned document.
        doc._read_xml(text)
        return doc

    @classmethod
    def from_file(
        cls,
        filename,
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ):
        """Reads `filename` with `read_txt_file` (passing `encoding` through) and builds
        the document from the resulting text via `from_string`."""
        _, content = read_txt_file(filename=filename, encoding=encoding)
        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)