feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
This commit is contained in:
kravetsmic 2023-06-27 21:52:39 +03:00 committed by GitHub
parent 078e2aa116
commit 58e988e110
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 8579 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.7.10-dev0
## 0.7.10-dev1
### Enhancements
@ -8,6 +8,8 @@
### Fixes
* Fix pre tag parsing for `partition_html`
## 0.7.9
### Enhancements
@ -198,7 +200,7 @@
### Enhancements
* XLS support from auto partiton
* XLS support from auto partition
### Features

File diff suppressed because it is too large Load Diff

View File

@ -775,6 +775,17 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
assert elements[0].metadata.filetype == "text/csv"
def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
    """Run auto `partition` on the fake-html-pre.htm example and check its first element."""
    parsed = partition(filename=filename)

    # The document must yield at least one element, and no page breaks.
    assert len(parsed) > 0
    assert PageBreak() not in parsed

    # The first element is expected to be the bracketed public-law heading, typed as a Title.
    first = parsed[0]
    assert clean_extra_whitespace(first.text) == "[107th Congress Public Law 56]"
    assert isinstance(first, Title)

    # Metadata should report the HTML filetype and the bare file name.
    first_metadata = first.metadata
    assert first_metadata.filetype == "text/html"
    assert first_metadata.filename == "fake-html-pre.htm"
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
assert partition(filename=filename) == []

View File

@ -6,6 +6,7 @@ import pytest
import requests
from requests.models import Response
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import PageBreak, Title
from unstructured.partition.html import partition_html
@ -263,3 +264,15 @@ def test_partition_html_can_turn_off_assemble_articles():
"""
elements = partition_html(text=html_text, html_assemble_articles=False)
assert elements[-1] == Title("This is outside of the article.")
def test_partition_html_with_pre_tag():
    """Run `partition_html` on the fake-html-pre.htm example and check its first element."""
    # Resolve the example-docs directory relative to this test module, then the file in it.
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(example_docs, "fake-html-pre.htm")
    parsed = partition_html(filename=html_path)

    # The document must yield at least one element, and no page breaks.
    assert len(parsed) > 0
    assert PageBreak() not in parsed

    # The first element is expected to be the bracketed public-law heading, typed as a Title.
    first = parsed[0]
    assert clean_extra_whitespace(first.text) == "[107th Congress Public Law 56]"
    assert isinstance(first, Title)

    # Metadata should report the HTML filetype and the bare file name.
    first_metadata = first.metadata
    assert first_metadata.filetype == "text/html"
    assert first_metadata.filename == "fake-html-pre.htm"

View File

@ -58,5 +58,235 @@
"page_number": 1
},
"text": "Downloadify Invoke Script For This Page"
},
{
"type": "Title",
"element_id": "a6f18a30c8de3b1436133823a93f50db",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
},
"text": "Downloadify.create('downloadify',{"
},
{
"type": "Title",
"element_id": "edbfc1f9e429bd016ab71cd365adad8a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
},
"text": "filename: function(){"
},
{
"type": "Title",
"element_id": "fff8b2c3a9b06101fe7f64eef5c4ab2d",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
},
"text": "return document.getElementById('filename').value;"
},
{
"type": "UncategorizedText",
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
},
"text": "},"
},
{
"type": "Title",
"element_id": "877efc801047c9b9b245c2febc5c4cf4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 6
},
"text": "data: function(){"
},
{
"type": "Title",
"element_id": "7a384fc4bff83eec4ba849c592f38de2",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 7
},
"text": "return document.getElementById('data').value;"
},
{
"type": "UncategorizedText",
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 8
},
"text": "},"
},
{
"type": "Title",
"element_id": "91a68ed0c4ad18401f490396c820c497",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 9
},
"text": "onComplete: function(){"
},
{
"type": "Title",
"element_id": "a9b9ab32700c5a217e4f3f544e35fb43",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 10
},
"text": "alert('Your File Has Been Saved!');"
},
{
"type": "UncategorizedText",
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 11
},
"text": "},"
},
{
"type": "Title",
"element_id": "d4dd03a884778f7100dc6d5bbbf8b5b4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 12
},
"text": "onCancel: function(){"
},
{
"type": "NarrativeText",
"element_id": "96c9e5794fbc5652b2da41753be3401f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 13
},
"text": "alert('You have cancelled the saving of this file.');"
},
{
"type": "UncategorizedText",
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 14
},
"text": "},"
},
{
"type": "Title",
"element_id": "c7f068e18ced43a5a5b566bb3139be83",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 15
},
"text": "onError: function(){"
},
{
"type": "NarrativeText",
"element_id": "4ce0312c6b9cefd778c77dcda1daa357",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 16
},
"text": "alert('You must put something in the File Contents or there will be nothing to save!');"
},
{
"type": "UncategorizedText",
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 17
},
"text": "},"
},
{
"type": "UncategorizedText",
"element_id": "4e6cb015a10ef85a94cbf38f0736c963",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 18
},
"text": "swf: 'media/downloadify.swf',"
},
{
"type": "UncategorizedText",
"element_id": "547c1eea609aae64271813c3cc061d03",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 19
},
"text": "downloadImage: 'images/download.png',"
},
{
"type": "UncategorizedText",
"element_id": "08f092710daeddd051b6c9ed12f8a77d",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 20
},
"text": "width: 100,"
},
{
"type": "UncategorizedText",
"element_id": "4664ad91130a49e27fc2f874b5d08a68",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 21
},
"text": "height: 30,"
},
{
"type": "UncategorizedText",
"element_id": "577ebe0897958e450d22132ba908c640",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 22
},
"text": "transparent: true,"
},
{
"type": "Title",
"element_id": "e68f9c269ffa29b81d6ec9a8ebe47817",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 23
},
"text": "append: false"
},
{
"type": "UncategorizedText",
"element_id": "29576b54e255e3c948eea5b5904fa38b",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 24
},
"text": "});"
}
]

View File

@ -1 +1 @@
__version__ = "0.7.10-dev0" # pragma: no cover
__version__ = "0.7.10-dev1" # pragma: no cover

View File

@ -5,6 +5,10 @@ from lxml import etree
from unstructured.documents.base import Document, Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
element_from_text,
split_by_paragraph,
)
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@ -67,6 +71,7 @@ class XMLDocument(Document):
document_tree = etree.fromstring(content, self.parser)
if document_tree is None:
raise ValueError("document_tree is None")
# NOTE(robinson) - The following ValueError occurs with unicode strings. In that
# case, we call back to encoding the string and passing in bytes.
# ValueError: Unicode strings with encoding declaration are not supported.
@ -74,6 +79,17 @@ class XMLDocument(Document):
except ValueError:
document_tree = etree.fromstring(content.encode(), self.parser)
# Cheap textual gate before re-parsing: only look for <pre> elements when both an
# opening and a closing tag appear in the raw content. Each membership test is
# written out separately because `"a" and "b" in s` evaluates as `"a" and ("b" in s)`,
# which never tests the first string. The opening tag is matched by its `<pre`
# prefix so that tags carrying attributes (e.g. `<pre class="...">`) still qualify.
if "<pre" in content and "</pre>" in content:
    tree = etree.HTML(content)
    for pre_element in tree.xpath("//pre"):
        # A <pre> with no leading text (empty, or starting with a child tag) has
        # `.text` of None or "" and contributes nothing.
        if not pre_element.text:
            continue
        # Split the preformatted text into paragraphs and append each one to the
        # main document tree as its own <span>, so it is visited with the other nodes.
        for paragraph in split_by_paragraph(pre_element.text):
            span = etree.Element("span")
            span.text = str(element_from_text(text=paragraph))
            document_tree.append(span)
if self.stylesheet:
if isinstance(self.parser, etree.HTMLParser):
logger.warning(