mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feature(html partition): parse pre tag (#642)
* feature(html partition): parse pre tag * chore: update CHANGELOG.md * style: black format xml.py * Added tests for html with pre tag * remove skip test, update parse pre tag * fix style * chore: spell check * chore: update changelog & version * chore: update ingest test fixtures * chore: add exception handling if `element.text` is `None` in `_read_xml` * test: add more sanity testing on the `.text` content of the element(s) * refactor: move the conditional logic for <pre> outside of the `try/except` block --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: christinestraub <christinemstraub@gmail.com>
This commit is contained in:
parent
078e2aa116
commit
58e988e110
@ -1,4 +1,4 @@
|
||||
## 0.7.10-dev0
|
||||
## 0.7.10-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fix pre tag parsing for `partition_html`
|
||||
|
||||
## 0.7.9
|
||||
|
||||
### Enhancements
|
||||
@ -198,7 +200,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* XLS support from auto partiton
|
||||
* XLS support from auto partition
|
||||
|
||||
### Features
|
||||
|
||||
|
8304
example-docs/fake-html-pre.htm
Normal file
8304
example-docs/fake-html-pre.htm
Normal file
File diff suppressed because it is too large
Load Diff
@ -775,6 +775,17 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
||||
assert elements[0].metadata.filetype == "text/csv"
|
||||
|
||||
|
||||
def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
    """Auto-partition an HTML file whose body sits in a <pre> tag and check the first element."""
    elements = partition(filename=filename)

    # The document must yield content, and none of it may be a page break.
    assert len(elements) > 0
    assert PageBreak() not in elements

    # The leading element is expected to be the document heading, typed as a Title.
    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert clean_extra_whitespace(first_element.text) == "[107th Congress Public Law 56]"

    # File-level metadata is carried on the element.
    first_metadata = first_element.metadata
    assert first_metadata.filetype == "text/html"
    assert first_metadata.filename == "fake-html-pre.htm"
|
||||
|
||||
|
||||
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
|
||||
assert partition(filename=filename) == []
|
||||
|
||||
|
@ -6,6 +6,7 @@ import pytest
|
||||
import requests
|
||||
from requests.models import Response
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import PageBreak, Title
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
@ -263,3 +264,15 @@ def test_partition_html_can_turn_off_assemble_articles():
|
||||
"""
|
||||
elements = partition_html(text=html_text, html_assemble_articles=False)
|
||||
assert elements[-1] == Title("This is outside of the article.")
|
||||
|
||||
|
||||
def test_partition_html_with_pre_tag():
    """Partition the <pre>-based HTML fixture directly with partition_html and check the first element."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "fake-html-pre.htm")
    filename = os.path.join(*path_parts)
    elements = partition_html(filename=filename)

    # The document must yield content, and none of it may be a page break.
    assert len(elements) > 0
    assert PageBreak() not in elements

    # The leading element is expected to be the document heading, typed as a Title.
    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert clean_extra_whitespace(first_element.text) == "[107th Congress Public Law 56]"

    # File-level metadata is carried on the element.
    first_metadata = first_element.metadata
    assert first_metadata.filetype == "text/html"
    assert first_metadata.filename == "fake-html-pre.htm"
|
||||
|
@ -58,5 +58,235 @@
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Downloadify Invoke Script For This Page"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "a6f18a30c8de3b1436133823a93f50db",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "Downloadify.create('downloadify',{"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "edbfc1f9e429bd016ab71cd365adad8a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "filename: function(){"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "fff8b2c3a9b06101fe7f64eef5c4ab2d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "return document.getElementById('filename').value;"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "},"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "877efc801047c9b9b245c2febc5c4cf4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 6
|
||||
},
|
||||
"text": "data: function(){"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "7a384fc4bff83eec4ba849c592f38de2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "return document.getElementById('data').value;"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "},"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "91a68ed0c4ad18401f490396c820c497",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 9
|
||||
},
|
||||
"text": "onComplete: function(){"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "a9b9ab32700c5a217e4f3f544e35fb43",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "alert('Your File Has Been Saved!');"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 11
|
||||
},
|
||||
"text": "},"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "d4dd03a884778f7100dc6d5bbbf8b5b4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 12
|
||||
},
|
||||
"text": "onCancel: function(){"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "96c9e5794fbc5652b2da41753be3401f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 13
|
||||
},
|
||||
"text": "alert('You have cancelled the saving of this file.');"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 14
|
||||
},
|
||||
"text": "},"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "c7f068e18ced43a5a5b566bb3139be83",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 15
|
||||
},
|
||||
"text": "onError: function(){"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4ce0312c6b9cefd778c77dcda1daa357",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 16
|
||||
},
|
||||
"text": "alert('You must put something in the File Contents or there will be nothing to save!');"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e52a1813c46049513e6beb0b5c9e2aca",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 17
|
||||
},
|
||||
"text": "},"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4e6cb015a10ef85a94cbf38f0736c963",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 18
|
||||
},
|
||||
"text": "swf: 'media/downloadify.swf',"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "547c1eea609aae64271813c3cc061d03",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 19
|
||||
},
|
||||
"text": "downloadImage: 'images/download.png',"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "08f092710daeddd051b6c9ed12f8a77d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 20
|
||||
},
|
||||
"text": "width: 100,"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4664ad91130a49e27fc2f874b5d08a68",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 21
|
||||
},
|
||||
"text": "height: 30,"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "577ebe0897958e450d22132ba908c640",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 22
|
||||
},
|
||||
"text": "transparent: true,"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "e68f9c269ffa29b81d6ec9a8ebe47817",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 23
|
||||
},
|
||||
"text": "append: false"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "29576b54e255e3c948eea5b5904fa38b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 24
|
||||
},
|
||||
"text": "});"
|
||||
}
|
||||
]
|
@ -1 +1 @@
|
||||
__version__ = "0.7.10-dev0" # pragma: no cover
|
||||
__version__ = "0.7.10-dev1" # pragma: no cover
|
||||
|
@ -5,6 +5,10 @@ from lxml import etree
|
||||
from unstructured.documents.base import Document, Page
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.text import (
|
||||
element_from_text,
|
||||
split_by_paragraph,
|
||||
)
|
||||
|
||||
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
|
||||
|
||||
@ -67,6 +71,7 @@ class XMLDocument(Document):
|
||||
document_tree = etree.fromstring(content, self.parser)
|
||||
if document_tree is None:
|
||||
raise ValueError("document_tree is None")
|
||||
|
||||
# NOTE(robinson) - The following ValueError occurs with unicode strings. In that
|
||||
# case, we call back to encoding the string and passing in bytes.
|
||||
# ValueError: Unicode strings with encoding declaration are not supported.
|
||||
@ -74,6 +79,17 @@ class XMLDocument(Document):
|
||||
except ValueError:
|
||||
document_tree = etree.fromstring(content.encode(), self.parser)
|
||||
|
||||
# Surface the text of <pre> tags: re-parse the raw content as HTML, split the
# text of every <pre> element with split_by_paragraph, and append one <span>
# per resulting chunk to the root of the main document tree.
#
# Both substrings are tested explicitly. Writing `"<pre>" and "</pre>" in content`
# only tests the closing tag, because a non-empty string literal is always
# truthy and `in` binds tighter than `and`. The opening-tag check uses "<pre"
# (no ">") so that tags carrying attributes, e.g. <pre class="x">, still match.
if "<pre" in content and "</pre>" in content:
    tree = etree.HTML(content)
    for pre_element in tree.xpath("//pre"):
        # Skip <pre> elements with no leading text (None or empty string).
        if not pre_element.text:
            continue
        for paragraph in split_by_paragraph(pre_element.text):
            # Use a distinct name for the new node so the loop variable
            # holding the current <pre> element is not shadowed.
            span = etree.Element("span")
            span.text = str(element_from_text(text=paragraph))
            document_tree.append(span)
|
||||
|
||||
if self.stylesheet:
|
||||
if isinstance(self.parser, etree.HTMLParser):
|
||||
logger.warning(
|
||||
|
Loading…
x
Reference in New Issue
Block a user