feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag * chore: update CHANGELOG.md * style: black format xml.py * Added tests for html with pre tag * remove skip test, update parse pre tag * fix style * chore: spell check * chore: update changelog & version * chore: update ingest test fixtures * chore: add exception handling if `element.text` is `None` in `_read_xml` * test: add more sanity testing on the `.text` content of the element(s) * refactor: move the conditional logic for <pre> outside of the `try/except` block --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: christinestraub <christinemstraub@gmail.com>
2025-06-27 02:30:08 +00:00 · 2023-06-27 21:52:39 +03:00 · 2023-06-27 21:52:39 +03:00 · 58e988e110
commit 58e988e110
parent 078e2aa116
7 changed files with 8579 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.7.10-dev0
+## 0.7.10-dev1

 ### Enhancements

@ -8,6 +8,8 @@

 ### Fixes

+* Fix pre tag parsing for `partition_html`
+
 ## 0.7.9

 ### Enhancements
@ -198,7 +200,7 @@

 ### Enhancements

-* XLS support from auto partiton
+* XLS support from auto partition

 ### Features

--- a/example-docs/fake-html-pre.htm
+++ b/example-docs/fake-html-pre.htm
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -775,6 +775,17 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    assert elements[0].metadata.filetype == "text/csv"


+def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
+    elements = partition(filename=filename)
+
+    assert len(elements) > 0
+    assert PageBreak() not in elements
+    assert clean_extra_whitespace(elements[0].text) == "[107th Congress Public Law 56]"
+    assert isinstance(elements[0], Title)
+    assert elements[0].metadata.filetype == "text/html"
+    assert elements[0].metadata.filename == "fake-html-pre.htm"
+
+
 def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
    assert partition(filename=filename) == []

--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@ -6,6 +6,7 @@ import pytest
 import requests
 from requests.models import Response

+from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import PageBreak, Title
 from unstructured.partition.html import partition_html

@ -263,3 +264,15 @@ def test_partition_html_can_turn_off_assemble_articles():
 """
    elements = partition_html(text=html_text, html_assemble_articles=False)
    assert elements[-1] == Title("This is outside of the article.")
+
+
+def test_partition_html_with_pre_tag():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-html-pre.htm")
+    elements = partition_html(filename=filename)
+
+    assert len(elements) > 0
+    assert PageBreak() not in elements
+    assert clean_extra_whitespace(elements[0].text) == "[107th Congress Public Law 56]"
+    assert isinstance(elements[0], Title)
+    assert elements[0].metadata.filetype == "text/html"
+    assert elements[0].metadata.filename == "fake-html-pre.htm"
--- a/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json
+++ b/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json
@ -58,5 +58,235 @@
      "page_number": 1
    },
    "text": "Downloadify Invoke Script For This Page"
+  },
+  {
+    "type": "Title",
+    "element_id": "a6f18a30c8de3b1436133823a93f50db",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 2
+    },
+    "text": "Downloadify.create('downloadify',{"
+  },
+  {
+    "type": "Title",
+    "element_id": "edbfc1f9e429bd016ab71cd365adad8a",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 3
+    },
+    "text": "filename: function(){"
+  },
+  {
+    "type": "Title",
+    "element_id": "fff8b2c3a9b06101fe7f64eef5c4ab2d",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 4
+    },
+    "text": "return document.getElementById('filename').value;"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "e52a1813c46049513e6beb0b5c9e2aca",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 5
+    },
+    "text": "},"
+  },
+  {
+    "type": "Title",
+    "element_id": "877efc801047c9b9b245c2febc5c4cf4",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 6
+    },
+    "text": "data: function(){"
+  },
+  {
+    "type": "Title",
+    "element_id": "7a384fc4bff83eec4ba849c592f38de2",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 7
+    },
+    "text": "return document.getElementById('data').value;"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "e52a1813c46049513e6beb0b5c9e2aca",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 8
+    },
+    "text": "},"
+  },
+  {
+    "type": "Title",
+    "element_id": "91a68ed0c4ad18401f490396c820c497",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 9
+    },
+    "text": "onComplete: function(){"
+  },
+  {
+    "type": "Title",
+    "element_id": "a9b9ab32700c5a217e4f3f544e35fb43",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 10
+    },
+    "text": "alert('Your File Has Been Saved!');"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "e52a1813c46049513e6beb0b5c9e2aca",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 11
+    },
+    "text": "},"
+  },
+  {
+    "type": "Title",
+    "element_id": "d4dd03a884778f7100dc6d5bbbf8b5b4",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 12
+    },
+    "text": "onCancel: function(){"
+  },
+  {
+    "type": "NarrativeText",
+    "element_id": "96c9e5794fbc5652b2da41753be3401f",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 13
+    },
+    "text": "alert('You have cancelled the saving of this file.');"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "e52a1813c46049513e6beb0b5c9e2aca",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 14
+    },
+    "text": "},"
+  },
+  {
+    "type": "Title",
+    "element_id": "c7f068e18ced43a5a5b566bb3139be83",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 15
+    },
+    "text": "onError: function(){"
+  },
+  {
+    "type": "NarrativeText",
+    "element_id": "4ce0312c6b9cefd778c77dcda1daa357",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 16
+    },
+    "text": "alert('You must put something in the File Contents or there will be nothing to save!');"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "e52a1813c46049513e6beb0b5c9e2aca",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 17
+    },
+    "text": "},"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "4e6cb015a10ef85a94cbf38f0736c963",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 18
+    },
+    "text": "swf: 'media/downloadify.swf',"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "547c1eea609aae64271813c3cc061d03",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 19
+    },
+    "text": "downloadImage: 'images/download.png',"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "08f092710daeddd051b6c9ed12f8a77d",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 20
+    },
+    "text": "width: 100,"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "4664ad91130a49e27fc2f874b5d08a68",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 21
+    },
+    "text": "height: 30,"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "577ebe0897958e450d22132ba908c640",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 22
+    },
+    "text": "transparent: true,"
+  },
+  {
+    "type": "Title",
+    "element_id": "e68f9c269ffa29b81d6ec9a8ebe47817",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 23
+    },
+    "text": "append: false"
+  },
+  {
+    "type": "UncategorizedText",
+    "element_id": "29576b54e255e3c948eea5b5904fa38b",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 24
+    },
+    "text": "});"
  }
 ]
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.7.10-dev0"  # pragma: no cover
+__version__ = "0.7.10-dev1"  # pragma: no cover
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@ -5,6 +5,10 @@ from lxml import etree
 from unstructured.documents.base import Document, Page
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.logger import logger
+from unstructured.partition.text import (
+    element_from_text,
+    split_by_paragraph,
+)

 VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]

@ -67,6 +71,7 @@ class XMLDocument(Document):
                document_tree = etree.fromstring(content, self.parser)
                if document_tree is None:
                    raise ValueError("document_tree is None")
+
            # NOTE(robinson) - The following ValueError occurs with unicode strings. In that
            # case, we call back to encoding the string and passing in bytes.
            #     ValueError: Unicode strings with encoding declaration are not supported.
@ -74,6 +79,17 @@ class XMLDocument(Document):
            except ValueError:
                document_tree = etree.fromstring(content.encode(), self.parser)

+            if "<pre>" and "</pre>" in content:
+                tree = etree.HTML(content)
+                for element in tree.xpath("//pre"):
+                    if not element.text:
+                        continue
+                    text_content = split_by_paragraph(element.text)
+                    for text in text_content:
+                        element = etree.Element("span")
+                        element.text = str(element_from_text(text=text))
+                        document_tree.append(element)
+
            if self.stylesheet:
                if isinstance(self.parser, etree.HTMLParser):
                    logger.warning(