fix(html): style (CSS) content appears in HTML text (#2132)

Fixes #1958. `<style>` is invalid where it appears in the HTML of the WSJ page mentioned by that issue, but "invalid" has little meaning in the HTML world if Chrome accepts it. In any case, we have no use for the contents of a `<style>` tag wherever it appears, so it is safe enough for us to just strip all those tags. Note we do not want to also strip the *tail text*, which can contain text we're interested in.
2025-12-08 04:55:36 +00:00 · 2023-11-20 20:22:13 -08:00 · 2023-11-20 20:22:13 -08:00 · 19f00b9fa4
commit 19f00b9fa4
parent ccda93b0d1
4 changed files with 38 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
+## 0.11.1-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider text node of a `<style>` element as textual content.
+
 ## 0.11.0

 ### Enhancements
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@ -2,7 +2,7 @@

 import os
 import pathlib
-from typing import Dict, List
+from typing import Dict, List, cast

 import pytest
 from lxml import etree
@ -212,6 +212,31 @@ def test_it_provides_parseable_HTML_in_text_as_html():
    )


+# -- element-suppression behaviors ---------------------------------------------------------------
+
+
+def test_it_does_not_extract_text_in_script_tags():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
+    doc = HTMLDocument.from_file(filename=filename)
+    assert all("function (" not in element.text for element in cast(List[Text], doc.elements))
+
+
+def test_it_does_not_extract_text_in_style_tags():
+    html_str = (
+        "<html>\n"
+        "<body>\n"
+        "  <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
+        "</body>\n"
+        "</html>"
+    )
+
+    html_document = HTMLDocument.from_string(html_str)
+
+    (element,) = html_document.elements
+    assert isinstance(element, Text)
+    assert element.text == "Lorem ipsum dolor"
+
+
 # ------------------------------------------------------------------------------------------------


@ -818,12 +843,6 @@ def test_joins_tag_text_correctly():
    assert el.text == "Hello again peet magical"


-def test_sample_doc_with_scripts():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
-    doc = HTMLDocument.from_file(filename=filename)
-    assert all("function (" not in element.text for element in doc.elements)
-
-
 def test_sample_doc_with_emoji():
    raw_html = """
    <html charset="unicode">
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.11.0"  # pragma: no cover
+__version__ = "0.11.1-dev0"  # pragma: no cover
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@ -150,7 +150,7 @@ class HTMLDocument(XMLDocument):
            return self._pages
        logger.info("Reading document ...")
        pages: List[Page] = []
-        etree.strip_elements(self.document_tree, ["script"])
+        etree.strip_elements(self.document_tree, ["script", "style"], with_tail=False)
        root = _find_main(self.document_tree)

        articles = _find_articles(root, assemble_articles=self.assembled_articles)