From 19f00b9fa4e402ca816999984e17395af36e18b1 Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Mon, 20 Nov 2023 20:22:13 -0800
Subject: [PATCH] fix(html): style (CSS) content appears in HTML text (#2132)

Fixes #1958.

`<style>` is invalid where it appears in the HTML of the WSJ page
mentioned by that issue, but "invalid" has little meaning in the HTML
world if Chrome accepts it.

In any case, we have no use for the contents of a `<style>` tag wherever
it appears, so it is safe enough for us to just strip all those tags.
Note that we do not want to also strip the *tail text*, which can contain
text we're interested in.
---
 CHANGELOG.md                             | 10 +++++++
 test_unstructured/documents/test_html.py | 33 +++++++++++++++++++-----
 unstructured/__version__.py              |  2 +-
 unstructured/documents/html.py           |  2 +-
 4 files changed, 38 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ec8bf5fd..3f9994469 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.11.1-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider the text node of a `<style>` element as textual content.
+
 ## 0.11.0
 
 ### Enhancements
diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
index 636502ce2..346427527 100644
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -2,7 +2,7 @@
 
 import os
 import pathlib
-from typing import Dict, List
+from typing import Dict, List, cast
 
 import pytest
 from lxml import etree
@@ -212,6 +212,31 @@ def test_it_provides_parseable_HTML_in_text_as_html():
     )
 
 
+# -- element-suppression behaviors ---------------------------------------------------------------
+
+
+def test_it_does_not_extract_text_in_script_tags():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
+    doc = HTMLDocument.from_file(filename=filename)
+    assert all("function (" not in element.text for element in cast(List[Text], doc.elements))
+
+
+def test_it_does_not_extract_text_in_style_tags():
+    html_str = (
+        "<html>\n"
+        "<body>\n"
+        "  <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
+        "</body>\n"
+        "</html>"
+    )
+
+    html_document = HTMLDocument.from_string(html_str)
+
+    (element,) = html_document.elements
+    assert isinstance(element, Text)
+    assert element.text == "Lorem ipsum dolor"
+
+
 # ------------------------------------------------------------------------------------------------
 
 
@@ -818,12 +843,6 @@ def test_joins_tag_text_correctly():
     assert el.text == "Hello again peet magical"
 
 
-def test_sample_doc_with_scripts():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
-    doc = HTMLDocument.from_file(filename=filename)
-    assert all("function (" not in element.text for element in doc.elements)
-
-
 def test_sample_doc_with_emoji():
     raw_html = """
     <html charset="unicode">
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 3ec084411..b4d190a05 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.0"  # pragma: no cover
+__version__ = "0.11.1-dev0"  # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index eb055b4dc..9b715943e 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -150,7 +150,7 @@ class HTMLDocument(XMLDocument):
             return self._pages
         logger.info("Reading document ...")
         pages: List[Page] = []
-        etree.strip_elements(self.document_tree, ["script"])
+        etree.strip_elements(self.document_tree, ["script", "style"], with_tail=False)
         root = _find_main(self.document_tree)
 
         articles = _find_articles(root, assemble_articles=self.assembled_articles)