From 19f00b9fa4e402ca816999984e17395af36e18b1 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 20 Nov 2023 20:22:13 -0800 Subject: [PATCH] fix(html): style (CSS) content appears in HTML text (#2132) Fixes #1958. `Lorem ipsum dolor

\n" + "\n" + "" + ) + + html_document = HTMLDocument.from_string(html_str) + + (element,) = html_document.elements + assert isinstance(element, Text) + assert element.text == "Lorem ipsum dolor" + + # ------------------------------------------------------------------------------------------------ @@ -818,12 +843,6 @@ def test_joins_tag_text_correctly(): assert el.text == "Hello again peet magical" -def test_sample_doc_with_scripts(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html") - doc = HTMLDocument.from_file(filename=filename) - assert all("function (" not in element.text for element in doc.elements) - - def test_sample_doc_with_emoji(): raw_html = """ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3ec084411..b4d190a05 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.0" # pragma: no cover +__version__ = "0.11.1-dev0" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index eb055b4dc..9b715943e 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -150,7 +150,7 @@ class HTMLDocument(XMLDocument): return self._pages logger.info("Reading document ...") pages: List[Page] = [] - etree.strip_elements(self.document_tree, ["script"]) + etree.strip_elements(self.document_tree, ["script", "style"], with_tail=False) root = _find_main(self.document_tree) articles = _find_articles(root, assemble_articles=self.assembled_articles)