fix: Remove JavaScript from HTML reader output (#313)

* Fixes an error that sometimes caused JavaScript to appear in the output of `partition_html`.
2025-12-02 02:00:29 +00:00 · 2023-02-28 23:24:24 +01:00 · 2023-02-28 23:24:24 +01:00 · 350c4230ee
commit 350c4230ee
parent 1ccbc05b10
5 changed files with 1191 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.5.1-dev1
+## 0.5.1-dev2

 ### Enhancements

@@ -6,6 +6,7 @@

 ### Fixes

+* Fixes an error causing JavaScript to appear in the output of `partition_html` sometimes.
 * Fix several issues with the `requires_dependencies` decorator, including the error message
  and how it was used.

--- a/example-docs/example-with-scripts.html
+++ b/example-docs/example-with-scripts.html
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -1,4 +1,5 @@
 import os
+import pathlib

 import pytest
 from lxml import etree
@@ -23,6 +24,8 @@ from unstructured.documents.html import (
    TagsMixin,
 )

+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
 TAGS = (
    "<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
    "<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
@@ -632,3 +635,9 @@ def test_joins_tag_text_correctly():
    doc = HTMLDocument.from_string(raw_html)
    el = doc.elements[0]
    assert el.text == "Hello again peet magical"
+
+
+def test_sample_doc_with_scripts():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
+    doc = HTMLDocument.from_file(filename=filename)
+    assert all(["function (" not in element.text for element in doc.elements])
--- a/unstructured/version.py
+++ b/unstructured/version.py
@@ -1 +1 @@
-__version__ = "0.5.1-dev1"  # pragma: no cover
+__version__ = "0.5.1-dev2"  # pragma: no cover
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -265,12 +265,11 @@ def is_narrative_tag(text: str, tag: str) -> bool:
 def _construct_text(tag_elem: etree.Element) -> str:
    """Extracts text from a text tag element."""
    text = ""
-    for item in tag_elem.itertext():
-        if item:
-            text += item
-
-    if tag_elem.tail:
-        text = text + tag_elem.tail
+    for item in tag_elem.iter():
+        if item.text and item.tag != "script":
+            text += item.text
+            if item.tail:
+                text += item.tail

    text = replace_unicode_quotes(text)
    return text.strip()