fix: Remove JavaScript from HTML reader output (#313)

* Fixes an error causing JavaScript to appear in the output of `partition_html` sometimes.
This commit is contained in:
Tom Aarsen 2023-02-28 23:24:24 +01:00 committed by GitHub
parent 1ccbc05b10
commit 350c4230ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1191 additions and 8 deletions

View File

@ -1,4 +1,4 @@
## 0.5.1-dev1
## 0.5.1-dev2
### Enhancements
@ -6,6 +6,7 @@
### Fixes
* Fixes an error causing JavaScript to appear in the output of `partition_html` sometimes.
* Fix several issues with the `requires_dependencies` decorator, including the error message
and how it was used.

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,5 @@
import os
import pathlib
import pytest
from lxml import etree
@ -23,6 +24,8 @@ from unstructured.documents.html import (
TagsMixin,
)
# Resolved directory containing this test module; the sample-document test
# below builds the path to its example HTML file relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
TAGS = (
"<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
"<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
@ -632,3 +635,9 @@ def test_joins_tag_text_correctly():
doc = HTMLDocument.from_string(raw_html)
el = doc.elements[0]
assert el.text == "Hello again peet magical"
def test_sample_doc_with_scripts():
    """No element parsed from example-with-scripts.html may contain "function (".

    The substring is used as a marker for JavaScript source that leaked into
    the extracted element text.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
    doc = HTMLDocument.from_file(filename=filename)
    # A generator (rather than a list) lets all() stop at the first offending
    # element instead of materialising a result for every element first.
    assert all("function (" not in element.text for element in doc.elements)

View File

@ -1 +1 @@
# NOTE(review): the +/- diff markers were lost in this view. Per the "-1 +1"
# hunk header, this first assignment is presumably the removed line, i.e. the
# version string before the commit.
__version__ = "0.5.1-dev1" # pragma: no cover
# Presumably the added line: the version string after the commit, matching the
# dev1 -> dev2 bump shown in the changelog hunk.
__version__ = "0.5.1-dev2" # pragma: no cover

View File

@ -265,12 +265,11 @@ def is_narrative_tag(text: str, tag: str) -> bool:
def _construct_text(tag_elem: etree.Element) -> str:
"""Extracts text from a text tag element."""
text = ""
# NOTE(review): the +/- diff markers and the indentation were lost in this
# view. Judging by the hunk header and the commit message, the next five lines
# are presumably the REMOVED implementation: it concatenated every non-empty
# string yielded by itertext() and then appended the element's own tail.
for item in tag_elem.itertext():
if item:
text += item
if tag_elem.tail:
text = text + tag_elem.tail
# NOTE(review): the following five lines are presumably the ADDED
# implementation. It walks tag_elem.iter() instead, skipping the .text of any
# node whose tag equals "script" while still appending every visited node's
# tail (the text that follows the node, including text after a script).
# TODO confirm: assuming etree is lxml.etree, iter() also yields comment nodes
# whose tag is not a string, so their text would pass the "script" check.
for item in tag_elem.iter():
if item.text and item.tag != "script":
text += item.text
if item.tail:
text += item.tail
# replace_unicode_quotes is defined elsewhere; presumably it normalises quote
# characters. The result is then stripped of leading/trailing whitespace.
text = replace_unicode_quotes(text)
return text.strip()