mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-02 02:00:29 +00:00
fix: Remove JavaScript from HTML reader output (#313)
* Fixes an error causing JavaScript to appear in the output of `partition_html` sometimes.
This commit is contained in:
parent
1ccbc05b10
commit
350c4230ee
@ -1,4 +1,4 @@
|
||||
## 0.5.1-dev1
|
||||
## 0.5.1-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fixes an error causing JavaScript to appear in the output of `partition_html` sometimes.
|
||||
* Fix several issues with the `requires_dependencies` decorator, including the error message
|
||||
and how it was used.
|
||||
|
||||
|
||||
1174
example-docs/example-with-scripts.html
Normal file
1174
example-docs/example-with-scripts.html
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,5 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
from lxml import etree
|
||||
@ -23,6 +24,8 @@ from unstructured.documents.html import (
|
||||
TagsMixin,
|
||||
)
|
||||
|
||||
# Absolute, resolved path of the directory containing this test module; tests
# build paths to the repository's example documents relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
TAGS = (
|
||||
"<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
|
||||
"<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
|
||||
@ -632,3 +635,9 @@ def test_joins_tag_text_correctly():
|
||||
doc = HTMLDocument.from_string(raw_html)
|
||||
el = doc.elements[0]
|
||||
assert el.text == "Hello again peet magical"
|
||||
|
||||
|
||||
def test_sample_doc_with_scripts():
    """Parse the sample page that embeds scripts and check no JavaScript leaks.

    The check is textual: no parsed element's text may contain ``"function ("``.
    """
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    filename = os.path.join(example_docs, "example-with-scripts.html")
    doc = HTMLDocument.from_file(filename=filename)
    assert not any("function (" in element.text for element in doc.elements)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.1-dev1" # pragma: no cover
|
||||
__version__ = "0.5.1-dev2" # pragma: no cover
|
||||
|
||||
@ -265,12 +265,11 @@ def is_narrative_tag(text: str, tag: str) -> bool:
|
||||
def _construct_text(tag_elem: etree.Element) -> str:
    """Extracts text from a text tag element.

    Collects the text of ``tag_elem`` and all of its descendants in document
    order, skipping text that sits directly inside a ``script`` element so
    JavaScript source does not leak into the output. Tail text (the text that
    follows an element's closing tag) is kept for every element, including
    ``tag_elem`` itself, whose tail is appended last.

    Args:
        tag_elem: Root element of the subtree to extract text from.

    Returns:
        The concatenated text, passed through ``replace_unicode_quotes`` and
        stripped of leading and trailing whitespace.
    """
    parts = []
    # Explicit stack instead of recursion so deeply nested markup cannot hit
    # the recursion limit. Each entry is (element, children_already_queued).
    pending = [(tag_elem, False)]
    while pending:
        elem, children_done = pending.pop()
        if children_done:
            # An element's tail comes after all of its descendants' text, so
            # it is only emitted on the second visit. Emitting it together
            # with ``elem.text`` would scramble nested content.
            if elem.tail:
                parts.append(elem.tail)
            continue

        # NOTE(review): comment nodes also expose ``.text`` and are not
        # filtered here; confirm whether the parser strips them upstream.
        if elem.text and elem.tag != "script":
            parts.append(elem.text)

        pending.append((elem, True))
        # Reverse so that popping from the stack visits children left to right.
        pending.extend((child, False) for child in reversed(list(elem)))

    text = replace_unicode_quotes("".join(parts))
    return text.strip()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user