mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-18 11:33:09 +00:00
fix(html): style (CSS) content appears in HTML text (#2132)
Fixes #1958. `<style>` is invalid where it appears in the HTML of the WSJ page mentioned by that issue, but "invalid" has little meaning in the HTML world if Chrome accepts it. In any case, we have no use for the contents of a `<style>` tag wherever it appears, so it is safe enough for us to just strip all those tags. Note that we do not want to also strip the *tail text*, which can contain text we're interested in.
This commit is contained in:
parent
ccda93b0d1
commit
19f00b9fa4
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.11.1-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider text node of a `<style>` element as textual content.
|
||||
|
||||
## 0.11.0
|
||||
|
||||
### Enhancements
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Dict, List
|
||||
from typing import Dict, List, cast
|
||||
|
||||
import pytest
|
||||
from lxml import etree
|
||||
@ -212,6 +212,31 @@ def test_it_provides_parseable_HTML_in_text_as_html():
|
||||
)
|
||||
|
||||
|
||||
# -- element-suppression behaviors ---------------------------------------------------------------
|
||||
|
||||
|
||||
def test_it_does_not_extract_text_in_script_tags():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
|
||||
doc = HTMLDocument.from_file(filename=filename)
|
||||
assert all("function (" not in element.text for element in cast(List[Text], doc.elements))
|
||||
|
||||
|
||||
def test_it_does_not_extract_text_in_style_tags():
|
||||
html_str = (
|
||||
"<html>\n"
|
||||
"<body>\n"
|
||||
" <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
|
||||
"</body>\n"
|
||||
"</html>"
|
||||
)
|
||||
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
|
||||
(element,) = html_document.elements
|
||||
assert isinstance(element, Text)
|
||||
assert element.text == "Lorem ipsum dolor"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -818,12 +843,6 @@ def test_joins_tag_text_correctly():
|
||||
assert el.text == "Hello again peet magical"
|
||||
|
||||
|
||||
def test_sample_doc_with_scripts():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
|
||||
doc = HTMLDocument.from_file(filename=filename)
|
||||
assert all("function (" not in element.text for element in doc.elements)
|
||||
|
||||
|
||||
def test_sample_doc_with_emoji():
|
||||
raw_html = """
|
||||
<html charset="unicode">
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.11.0" # pragma: no cover
|
||||
__version__ = "0.11.1-dev0" # pragma: no cover
|
||||
|
@ -150,7 +150,7 @@ class HTMLDocument(XMLDocument):
|
||||
return self._pages
|
||||
logger.info("Reading document ...")
|
||||
pages: List[Page] = []
|
||||
etree.strip_elements(self.document_tree, ["script"])
|
||||
etree.strip_elements(self.document_tree, ["script", "style"], with_tail=False)
|
||||
root = _find_main(self.document_tree)
|
||||
|
||||
articles = _find_articles(root, assemble_articles=self.assembled_articles)
|
||||
|
Loading…
x
Reference in New Issue
Block a user