fix(html): style (CSS) content appears in HTML text (#2132)

Fixes #1958.

`<style>` is invalid where it appears in the HTML of the WSJ page
mentioned by that issue, but "invalid" has little meaning in the HTML
world if Chrome accepts it.

In any case, we have no use for the contents of a `<style>` tag wherever
it appears, so it is safe enough for us to just strip all those tags. Note
that we do not want to also strip the *tail text*, which can contain text
we're interested in.
This commit is contained in:
Steve Canny 2023-11-20 20:22:13 -08:00 committed by GitHub
parent ccda93b0d1
commit 19f00b9fa4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 38 additions and 9 deletions

View File

@ -1,3 +1,13 @@
## 0.11.1-dev0
### Enhancements
### Features
### Fixes
* **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. The text node of a `<style>` element is no longer considered textual content.
## 0.11.0
### Enhancements

View File

@ -2,7 +2,7 @@
import os
import pathlib
from typing import Dict, List
from typing import Dict, List, cast
import pytest
from lxml import etree
@ -212,6 +212,31 @@ def test_it_provides_parseable_HTML_in_text_as_html():
)
# -- element-suppression behaviors ---------------------------------------------------------------
def test_it_does_not_extract_text_in_script_tags():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
doc = HTMLDocument.from_file(filename=filename)
assert all("function (" not in element.text for element in cast(List[Text], doc.elements))
def test_it_does_not_extract_text_in_style_tags():
html_str = (
"<html>\n"
"<body>\n"
" <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
"</body>\n"
"</html>"
)
html_document = HTMLDocument.from_string(html_str)
(element,) = html_document.elements
assert isinstance(element, Text)
assert element.text == "Lorem ipsum dolor"
# ------------------------------------------------------------------------------------------------
@ -818,12 +843,6 @@ def test_joins_tag_text_correctly():
assert el.text == "Hello again peet magical"
def test_sample_doc_with_scripts():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
doc = HTMLDocument.from_file(filename=filename)
assert all("function (" not in element.text for element in doc.elements)
def test_sample_doc_with_emoji():
raw_html = """
<html charset="unicode">

View File

@ -1 +1 @@
__version__ = "0.11.0" # pragma: no cover
__version__ = "0.11.1-dev0" # pragma: no cover

View File

@ -150,7 +150,7 @@ class HTMLDocument(XMLDocument):
return self._pages
logger.info("Reading document ...")
pages: List[Page] = []
etree.strip_elements(self.document_tree, ["script"])
etree.strip_elements(self.document_tree, ["script", "style"], with_tail=False)
root = _find_main(self.document_tree)
articles = _find_articles(root, assemble_articles=self.assembled_articles)