Steve Canny 19f00b9fa4
fix(html): style (CSS) content appears in HTML text (#2132)
Fixes #1958.

`<style>` is invalid where it appears in the HTML of the WSJ page
mentioned by that issue, but "invalid" has little meaning in the HTML world
if Chrome accepts it.

In any case, we have no use for the contents of a `<style>` tag wherever
it appears, so it is safe enough for us to just strip all those tags. Note
that we do not want to also strip the *tail text*, which can contain text
we're interested in.
2023-11-21 04:22:13 +00:00

906 lines
27 KiB
Python

# pyright: reportPrivateUsage=false
import os
import pathlib
from typing import Dict, List, cast
import pytest
from lxml import etree
from unstructured.documents import html
from unstructured.documents.base import Page
from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
from unstructured.documents.html import (
HEADING_TAGS,
LIST_ITEM_TAGS,
SECTION_TAGS,
TABLE_TAGS,
TEXT_TAGS,
HTMLDocument,
HTMLNarrativeText,
HTMLTable,
HTMLTitle,
TagsMixin,
)
# -- absolute path of the directory that contains this test module --
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# -- HTML tag names as a list of bare strings ("a", "abbr", ...), derived from the
# -- "<tag><tag>..." shorthand by trimming the outer brackets and splitting on "><"
TAGS = (
    "<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
    "<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
    "<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
    "<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5>"
    "<h6><head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link>"
    "<main><map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option>"
    "<output><p><param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section>"
    "<select><small><source><span><strike><strong><style><sub><summary><sup><table><tbody><td>"
    "<template><textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
).strip("<>").split("><")

# -- void-element tag names, built with the same shorthand as `TAGS` --
VOID_TAGS = (
    "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
).strip("<>").split("><")
# -- concatenation of the text, heading, list-item and section tag lists --
INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS

# -- every tag in `TAGS` that is not an included, table, void or document-skeleton tag --
EXCLUDED_TAGS = list(
    filter(
        lambda tag: tag
        not in (INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"]),
        TAGS,
    )
)
# -- table-extraction behaviors ------------------------------------------------------------------
def test_it_can_parse_a_bare_bones_table_to_an_HTMLTable_element():
    """A table with no `<thead>`, `<tbody>`, or `<tfoot>` parses to a single `HTMLTable`."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        " <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )

    # -- unpacking raises unless the document yields exactly one element --
    [table] = HTMLDocument.from_string(markup).elements

    assert isinstance(table, HTMLTable)
    # -- `.text` is one flat string; row and cell boundaries are not represented --
    assert table.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- `.text_as_html` keeps the row/cell structure of the table --
    assert table.text_as_html == expected_html
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements():
    """Rows inside `<thead>`, `<tbody>` and `<tfoot>` all show up in `.text_as_html`.

    The expected HTML has one `<tr>` per source row, renders every `<th>` cell as `<td>`, and
    contains no `<thead>`/`<tbody>`/`<tfoot>` wrappers.
    """
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )

    # -- unpacking raises unless the document yields exactly one element --
    [table] = HTMLDocument.from_string(markup).elements

    assert isinstance(table, HTMLTable)
    assert table.text_as_html == expected_html
def test_it_does_not_emit_an_HTMLTable_element_for_a_table_with_no_text():
    """A table whose cells contain only whitespace produces no elements at all."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )

    elements = HTMLDocument.from_string(markup).elements

    assert elements == []
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table():
    """`_is_bulleted_table()` returns False for a table whose cells hold only whitespace."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )

    root = HTMLDocument.from_string(markup).document_tree
    assert root is not None
    table_elem = root.find(".//table")
    assert table_elem is not None

    assert html._is_bulleted_table(table_elem) is False
def test_it_provides_parseable_HTML_in_text_as_html():
    """`.text_as_html` can be re-parsed by lxml and serializes back to the same table."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    # -- the table is expected inside an <html><body> container, which is not part of
    # -- `.text_as_html` itself
    expected_serialization = (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )

    # -- unpacking raises unless the document yields exactly one element --
    [table] = HTMLDocument.from_string(markup).elements
    assert isinstance(table, HTMLTable)
    table_html = table.text_as_html
    assert table_html is not None

    # -- named `reparsed_root` so the imported `html` module is not shadowed --
    reparsed_root = etree.fromstring(table_html, etree.HTMLParser())
    assert reparsed_root is not None

    assert etree.tostring(reparsed_root, encoding=str) == expected_serialization
# -- element-suppression behaviors ---------------------------------------------------------------
def test_it_does_not_extract_text_in_script_tags():
    """No element parsed from `example-with-scripts.html` contains the text `function (`."""
    script_example_path = os.path.join(
        DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html"
    )
    elements = cast(List[Text], HTMLDocument.from_file(filename=script_example_path).elements)

    assert not any("function (" in element.text for element in elements)
def test_it_does_not_extract_text_in_style_tags():
    """CSS inside an inline `<style>` is dropped while the text following it is kept."""
    markup = (
        "<html>\n"
        "<body>\n"
        " <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )

    # -- unpacking raises unless the document yields exactly one element --
    [paragraph] = HTMLDocument.from_string(markup).elements

    assert isinstance(paragraph, Text)
    assert paragraph.text == "Lorem ipsum dolor"
# ------------------------------------------------------------------------------------------------
def test_parses_tags_correctly():
    """The first element's ancestor tags plus its own tag read `html > body > table`."""
    raw_html = """<html>
<body>
<table>
<tbody>
<tr>
<td><p>Hi there!</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""

    first = HTMLDocument.from_string(raw_html).elements[0]
    tag_path = first.ancestortags + (first.tag,)

    assert tag_path == ("html", "body", "table")
def test_has_table_ancestor():
    """`has_table_ancestor()` is truthy when "table" is among the element's ancestor tags."""
    in_table = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"])

    assert html.has_table_ancestor(in_table)
def test_has_no_table_ancestor():
    """`has_table_ancestor()` is falsy when no ancestor tag is "table"."""
    outside_table = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"])

    assert not html.has_table_ancestor(outside_table)
def test_read_without_skipping_table(monkeypatch):
    """With `skip_table=False` the table text survives cleaning as a `Table` element."""

    def _always_true(*args):
        return True

    # -- force every candidate text to be treated as possible narrative text --
    monkeypatch.setattr(html, "is_possible_narrative_text", _always_true)
    doc = """<html>
<body>
<table>
<tbody>
<tr>
<td><p>Hi there! I am Matt!</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""

    cleaned = HTMLDocument.from_string(doc).doc_after_cleaners(skip_table=False)
    first_element = cleaned.pages[0].elements[0]

    assert first_element == Table(text="Hi there! I am Matt!")
@pytest.mark.parametrize(
    ("doc", "expected"),
    [
        # NOTE(review): the closing tags below are mismatched (`</i></i>`); presumably
        # unintentional, left as-is so the parsed input is unchanged -- confirm.
        (
            "<p>Hi there <span>my name is</span> <b><i>Matt</i></i></p>",
            "Hi there my name is Matt",
        ),
        ("<p>I have a</p> tail", "I have a tail"),
    ],
)
def test_construct_text(doc, expected):
    """`_construct_text()` on the first `<p>` yields the expected flattened text."""
    paragraph = etree.fromstring(doc, etree.HTMLParser()).find(".//p")

    assert html._construct_text(paragraph) == expected
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        (
            "<p>Hello <strong>there</strong> I <em>am</em> a <b>very</b> <i>important</i> text</p>",
            "p",
            [
                {"text": "there", "tag": "strong"},
                {"text": "am", "tag": "em"},
                {"text": "very", "tag": "b"},
                {"text": "important", "tag": "i"},
            ],
        ),
        (
            "<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>",
            "p",
            [
                {"text": "list", "tag": "span"},
                {"text": "my favorite things", "tag": "b"},
                {"text": "favorite", "tag": "i"},
            ],
        ),
        (
            "<strong>A lone strong text!</strong>",
            "strong",
            [{"text": "A lone strong text!", "tag": "strong"}],
        ),
        ("<span>I have a</span> tail", "span", [{"text": "I have a", "tag": "span"}]),
        ("<p>Text with no emphasized runs</p> ", "p", []),
    ],
)
def test_get_emphasized_texts_from_tag(doc: str, root: str, expected: List[Dict[str, str]]):
    """`_get_emphasized_texts_from_tag()` returns the expected text/tag dicts for `root`."""
    tree = etree.fromstring(doc, etree.HTMLParser())
    root_elem = tree.find(f".//{root}")
    assert root_elem is not None

    assert html._get_emphasized_texts_from_tag(root_elem) == expected
def test_parse_nothing():
    """`_parse_tag()` returns None for an empty `<p>`."""
    doc = """<p></p>"""
    paragraph = etree.fromstring(doc, etree.HTMLParser()).find(".//p")

    assert html._parse_tag(paragraph) is None
def test_read_with_existing_pages():
    """A document built with `from_pages()` exposes exactly the pages it was given."""
    only_page = Page(number=0)

    document = HTMLDocument.from_pages([only_page])

    assert document.pages == [only_page]
def test_parse_not_anything(monkeypatch):
    """With the narrative and title checks patched to False, `_parse_tag()` gives `Text`."""
    # -- patch both classifier functions on the `html` module to always answer False --
    for classifier_name in ("is_narrative_tag", "is_possible_title"):
        monkeypatch.setattr(html, classifier_name, lambda *args: False)

    doc = """<p>This is nothing</p>"""
    paragraph = etree.fromstring(doc, etree.HTMLParser()).find(".//p")

    assert html._parse_tag(paragraph) == Text(text="This is nothing")
def test_parse_bullets(monkeypatch):
    """A `<p>` that starts with a bullet character parses to a `ListItem` without the bullet.

    The `monkeypatch` fixture is requested but not used in the body.
    """
    doc = """<p>● An excellent point!</p>"""
    paragraph = etree.fromstring(doc, etree.HTMLParser()).find(".//p")

    assert html._parse_tag(paragraph) == ListItem("An excellent point!")
def test_parse_tag_ignores_lonely_bullets():
    """`_parse_tag()` returns None for a `<p>` containing only a bullet character."""
    doc = """<p>●</p>"""
    paragraph = etree.fromstring(doc, etree.HTMLParser()).find(".//p")

    assert html._parse_tag(paragraph) is None
def test_parse_tag_ignores_stubs():
    """`_parse_tag()` returns None for a `<p>` containing only `$`."""
    doc = """<p>$</p>"""
    paragraph = etree.fromstring(doc, etree.HTMLParser()).find(".//p")

    assert html._parse_tag(paragraph) is None
def test_adjacent_spans_are_text_tags():
    """`_is_text_tag()` is True for a `<div>` whose children are two adjacent `<span>`s."""
    doc = """<div><span>&#8226;</span><span>A bullet!</span></div>"""
    container = etree.fromstring(doc, etree.HTMLParser()).find(".//div")

    assert html._is_text_tag(container) is True
def test_process_list_item_gets_next_section():
    """A bullets-only `<div>` takes its `ListItem` text from the `<div>` that follows it."""
    doc = """
<div>
<p>●</p>
<p>●</p>
</div>
<div>
<p>An excellent point!</p>
</div>
"""
    # -- `find()` returns the first `<div>`, i.e. the one holding only bullets --
    bullets_div = etree.fromstring(doc, etree.HTMLParser()).find(".//div")

    list_item, _ = html._process_list_item(bullets_div, max_predecessor_len=10)

    assert list_item == ListItem(text="An excellent point!")
def test_get_bullet_descendants():
    """`_get_bullet_descendants()` returns one descendant for these two separately parsed divs."""
    parser_markup = ("<div><p>●</p><p>●</p></div>", "<div><p>An excellent point!</p></div>")

    # -- each `<div>` comes from its own, independently parsed tree --
    bullets_div, text_div = (
        etree.fromstring(markup, etree.HTMLParser()).find(".//div") for markup in parser_markup
    )

    assert len(html._get_bullet_descendants(bullets_div, text_div)) == 1
def test_process_list_item_returns_none_if_next_blank():
    """`_process_list_item()` yields no element when nothing follows the bullets `<div>`."""
    doc = """
<div>
<p>●</p>
<p>●</p>
</div>
"""
    bullets_div = etree.fromstring(doc, etree.HTMLParser()).find(".//div")

    list_item, _ = html._process_list_item(bullets_div)

    assert list_item is None
def test_process_list_item_returns_none_if_next_has_no_text():
    """`_process_list_item()` yields no element when the following `<div>` has no text."""
    doc = """
<div>
<p>●</p>
<p>●</p>
</div>
<div>
</div>
"""
    bullets_div = etree.fromstring(doc, etree.HTMLParser()).find(".//div")

    # -- the bullets-only div itself is still recognized as a list-item tag --
    assert html.is_list_item_tag(bullets_div) is True

    list_item, _ = html._process_list_item(bullets_div)
    assert list_item is None
def test_process_list_item_ignores_deep_divs():
    """No element results for a five-bullet `<div>` when `max_predecessor_len` is 2."""
    doc = """
<div>
<p>●</p>
<p>●</p>
<p>●</p>
<p>●</p>
<p>●</p>
</div>
<div>
<p>An excellent point!</p>
</div>
"""
    bullets_div = etree.fromstring(doc, etree.HTMLParser()).find(".//div")

    list_item, _ = html._process_list_item(bullets_div, max_predecessor_len=2)

    assert list_item is None
def test_read_html_doc(tmpdir, monkeypatch):
    """Read an HTML file and check the elements left after cleaning.

    The document is written to a file in the per-test temporary directory, loaded with
    `HTMLDocument.from_file()`, and cleaned with `skip_headers_and_footers=True` and
    `skip_table=True`. The cleaned document is expected to have two pages: the first with the
    two title/narrative pairs that precede the table, the second with the title and narrative
    text that follow the `<hr>`. The header, footer, table and post-footer texts do not appear
    in either expected element list.

    The `monkeypatch` fixture is accepted but unused; it is kept so the signature is unchanged.
    """
    TITLE1 = "A Great and Glorious Section"
    SECTION1 = "Dear Leader is the best. He is such a wonderful engineer!"
    TITLE2 = "Another Magnificent Title"
    SECTION2 = "The last section is a title because of its capitalization patterns!"
    TABLE_SECTION = "Skip me because I'm in a table"
    TITLE3 = "A New Beginning"
    SECTION3 = "Here is the start of a new page."
    doc = f"""<html>
<body>
<header>
<p>Here is a header. We want to ignore anything that is in this section.</p>
</header>
<h1>{TITLE1}</h1>
<p>{SECTION1}</p>
<p></p>
<p>{TITLE2}</p>
<p><b>{SECTION2}</b></p>
<table>
<tbody>
<tr>
<td><p>{TABLE_SECTION}</p></td>
</tr>
</tbody>
</table>
<hr>
<h2>{TITLE3}</h2>
<div>{SECTION3}</div>
<footer>
<p>Here is a footer. We want to ignore anything that is in this section</p>
</footer>
<div>
<p>Let's ignore anything after the footer too since it's probably garbage.</p>
</div>
</body>
</html>"""
    # -- write inside the per-test temporary directory itself; `tmpdir.dirname` is its parent,
    # -- which is not private to this test
    filename = os.path.join(str(tmpdir), "sample-doc.html")
    # -- explicit encoding so the bytes on disk do not depend on the platform default --
    with open(filename, "w", encoding="utf-8") as f:
        f.write(doc)

    html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
        skip_headers_and_footers=True,
        skip_table=True,
    )

    pages = html_document.pages
    assert len(pages) == 2
    assert all(isinstance(page, Page) for page in pages)

    page_one = pages[0]
    assert len(page_one.elements) == 4
    assert page_one.elements == [
        Title(text=TITLE1),
        NarrativeText(text=SECTION1),
        Title(text=TITLE2),
        NarrativeText(text=SECTION2),
    ]

    page_two = pages[1]
    assert len(page_two.elements) == 2
    assert page_two.elements == [
        Title(text=TITLE3),
        NarrativeText(text=SECTION3),
    ]
def test_find_main():
    """`_find_main()` returns the `<main>` element when the document has one."""
    html_str = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<main>
<article>
<section>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</section>
<section>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</section>
</article>
</main>
</body>"""
    tree = HTMLDocument.from_string(html_str).document_tree

    found = html._find_main(tree)

    assert found.tag == "main"
def test_find_main_returns_doc_when_main_not_present():
    """Without a `<main>` element, `_find_main()` returns the `html` root element."""
    html_str = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<article>
<section>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</section>
<section>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</section>
</article>
</body>"""
    tree = HTMLDocument.from_string(html_str).document_tree

    found = html._find_main(tree)

    assert found.tag == "html"
def test_find_articles():
    """`_find_articles()` returns one entry per `<article>` in the document."""
    html_str = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<article>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</article>
<article>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</article>
</body>"""
    tree = HTMLDocument.from_string(html_str).document_tree

    found = html._find_articles(tree)

    assert len(found) == 2
def test_find_articles_returns_doc_when_none_present():
    """Without any `<article>`, `_find_articles()` still returns a single entry."""
    html_str = """<header></header>
<body>
<p>Lots preamble stuff yada yada yada</p>
<section>
<h2>A Wonderful Section!</h2>
<p>Look at this amazing section!</p>
</section>
<section>
<h2>Another Wonderful Section!</h2>
<p>Look at this other amazing section!</p>
</section>
</body>"""
    tree = HTMLDocument.from_string(html_str).document_tree

    found = html._find_articles(tree)

    assert len(found) == 1
def test_include_headers_and_footers(sample_doc):
    """With `skip_headers_and_footers=False`, the second page keeps all three elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
    second_page_elements = cleaned.pages[1].elements

    assert len(second_page_elements) == 3
def test_include_table_text(sample_doc):
    """With `skip_table=False`, the first page keeps both of its elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_table=False)
    first_page_elements = cleaned.pages[0].elements

    assert len(first_page_elements) == 2
@pytest.mark.parametrize("tag", [t for t in TEXT_TAGS if t not in TABLE_TAGS])
def test_tag_types(tag):
    """Text wrapped in a non-table text tag produces exactly one element on the first page."""
    html_str = f"""
<body>
<{tag}>
There is some text here.
</{tag}>
</body>
"""
    first_page = HTMLDocument.from_string(html_str).pages[0]

    assert len(first_page.elements) == 1
@pytest.mark.parametrize("tag", EXCLUDED_TAGS)
def test_exclude_tag_types(tag):
    """Text wrapped in an excluded tag produces a document with no pages."""
    html_str = f"""
<body>
<{tag}>
There is some text here.
</{tag}>
</body>
"""
    pages = HTMLDocument.from_string(html_str).pages

    assert len(pages) == 0
def test_tag_types_table(sample_doc):
    """With `skip_table=True`, the first page of `sample_doc` still reports two elements."""
    cleaned = sample_doc.doc_after_cleaners(skip_table=True)
    first_page_elements = cleaned.pages[0].elements

    assert len(first_page_elements) == 2
def test_nested_text_tags():
    """Text nested in two non-table text tags yields one element, not one per tag."""
    # -- the first two text tags that are not also table tags: outer wraps inner --
    outer_tag, inner_tag = [t for t in TEXT_TAGS if t not in TABLE_TAGS][:2]
    html_str = f"""
<body>
<{outer_tag}>
<{inner_tag}>
There is some text here.
</{inner_tag}>
</{outer_tag}>
</body>
"""
    cleaned = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)

    assert len(cleaned.pages[0].elements) == 1
def test_containers_with_text_are_processed():
    """Text placed directly inside nested `<div>` containers is extracted in document order."""
    html_str = """<div dir=3D"ltr">Hi All,<div><br></div>
<div>Get excited for our first annual family day!</div>
<div>Best.<br clear=3D"all">
<div><br></div>
-- <br>
<div dir=3D"ltr">
<div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
</div>
<div>Doylestown, PA 18901</div>
<div><br></div>
</div>
</div>
</div>
</div>
</div>"""
    expected_elements = [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]

    elements = HTMLDocument.from_string(html_str).elements

    assert elements == expected_elements
def test_html_grabs_bulleted_text_in_tags():
    """Each `<li>` of an `<ol>` becomes a `ListItem`."""
    html_str = """<html>
<body>
<ol>
<li>Happy Groundhog's day!</li>
<li>Looks like six more weeks of winter ...</li>
</ol>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]

    elements = HTMLDocument.from_string(html_str).elements

    assert elements == expected_items
def test_html_grabs_bulleted_text_in_paras():
    """Paragraphs whose text starts with a bullet entity become `ListItem`s without it."""
    html_str = """<html>
<body>
<p>
<span>&#8226; Happy Groundhog's day!</span>
</p>
<p>
<span>&#8226; Looks like six more weeks of winter ...</span>
</p>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]

    elements = HTMLDocument.from_string(html_str).elements

    assert elements == expected_items
def test_bulletized_bulleted_text_from_table():
    """`_bulleted_text_from_table()` returns one `ListItem` per bullet row of the table."""
    doc = """<html>
<body>
<table>
<tbody>
<tr>
<td>•</td>
<td><p>Happy Groundhog's day!</p></td>
</tr>
<tr>
<td>•</td>
<td><p>Looks like six more weeks of winter ...</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    table_elem = etree.fromstring(doc, etree.HTMLParser()).find(".//table")

    assert html._bulleted_text_from_table(table_elem) == expected_items
def test_html_grabs_bulleted_text_in_tables():
    """A table laid out as bullet-cell / text-cell rows is parsed into `ListItem`s."""
    html_str = """<html>
<body>
<table>
<tbody>
<tr>
<td>&#8226;</td>
<td><p>Happy Groundhog's day!</p></td>
</tr>
<tr>
<td>&#8226;</td>
<td><p>Looks like six more weeks of winter ...</p></td>
</tr>
</tbody>
</table>
</body>
</html>"""
    expected_items = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]

    elements = HTMLDocument.from_string(html_str).elements

    assert elements == expected_items
def test_raises_error_no_tag():
    """`TagsMixin` raises `TypeError` when `tag` is None and when it is omitted."""
    # -- first with an explicit `tag=None`, then with no arguments at all --
    for kwargs in ({"tag": None}, {}):
        with pytest.raises(TypeError):
            TagsMixin(**kwargs)
def test_raises_error_wrong_elements(monkeypatch, sample_doc):
    """`doc_after_cleaners()` raises `ValueError` when a page holds a plain string."""
    bad_page = Page(0)
    bad_page.elements = ["this should def not be a string"]
    # -- swap the document's pages for the single malformed page --
    monkeypatch.setattr(sample_doc, "_pages", [bad_page])

    with pytest.raises(ValueError):
        sample_doc.doc_after_cleaners()
def test_filter_in_place():
    """`doc_after_cleaners(inplace=True)` removes the table element from the document itself."""
    html_doc = """
<table><tbody><tr><td>A table thing.</td></tr></tbody></table>
<p>A non-table thing</p>
"""
    document = HTMLDocument.from_string(html_doc)
    # -- before cleaning: one table element and one paragraph element --
    assert len(document.elements) == 2

    document.doc_after_cleaners(skip_table=True, inplace=True)

    assert len(document.elements) == 1
def test_joins_tag_text_correctly():
    """Text split by an inline `<i>` in mid-word is joined without an inserted space."""
    raw_html = "<p>Hello again peet mag<i>ic</i>al</p>"

    first = HTMLDocument.from_string(raw_html).elements[0]

    assert first.text == "Hello again peet magical"
def test_sample_doc_with_emoji():
    """The emoji paragraph is extracted as the emoji or as its byte-string rendering."""
    raw_html = """
<html charset="unicode">
<p>Hello again 😀</p>
</html>
"""
    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    accepted_texts = ("Hello again ð\x9f\x98\x80", "Hello again 😀")

    first = HTMLDocument.from_string(raw_html).elements[0]

    assert first.text in accepted_texts
def test_only_plain_text_in_body():
    """Bare text directly inside `<body>` is extracted as the first element."""
    raw_html = "<body>Hello</body>"

    elements = HTMLDocument.from_string(raw_html).elements

    assert elements[0].text == "Hello"
def test_plain_text_before_anything_in_body():
    """Bare `<body>` text comes first, followed by the text of the `<p>` after it."""
    raw_html = "<body>Hello<p>World</p></body>"

    elements = HTMLDocument.from_string(raw_html).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
def test_line_break_in_container():
    """A `<br/>` inside a `<div>` splits its text into two separate elements."""
    raw_html = "<div>Hello<br/>World</div>"

    elements = HTMLDocument.from_string(raw_html).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
@pytest.mark.parametrize("tag", TEXT_TAGS)
def test_line_break_in_text_tag(tag):
    """A `<br/>` inside any text tag splits its text into two separate elements."""
    raw_html = f"<{tag}>Hello<br/>World</{tag}>"

    elements = HTMLDocument.from_string(raw_html).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
# -- module-level fixtures -----------------------------------------------------------------------
@pytest.fixture()
def sample_doc():
    """Two-page `HTMLDocument` assembled directly from elements rather than parsed HTML.

    Page 0 holds a title whose ancestor tags place it inside a table, plus a narrative
    paragraph. Page 1 holds a `header` title, a body paragraph and a `footer` title.
    """
    first_page = Page(0)
    first_page.elements = [
        HTMLTitle(
            "I'm a title in a table.",
            tag="p",
            ancestortags=("table", "tbody", "tr", "td"),
        ),
        HTMLNarrativeText("I'm some narrative text", tag="p", ancestortags=()),
    ]

    second_page = Page(1)
    second_page.elements = [
        HTMLTitle("I'm a header", tag="header", ancestortags=()),
        HTMLNarrativeText("Body text", tag="p", ancestortags=()),
        HTMLTitle("I'm a footer", tag="footer", ancestortags=()),
    ]

    return HTMLDocument.from_pages([first_page, second_page])