# 1478 lines
# 52 KiB
# Python
# Raw Normal View History

# pyright: reportPrivateUsage=false
"""Test suite for `unstructured.partition.html.partition` module."""
from __future__ import annotations
import io
import pathlib
from tempfile import SpooledTemporaryFile
from typing import Any
import pytest
from lxml import etree
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
example_doc_text,
function_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
CompositeElement,
EmailAddress,
ListItem,
NarrativeText,
Table,
TableChunk,
Text,
Title,
)
from unstructured.documents.html import HTMLDocument
from unstructured.file_utils.encoding import read_txt_file
from unstructured.partition.html import partition_html
from unstructured.partition.html.partition import HtmlPartitionerOptions, _HtmlPartitioner
# ================================================================================================
# SOURCE HTML LOADING BEHAVIORS
# ================================================================================================
# -- document-source (filename, file, text, url) -------------------------------------------------
def test_partition_html_accepts_a_file_path():
    """Partitioning by file path records the file name and directory on every element."""
    source_path = example_doc_path("example-10k-1p.html")

    partitioned = partition_html(source_path)

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.filename == "example-10k-1p.html"
    for element in partitioned:
        assert element.metadata.file_directory == example_doc_path("")
def test_user_without_file_write_permission_can_partition_html(tmp_path: pathlib.Path):
    """A file whose mode has been set to read-only (0o444) can still be partitioned."""
    html_path = tmp_path / "example-10k-readonly.html"
    html_path.write_text(example_doc_text("example-10k-1p.html"))
    # -- drop all write permission bits before handing the path to the partitioner --
    html_path.chmod(0o444)
    resolved_path = str(html_path.resolve())

    partitioned = partition_html(filename=resolved_path)

    assert len(partitioned) > 0
def test_partition_html_accepts_a_file_like_object():
    """A binary file object is accepted; elements produced from it carry no filename."""
    html_path = example_doc_path("example-10k-1p.html")

    with open(html_path, "rb") as html_file:
        partitioned = partition_html(file=html_file)

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.filename is None
def test_partition_html_accepts_an_html_str():
    """HTML supplied as a `str` via the `text` argument produces at least one element."""
    html_text = example_doc_text("example-10k-1p.html")

    partitioned = partition_html(text=html_text)

    assert len(partitioned) > 0
def test_partition_html_accepts_a_url_to_an_HTML_document(requests_get_: Mock):
    """A URL source is fetched with a single GET and its HTML response is partitioned."""
    fake_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "text/html"},
    )
    requests_get_.return_value = fake_response

    partitioned = partition_html(url="https://fake.url")

    # -- no caller-supplied headers, so an empty dict is forwarded to the mocked GET --
    requests_get_.assert_called_once_with("https://fake.url", headers={}, verify=True)
    assert len(partitioned) > 0
def test_partition_html_raises_when_no_path_or_file_or_text_or_url_is_specified():
    """Calling `partition_html()` with no document source raises `ValueError`."""
    expected_message = "Exactly one of filename, file, text, or url must be sp"

    with pytest.raises(ValueError, match=expected_message):
        partition_html()
# -- encoding for filename, file, and text -------------------------------------------------------
@pytest.mark.parametrize(
    "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"]
)
def test_partition_html_from_filename_raises_when_explicit_encoding_is_wrong(filename: str):
    """Partitioning *by file path* with a wrong explicit `encoding` raises `UnicodeDecodeError`.

    The document is passed as a path rather than an open file object so this test exercises the
    file-path loading code; the file-object variants of this behavior have their own tests.
    """
    with pytest.raises(UnicodeDecodeError):
        partition_html(example_doc_path(filename), encoding="utf-8")
@pytest.mark.parametrize(
    "filename",
    ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_filename_default_encoding(filename: str):
    """With no `encoding` argument, each example file partitions successfully from its path."""
    partitioned = partition_html(example_doc_path(filename))

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.filename == filename
    # -- only the German-language document has a full expected-output fixture --
    is_german_doc = filename == "fake-html-lang-de.html"
    if is_german_doc:
        assert partitioned == EXPECTED_OUTPUT_LANGUAGE_DE
@pytest.mark.parametrize(
    "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"]
)
def test_partition_html_from_file_raises_encoding_error(filename: str):
    """An in-memory byte stream decoded with a wrong explicit `encoding` raises an error."""
    with open(example_doc_path(filename), "rb") as source:
        stream = io.BytesIO(source.read())
    expected_message = "'utf-8' codec can't decode byte 0xff in posi"

    with pytest.raises(UnicodeDecodeError, match=expected_message):
        partition_html(file=stream, encoding="utf-8")
@pytest.mark.parametrize(
    "filename",
    ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_file_default_encoding(filename: str):
    """With no `encoding` argument, each example file partitions from an open file object."""
    html_path = example_doc_path(filename)

    with open(html_path, "rb") as html_file:
        partitioned = partition_html(file=html_file)

    assert len(partitioned) > 0
    # -- only the German-language document has a full expected-output fixture --
    is_german_doc = filename == "fake-html-lang-de.html"
    if is_german_doc:
        assert partitioned == EXPECTED_OUTPUT_LANGUAGE_DE
@pytest.mark.parametrize(
    "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"]
)
def test_partition_html_from_file_rb_raises_encoding_error(filename: str):
    """A file opened in "rb" mode and decoded with a wrong explicit `encoding` raises."""
    expected_message = "'utf-8' codec can't decode byte 0xff in posi"
    html_path = example_doc_path(filename)

    with pytest.raises(UnicodeDecodeError, match=expected_message):
        with open(html_path, "rb") as html_file:
            partition_html(file=html_file, encoding="utf-8")
@pytest.mark.parametrize(
    "filename",
    ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_file_rb_default_encoding(filename: str):
    """A file opened in "rb" mode partitions successfully when no `encoding` is given."""
    with open(example_doc_path(filename), "rb") as binary_file:
        partitioned = partition_html(file=binary_file)

    assert len(partitioned) > 0
    # -- only the German-language document has a full expected-output fixture --
    if filename != "fake-html-lang-de.html":
        return
    assert partitioned == EXPECTED_OUTPUT_LANGUAGE_DE
def test_partition_html_processes_chinese_chracters():
    """Chinese text inside a paragraph comes through unchanged as the first element's text."""
    partitioned = partition_html(text="<html><div><p>每日新闻</p></div></html>")

    first_element = partitioned[0]
    assert first_element.text == "每日新闻"
def test_emoji_appears_with_emoji_utf8_code():
    """A numeric character reference for an emoji is rendered as the emoji character."""
    html_text = '<html charset="utf-8"><p>Hello &#128512;</p></html>'

    partitioned = partition_html(text=html_text)

    assert partitioned == [Title("Hello 😀")]
# -- partition_html() from URL -------------------------------------------------------------------
def test_partition_html_from_url_raises_on_failure_response_status_code(requests_get_: Mock):
    """A 500 response from the mocked GET results in a `ValueError` naming the status code."""
    failing_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=500,
        headers={"Content-Type": "text/html"},
    )
    requests_get_.return_value = failing_response
    expected_message = "Error status code on GET of provided URL: 500"

    with pytest.raises(ValueError, match=expected_message):
        partition_html(url="https://fake.url")
def test_partition_html_from_url_raises_on_response_of_wrong_content_type(requests_get_: Mock):
    """A successful response whose Content-Type is not text/html results in a `ValueError`."""
    json_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    requests_get_.return_value = json_response
    expected_message = "Expected content type text/html. Got application/json."

    with pytest.raises(ValueError, match=expected_message):
        partition_html(url="https://fake.url")
def test_partition_from_url_includes_provided_headers_in_request(requests_get_: Mock):
    """Headers passed to `partition_html()` are forwarded unchanged to the GET request."""
    request_headers = {"User-Agent": "test"}
    requests_get_.return_value = FakeResponse(
        text="<html><head></head><body><p>What do I know? Who needs to know it?</p></body></html>",
        status_code=200,
        headers={"Content-Type": "text/html"},
    )

    partition_html(url="https://example.com", headers=request_headers)

    requests_get_.assert_called_once_with(
        "https://example.com", headers={"User-Agent": "test"}, verify=True
    )
# ================================================================================================
# PARSING TESTS
# ================================================================================================
def test_partition_html_on_ideas_page():
    """The "ideas-page" example partitions to one `Table` holding the whole essay text."""
    expected_table = Table(
        "January 2023 ( Someone fed my essays into GPT to make something that could answer"
        "\nquestions based on them, then asked it where good ideas come from. The"
        "\nanswer was ok, but not what I would have said. This is what I would have said.)"
        " The way to get new ideas is to notice anomalies: what seems strange,"
        "\nor missing, or broken? You can see anomalies in everyday life (much"
        "\nof standup comedy is based on this), but the best place to look for"
        "\nthem is at the frontiers of knowledge. Knowledge grows fractally."
        "\nFrom a distance its edges look smooth, but when you learn enough"
        "\nto get close to one, you'll notice it's full of gaps. These gaps"
        "\nwill seem obvious; it will seem inexplicable that no one has tried"
        "\nx or wondered about y. In the best case, exploring such gaps yields"
        "\nwhole new fractal buds.",
    )

    partitioned = partition_html(example_doc_path("ideas-page.html"))

    assert len(partitioned) == 1
    table = partitioned[0]
    assert table == expected_table
    # -- no emphasis or link metadata is recorded, but an HTML rendering of the table is --
    metadata = table.metadata
    assert metadata.emphasized_text_contents is None
    assert metadata.link_urls is None
    assert metadata.text_as_html is not None
# -- element-suppression behaviors ---------------------------------------------------------------
def test_it_does_not_extract_text_in_script_tags(opts_args: dict[str, Any]):
    """No element loaded from the scripts example contains the text "function ("."""
    opts_args["file_path"] = example_doc_path("example-with-scripts.html")

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    for element in document.elements:
        assert "function (" not in element.text
def test_it_does_not_extract_text_in_style_tags(opts_args: dict[str, Any]):
    """CSS inside a `<style>` tag is dropped while the paragraph text around it is kept."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # -- unpacking also verifies there is exactly one element --
    [only_element] = document.elements
    assert isinstance(only_element, Text)
    assert only_element.text == "Lorem ipsum dolor"
# -- table parsing behaviors ---------------------------------------------------------------------
def test_it_can_parse_a_bare_bones_table_to_a_Table_element(opts_args: dict[str, Any]):
    """A table without `<thead>`, `<tbody>`, or `<tfoot>` wrappers becomes one `Table`."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        " <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_text_as_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # -- unpacking verifies exactly one element was produced; it must be a Table --
    [table] = document.elements
    assert isinstance(table, Table)
    # -- cell text is flattened into one string with no row or cell boundaries marked --
    assert table.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- the HTML rendering is longer but keeps the row/cell structure --
    assert table.metadata.text_as_html == expected_text_as_html
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements(
    opts_args: dict[str, Any]
):
    """Rows inside `<thead>`, `<tbody>`, and `<tfoot>` all appear in `.text_as_html`.

    The expected rendering is a flat sequence of `<tr>` rows directly under `<table>`, in
    document order, with every `<th>` cell of the source emitted as a `<td>` cell.
    """
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_text_as_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    [table] = document.elements
    assert isinstance(table, Table)
    assert table.metadata.text_as_html == expected_text_as_html
def test_it_does_not_emit_a_Table_element_for_a_table_with_no_text(opts_args: dict[str, Any]):
    """A table whose cells contain only whitespace produces no elements at all."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert document.elements == []
def test_it_provides_parseable_HTML_in_text_as_html(opts_args: dict[str, Any]):
    """The `.text_as_html` of a `Table` parses with lxml and serializes back unchanged."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    # -- lxml wraps the fragment in <html><body>, which `.text_as_html` itself lacks --
    expected_serialization = (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )
    opts_args["text"] = html_text

    opts = HtmlPartitionerOptions(**opts_args)
    document = HTMLDocument.load(opts)

    [table] = document.elements
    assert isinstance(table, Table)
    table_html = table.metadata.text_as_html
    assert table_html is not None
    parsed_root = etree.fromstring(table_html, etree.HTMLParser())
    assert parsed_root is not None
    assert etree.tostring(parsed_root, encoding=str) == expected_serialization
@pytest.mark.parametrize(
    ("tag", "expected_text_as_html"),
    [
        ("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
        ("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
    ],
)
def test_partition_html_parses_table_without_tbody(tag: str, expected_text_as_html: str):
    """A table whose only section is `<thead>` or `<tfoot>` still yields its row as HTML."""
    html_text = (
        "<table>\n"
        f" <{tag}>\n"
        " <tr><th>Header 1</th><th>Header 2</th></tr>\n"
        f" </{tag}>\n"
        "</table>"
    )

    partitioned = partition_html(text=html_text)

    first_element = partitioned[0]
    assert first_element.metadata.text_as_html == expected_text_as_html
def test_partition_html_reduces_a_nested_table_to_its_text_placed_in_the_cell_that_contains_it(
    opts_args: dict[str, Any]
):
    """A `<table>` nested in a `<td>` is replaced by its text within that outer cell."""
    # -- note the <table> elements nested inside <td> elements --
    nested_tables_html = (
        "<table>\n"
        " <tr>\n"
        " <td>\n"
        " <table>\n"
        " <tr><td>foo</td><td>bar</td></tr>\n"
        " <tr><td>baz</td><td>bng</td></tr>\n"
        " </table>\n"
        " </td>\n"
        " <td>\n"
        " <table>\n"
        " <tr><td>fizz</td><td>bang</td></tr>\n"
        " </table>\n"
        " </td>\n"
        " </tr>\n"
        "</table>"
    )
    expected_text_as_html = "<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
    opts = HtmlPartitionerOptions(**opts_args)
    document = HTMLDocument(nested_tables_html, opts)
    outer_table_elem = document._main.find(".//table")
    assert outer_table_elem is not None

    table = document._parse_Table_from_table_elem(outer_table_elem)

    assert isinstance(table, Table)
    assert table.text == "foo bar baz bng fizz bang"
    assert table.metadata.text_as_html == expected_text_as_html
def test_partition_html_accommodates_tds_with_child_elements(opts_args: dict[str, Any]):
    """Cells whose text is wrapped in child elements (`<p>`, `<span>`, ...) yield that text.

    The markup is modeled on an SEC 10-K filing.
    """
    filing_table_html = (
        "<table>\n"
        " <tr>\n"
        " <td></td>\n"
        " <td></td>\n"
        " </tr>\n"
        " <tr>\n"
        " <td>\n"
        " <p>\n"
        " <span>\n"
        ' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
        ' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
        ' format="ixt-sec:boolballotbox">\n'
        " <span>&#9746;</span>\n"
        " </ix:nonNumeric>\n"
        " </span>\n"
        " </p>\n"
        " </td>\n"
        " <td>\n"
        " <p>\n"
        " <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
        " ACT OF 1934</span>\n"
        " </p>\n"
        " </td>\n"
        " </tr>\n"
        "</table>\n"
    )
    expected_text = (
        "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
    )
    expected_text_as_html = (
        "<table>"
        "<tr><td></td><td></td></tr>"
        "<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
        " EXCHANGE ACT OF 1934</td></tr>"
        "</table>"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    document = HTMLDocument(filing_table_html, opts)
    table_elem = document._main.find(".//table")
    assert table_elem is not None

    table = document._parse_Table_from_table_elem(table_elem)

    assert isinstance(table, Table)
    assert table.text == expected_text
    assert table.metadata.text_as_html == expected_text_as_html
# -- other element-specific behaviors ------------------------------------------------------------
def test_partition_html_recognizes_h1_to_h3_as_Title_except_in_edge_cases():
    """Headings become `Title` unless their content is an email address or a list item."""
    html_text = (
        "<p>This is a section of narrative text, it's long, flows and has meaning</p>\n"
        "<h1>This heading is a title, even though it's long, flows and has meaning</h1>\n"
        "<h2>A heading that is at the second level</h2>\n"
        "<h3>Finally, the third heading</h3>\n"
        "<h2>December 1-17, 2017</h2>\n"
        "<h3>email@example.com</h3>\n"
        "<h3><li>- bulleted item</li></h3>\n"
    )
    expected_elements = [
        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
        Title("This heading is a title, even though it's long, flows and has meaning"),
        Title("A heading that is at the second level"),
        Title("Finally, the third heading"),
        Title("December 1-17, 2017"),
        EmailAddress("email@example.com"),
        ListItem("- bulleted item"),
    ]

    partitioned = partition_html(text=html_text)

    assert partitioned == expected_elements
def test_partition_html_with_pre_tag():
    """The `<pre>` example file partitions to narrative text with file metadata attached."""
    partitioned = partition_html(example_doc_path("fake-html-pre.htm"))

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.category != "PageBreak"
    first_element = partitioned[0]
    normalized_text = clean_extra_whitespace(first_element.text)
    assert normalized_text.startswith("[107th Congress Public Law 56]")
    assert isinstance(first_element, NarrativeText)
    assert first_element.metadata.filetype == "text/html"
    assert first_element.metadata.filename == "fake-html-pre.htm"
def test_pre_tag_parsing_respects_order():
    """Elements from `<pre>` and `<div>` tags are emitted in document order."""
    html_text = (
        "<pre>The Big Brown Bear</pre>\n"
        "<div>The big brown bear is growling.</div>\n"
        "<pre>The big brown bear is sleeping.</pre>\n"
        "<div>The Big Blue Bear</div>\n"
    )
    expected_elements = [
        Title("The Big Brown Bear"),
        NarrativeText("The big brown bear is growling."),
        NarrativeText("The big brown bear is sleeping."),
        Title("The Big Blue Bear"),
    ]

    partitioned = partition_html(text=html_text)

    assert partitioned == expected_elements
def test_partition_html_b_tag_parsing():
    """Lines separated by `<br>` inside a `<pre>` containing `<b>` tags become separate texts."""
    html_text = (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<body>\n"
        "<div>\n"
        " <h1>Header 1</h1>\n"
        " <p>Text</p>\n"
        " <h2>Header 2</h2>\n"
        " <pre>\n"
        " <b>Param1</b> = Y<br><b>Param2</b> = 1<br><b>Param3</b> = 2<br><b>Param4</b> = A\n"
        " <br><b>Param5</b> = A,B,C,D,E<br><b>Param6</b> = 7<br><b>Param7</b> = Five<br>\n"
        " </pre>\n"
        "</div>\n"
        "</body>\n"
        "</html>\n"
    )
    expected_joined_text = (
        "Header 1|Text|Header 2|Param1 = Y|Param2 = 1|Param3 = 2|Param4 = A|"
        "Param5 = A,B,C,D,E|Param6 = 7|Param7 = Five"
    )

    partitioned = partition_html(text=html_text)

    element_texts = [element.text for element in partitioned]
    assert "|".join(element_texts) == expected_joined_text
def test_partition_html_tag_tail_parsing():
    """Text before and after a nested block inside a `<div>` is captured, in order."""
    html_text = (
        "<html>\n"
        "<body>\n"
        "<div>\n"
        " Head\n"
        " <div><span>Nested</span></div>\n"
        " Tail\n"
        "</div>\n"
        "</body>\n"
        "</html>\n"
    )

    partitioned = partition_html(text=html_text)

    stripped_texts = (str(element).strip() for element in partitioned)
    assert "|".join(stripped_texts) == "Head|Nested|Tail"
# -- parsing edge cases --------------------------------------------------------------------------
def test_partition_html_from_text_works_with_empty_string():
    """An empty HTML string partitions to an empty list rather than raising."""
    partitioned = partition_html(text="")

    assert partitioned == []
def test_nested_text_tags(opts_args: dict[str, Any]):
    """Text nested in `<a>` inside `<p>` produces exactly one element."""
    html_text = (
        "<body>\n"
        " <p>\n"
        " <a>\n"
        " There is some text here.\n"
        " </a>\n"
        " </p>\n"
        "</body>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(document.elements) == 1
def test_containers_with_text_are_processed(opts_args: dict[str, Any]):
    """Text placed directly in nested `<div>` containers is extracted in document order."""
    html_text = (
        '<div dir=3D"ltr">Hi All,<div><br></div>\n'
        " <div>Get excited for our first annual family day!</div>\n"
        ' <div>Best.<br clear=3D"all">\n'
        " <div><br></div>\n"
        " -- <br>\n"
        ' <div dir=3D"ltr">\n'
        ' <div dir=3D"ltr">Dino the Datasaur<div>\n'
        " Unstructured Technologies<br>\n"
        " <div>Data Scientist</div>\n"
        " <div>Doylestown, PA 18901</div>\n"
        " <div><br></div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        "</div>\n"
    )
    expected_elements = [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Text(text="\n -- "),
        Title(text="Dino the Datasaur"),
        Title(text="\n Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert document.elements == expected_elements
def test_html_grabs_bulleted_text_in_tags(opts_args: dict[str, Any]):
    """Each `<li>` of an ordered list becomes a `ListItem`."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <ol>\n"
        " <li>Happy Groundhog's day!</li>\n"
        " <li>Looks like six more weeks of winter ...</li>\n"
        " </ol>\n"
        " </body>\n"
        "</html>\n"
    )
    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert document.elements == expected_elements
def test_html_grabs_bulleted_text_in_paras(opts_args: dict[str, Any]):
    """A paragraph starting with a bullet character becomes a `ListItem` without the bullet."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <p>\n"
        " <span>&#8226; Happy Groundhog's day!</span>\n"
        " </p>\n"
        " <p>\n"
        " <span>&#8226; Looks like six more weeks of winter ...</span>\n"
        " </p>\n"
        " </body>\n"
        "</html>\n"
    )
    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert document.elements == expected_elements
def test_joins_tag_text_correctly(opts_args: dict[str, Any]):
    """Text split by an inline `<i>` tag mid-word is joined without inserting spaces."""
    opts_args["text"] = "<p>Hello again peet mag<i>ic</i>al</p>"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    first_element = document.elements[0]
    assert first_element.text == "Hello again peet magical"
def test_sample_doc_with_emoji(opts_args: dict[str, Any]):
    """An emoji in the source text survives loading, in one of two accepted encodings."""
    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    accepted_texts = ["Hello again ð\x9f\x98\x80", "Hello again 😀"]
    opts_args["text"] = '<html charset="unicode">\n<p>Hello again 😀</p>\n</html>'

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert document.elements[0].text in accepted_texts
def test_only_plain_text_in_body(opts_args: dict[str, Any]):
    """Bare text directly inside `<body>` is captured as the first element."""
    opts_args["text"] = "<body>Hello</body>"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    first_element = document.elements[0]
    assert first_element.text == "Hello"
def test_plain_text_before_anything_in_body(opts_args: dict[str, Any]):
    """Bare text preceding a `<p>` in `<body>` becomes its own element ahead of the paragraph."""
    opts_args["text"] = "<body>Hello<p>World</p></body>"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    loaded = document.elements
    assert loaded[0].text == "Hello"
    assert loaded[1].text == "World"
def test_line_break_in_container(opts_args: dict[str, Any]):
    """A `<br/>` inside a `<div>` splits the surrounding text into two elements."""
    opts_args["text"] = "<div>Hello<br/>World</div>"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    loaded = document.elements
    assert loaded[0].text == "Hello"
    assert loaded[1].text == "World"
@pytest.mark.parametrize("tag", ["del", "form", "noscript"])
def test_exclude_tag_types(tag: str, opts_args: dict[str, Any]):
    """Text inside `<del>`, `<form>`, or `<noscript>` produces no elements."""
    html_text = f"<body>\n <{tag}>\n There is some text here.\n </{tag}>\n</body>\n"
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(document.elements) == 0
# ================================================================================================
# OTHER ARGS
# ================================================================================================
# -- `chunking_strategy` arg ---------------------------------------------------------------------
def test_partition_html_can_chunk_while_partitioning():
    """Chunking via `chunking_strategy` matches chunking the partitioned elements afterward."""
    source_path = example_doc_path("example-10k-1p.html")
    chunk_types = (CompositeElement, Table, TableChunk)

    chunked_inline = partition_html(source_path, chunking_strategy="by_title")
    chunked_afterward = chunk_by_title(partition_html(source_path))

    for chunk in chunked_inline:
        assert isinstance(chunk, chunk_types)
    assert chunked_inline == chunked_afterward
# -- `include_metadata` arg ----------------------------------------------------------------------
def test_partition_html_from_filename_can_suppress_metadata():
    """With `include_metadata=False`, every element's metadata serializes to an empty dict."""
    source_path = example_doc_path("example-10k-1p.html")

    partitioned = partition_html(source_path, include_metadata=False)

    for element in partitioned:
        assert element.metadata.to_dict() == {}
# -- `skip_headers_and_footers` arg --------------------------------------------------------------
def test_partition_html_can_skip_headers_and_footers():
    """With `skip_headers_and_footers=True`, `<header>` and `<footer>` content is omitted."""
    html_text = (
        "<html>\n"
        " <header>\n"
        " <p>Header</p>\n"
        " </header>\n"
        " <body>\n"
        " <h1>My First Heading</h1>\n"
        " <p>It was a dark and stormy night. No one was around.</p>\n"
        " </body>\n"
        " <footer>\n"
        " <p>Footer</p>\n"
        " </footer>\n"
        "</html>\n"
    )
    expected_elements = [
        Title("My First Heading"),
        NarrativeText("It was a dark and stormy night. No one was around."),
    ]

    partitioned = partition_html(text=html_text, skip_headers_and_footers=True)

    assert partitioned == expected_elements
# -- `unique_element_ids` arg --------------------------------------------------------------------
def test_all_element_ids_are_unique():
    """No two elements share an id, even for a document containing duplicate elements."""
    partitioned = partition_html(example_doc_path("fake-html-with-duplicate-elements.html"))

    element_ids = [element.id for element in partitioned]
    distinct_ids = set(element_ids)
    assert len(element_ids) == len(distinct_ids)
def test_element_ids_are_deterministic():
    """Partitioning the same document twice yields the same element ids in the same order.

    The document is located with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be run from.
    """
    file_path = example_doc_path("fake-html-with-duplicate-elements.html")

    ids = [e.id for e in partition_html(file_path)]
    ids_2 = [e.id for e in partition_html(file_path)]

    assert ids == ids_2
# ================================================================================================
# METADATA BEHAVIORS
# ================================================================================================
# -- .metadata.category_depth + parent_id --------------------------------------------------------
def test_partition_html_records_hierarchy_metadata():
    """`category_depth` and `parent_id` are recorded as each element's own text describes.

    Each expectation row below is (element class, text, category_depth, has_parent). Elements
    that have a parent are expected to reference the heading, which is the second element.
    """
    html_text = (
        "<html>\n"
        " <p>Preamble gets no category_depth or parent_id</p>\n"
        " <h1>Heading gets category_depth but no parent_id</h1>\n"
        " <p>Body paragraph gets parent_id but no category_depth</p>\n"
        " <ul>\n"
        " <li>List item gets category_depth and parent_id</li>\n"
        " <li>Second list item gets category_depth and parent_id</li>\n"
        " </ul>\n"
        " <p>Body paragraph after list gets parent_id but no category_depth</p>\n"
        "</html>\n"
    )
    expectations = [
        (NarrativeText, "Preamble gets no category_depth or parent_id", None, False),
        (Title, "Heading gets category_depth but no parent_id", 0, False),
        (NarrativeText, "Body paragraph gets parent_id but no category_depth", None, True),
        (ListItem, "List item gets category_depth and parent_id", 1, True),
        (ListItem, "Second list item gets category_depth and parent_id", 1, True),
        (
            NarrativeText,
            "Body paragraph after list gets parent_id but no category_depth",
            None,
            True,
        ),
    ]

    partitioned = partition_html(text=html_text)

    assert len(partitioned) == 6
    for index, (element_cls, text, depth, has_parent) in enumerate(expectations):
        element = partitioned[index]
        assert isinstance(element, element_cls)
        assert element.text == text
        if depth is None:
            assert element.metadata.category_depth is None
        else:
            assert element.metadata.category_depth == depth
        if has_parent:
            assert element.metadata.parent_id == partitioned[1].id
        else:
            assert element.metadata.parent_id is None
# -- .metadata.emphasis --------------------------------------------------------------------------
def test_partition_html_grabs_emphasized_texts():
    """Emphasized text runs and their tag names are recorded in element metadata.

    Each expectation row below is (expected element, emphasized contents, emphasized tags); a
    `None` entry means the corresponding metadata field is expected to be `None`.
    """
    html_text = (
        "<html>\n"
        " <p>Hello there I am a very <strong>important</strong> text!</p>\n"
        " <p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>\n"
        " <ul>\n"
        " <li><em>Parrots</em></li>\n"
        " <li>Dogs</li>\n"
        " </ul>\n"
        " <span>A lone span text!</span>\n"
        "</html>\n"
    )
    expectations = [
        (NarrativeText("Hello there I am a very important text!"), ["important"], ["strong"]),
        (
            NarrativeText("Here is a list of my favorite things"),
            ["list", "my favorite things", "favorite"],
            ["span", "b", "i"],
        ),
        (ListItem("Parrots"), ["Parrots"], ["em"]),
        (ListItem("Dogs"), None, None),
        (Title("A lone span text!"), ["A lone span text!"], ["span"]),
    ]

    partitioned = partition_html(text=html_text)

    for index, (expected_element, contents, tags) in enumerate(expectations):
        element = partitioned[index]
        assert element == expected_element
        if contents is None:
            assert element.metadata.emphasized_text_contents is None
        else:
            assert element.metadata.emphasized_text_contents == contents
        if tags is None:
            assert element.metadata.emphasized_text_tags is None
        else:
            assert element.metadata.emphasized_text_tags == tags
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_html_from_filename_uses_source_filename_for_metadata_by_default():
    """Without `metadata_filename`, elements carry the source file's name and directory."""
    source_path = example_doc_path("example-10k-1p.html")
    expected_directory = example_doc_path("")

    partitioned = partition_html(source_path)

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.filename == "example-10k-1p.html"
    for element in partitioned:
        assert element.metadata.file_directory == expected_directory
def test_partition_html_from_filename_prefers_metadata_filename():
    """An explicit `metadata_filename` overrides the name of the source file."""
    source_path = example_doc_path("example-10k-1p.html")

    partitioned = partition_html(source_path, metadata_filename="test")

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.filename == "test"
def test_partition_html_from_file_prefers_metadata_filename():
    """An explicit `metadata_filename` is recorded when partitioning a file object."""
    html_path = example_doc_path("example-10k-1p.html")

    with open(html_path, "rb") as html_file:
        partitioned = partition_html(file=html_file, metadata_filename="test")

    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.filename == "test"
# -- .metadata.languages -------------------------------------------------------------------------
def test_partition_html_element_metadata_has_languages():
    """The first element of the example 10-K document is tagged with language "eng"."""
    partitioned = partition_html(example_doc_path("example-10k-1p.html"))

    first_element = partitioned[0]
    assert first_element.metadata.languages == ["eng"]
def test_partition_html_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, each element gets its own language list."""
    source_path = example_doc_path("language-docs/eng_spa_mult.html")
    expected_languages = [
        ["eng"],
        ["spa", "eng"],
        ["eng"],
        ["eng"],
        ["spa"],
    ]

    partitioned = partition_html(source_path, detect_language_per_element=True)

    detected_languages = [element.metadata.languages for element in partitioned]
    assert detected_languages == expected_languages
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_html_from_filename_pulls_last_modified_from_filesystem(
    get_last_modified_date_: Mock,
):
    """The value returned by the mocked last-modified lookup is recorded on the element."""
    filesystem_timestamp = "2023-07-05T09:24:28"
    get_last_modified_date_.return_value = filesystem_timestamp

    partitioned = partition_html(example_doc_path("fake-html.html"))

    first_element = partitioned[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == filesystem_timestamp
def test_partition_html_from_filename_prefers_metadata_last_modified(
    get_last_modified_date_: Mock,
):
    """An explicit `metadata_last_modified` wins over the mocked filesystem timestamp."""
    explicit_timestamp = "2023-07-05T09:24:28"
    get_last_modified_date_.return_value = "2024-06-04T09:24:28"
    source_path = example_doc_path("fake-html.html")

    partitioned = partition_html(source_path, metadata_last_modified=explicit_timestamp)

    assert isinstance(partitioned[0], Title)
    for element in partitioned:
        assert element.metadata.last_modified == explicit_timestamp
def test_partition_html_from_file_does_not_assign_last_modified_metadata_by_default(
    get_last_modified_date_from_file_: Mock,
):
    """Without `date_from_file_object`, a file-object source gets no `last_modified`."""
    get_last_modified_date_from_file_.return_value = "2029-07-05T09:24:28"
    html_path = example_doc_path("fake-html.html")

    with open(html_path, "rb") as html_file:
        partitioned = partition_html(file=html_file)

    first_element = partitioned[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified is None
def test_partition_html_from_file_pulls_last_modified_from_file_like_object_when_so_instructed(
    get_last_modified_date_from_file_: Mock,
):
    """With `date_from_file_object=True`, the mocked file timestamp is recorded."""
    file_timestamp = "2024-06-04T09:24:28"
    get_last_modified_date_from_file_.return_value = file_timestamp
    html_path = example_doc_path("fake-html.html")

    with open(html_path, "rb") as html_file:
        partitioned = partition_html(file=html_file, date_from_file_object=True)

    assert isinstance(partitioned[0], Title)
    for element in partitioned:
        assert element.metadata.last_modified == "2024-06-04T09:24:28"
def test_partition_html_from_file_assigns_no_last_modified_metadata_when_file_has_none():
    """`last_modified` is `None` when no modification date can be read from the file object.

    The HTML is copied into a `SpooledTemporaryFile`, which stands in for a file-like object
    from which a last-modified date cannot be obtained. Both files are managed by the `with`
    statement so the temporary file is closed even if partitioning raises.
    """
    with open(example_doc_path("fake-html.html"), "rb") as f, SpooledTemporaryFile() as sf:
        sf.write(f.read())
        # -- rewind so the partitioner reads from the start of the copied content --
        sf.seek(0)
        elements = partition_html(file=sf, date_from_file_object=True)

    assert all(e.metadata.last_modified is None for e in elements)
def test_partition_html_from_file_prefers_metadata_last_modified(
    get_last_modified_date_from_file_: Mock,
):
    """An explicit `metadata_last_modified` wins over the mocked file-object timestamp."""
    explicit_timestamp = "2023-07-05T09:24:28"
    get_last_modified_date_from_file_.return_value = "2024-06-04T09:24:28"
    html_path = example_doc_path("fake-html.html")

    with open(html_path, "rb") as html_file:
        partitioned = partition_html(file=html_file, metadata_last_modified=explicit_timestamp)

    assert isinstance(partitioned[0], Title)
    for element in partitioned:
        assert element.metadata.last_modified == explicit_timestamp
def test_partition_html_from_text_assigns_no_last_modified_metadata():
    """HTML passed via `text` produces elements whose `last_modified` is `None`."""
    html = "<html><div><p>TEST</p></div></html>"

    first_element = partition_html(text=html)[0]

    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified is None
def test_partition_html_from_text_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is applied to elements partitioned from text."""
    caller_last_modified = "2023-07-05T09:24:28"
    html = "<html><div><p>TEST</p></div></html>"

    first_element = partition_html(text=html, metadata_last_modified=caller_last_modified)[0]

    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == caller_last_modified
# -- .metadata.link* -----------------------------------------------------------------------------
def test_partition_html_grabs_links():
    """Link URLs and link texts are recorded in the metadata of the element containing them."""
    html = (
        "<html>\n"
        ' <p>Hello there I am a <a href="/link">very important link!</a></p>\n'
        " <p>Here is a list of my favorite things</p>\n"
        " <ul>\n"
        ' <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>\n'
        " <li>Dogs</li>\n"
        " </ul>\n"
        ' <a href="/loner">A lone link!</a>\n'
        "</html>\n"
    )
    # -- (expected element, expected link_urls, expected link_texts), in document order --
    expectations = [
        (NarrativeText("Hello there I am a very important link!"), ["/link"], ["very important link!"]),
        (NarrativeText("Here is a list of my favorite things"), None, None),
        (ListItem("Parrots"), ["https://en.wikipedia.org/wiki/Parrot"], ["Parrots"]),
        (ListItem("Dogs"), None, None),
        (Title("A lone link!"), ["/loner"], ["A lone link!"]),
    ]

    partitioned = partition_html(text=html)

    # -- index explicitly so a missing element fails the test rather than being skipped --
    for idx, (expected_element, expected_urls, expected_texts) in enumerate(expectations):
        element = partitioned[idx]
        assert element == expected_element
        assert element.metadata.link_urls == expected_urls
        assert element.metadata.link_texts == expected_texts
def test_partition_html_links():
    """Link texts, URLs and start indexes are captured for each of the first four elements."""
    html = (
        "<html>\n"
        ' <a href="/loner">A lone link!</a>\n'
        ' <p>Hello <a href="/link">link!</a></p>\n'
        ' <p>\n Hello <a href="/link">link!</a></p>\n'
        ' <p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>\n'
        "</html>\n"
    )
    # -- (expected link_texts, expected link_urls, expected link_start_indexes) per element --
    expectations = [
        (["A lone link!"], ["/loner"], [-1]),
        (["link!"], ["/link"], [6]),
        (["link!"], ["/link"], [6]),
        (["Parrots", "Dogs"], ["/wiki/parrots", "/wiki/dogs"], [0, 12]),
    ]

    partitioned = partition_html(text=html)

    # -- index explicitly so a missing element fails the test rather than being skipped --
    for idx, (expected_texts, expected_urls, expected_starts) in enumerate(expectations):
        metadata = partitioned[idx].metadata
        assert metadata.link_texts == expected_texts
        assert metadata.link_urls == expected_urls
        assert metadata.link_start_indexes == expected_starts
# -- .metadata.text_as_html ----------------------------------------------------------------------
@pytest.mark.parametrize(
    ("html_str", "expected_value"),
    [
        # -- header cells (<th>) are expected to come back as plain cells (<td>) --
        (
            "<table><tr><th>Header 1</th><th>Header 2</th></tr></table>",
            "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>",
        ),
        # -- quote characters in cell text are expected to come back as HTML entities --
        (
            (
                "<table>"
                "<tr><td>Dimensions</td><td>Weight</td></tr>"
                "<tr><td>4'-6\" x 1'</td><td>18 kg</td></tr>"
                "</table>"
            ),
            (
                "<table>"
                "<tr><td>Dimensions</td><td>Weight</td></tr>"
                "<tr><td>4&#x27;-6&quot; x 1&#x27;</td><td>18 kg</td></tr>"
                "</table>"
            ),
        ),
    ],
)
def test_partition_html_applies_text_as_html_metadata_for_tables(
    html_str: str, expected_value: str
):
    """A lone `<table>` yields exactly one element whose `text_as_html` is the expected HTML."""
    partitioned = partition_html(text=html_str)

    assert len(partitioned) == 1
    table_element = partitioned[0]
    assert table_element.metadata.text_as_html == expected_value
# -- .metadata.url -------------------------------------------------------------------------------
def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock):
    """Partitioning from a URL fetches it once and records the URL on every element."""
    url = "https://trusttheforceluke.com"
    # -- canned response served by the mocked `requests.get()` --
    fake_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "text/html"},
    )
    requests_get_.return_value = fake_response

    partitioned = partition_html(url=url)

    requests_get_.assert_called_once_with(url, headers={}, verify=True)
    assert len(partitioned) > 0
    for element in partitioned:
        assert element.metadata.url == url
# ================================================================================================
# SERIALIZATION BEHAVIORS
# ================================================================================================
def test_partition_html_round_trips_through_json():
    """Elements partitioned from the example document pass the JSON round-trip check."""
    source_path = example_doc_path("example-10k-1p.html")
    partitioned = partition_html(source_path)
    assert_round_trips_through_JSON(partitioned)
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
# -- expected element list for a German-language document; not referenced in this section of the
# -- file, presumably consumed by language-related tests elsewhere (verify before removing) --
EXPECTED_OUTPUT_LANGUAGE_DE = [
    Title("Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
]
@pytest.fixture
def get_last_modified_date_(request: pytest.FixtureRequest):
    """Provide the `function_mock()` stand-in for `get_last_modified_date` in the partitioner."""
    target = "unstructured.partition.html.partition.get_last_modified_date"
    return function_mock(request, target)
@pytest.fixture
def get_last_modified_date_from_file_(request: pytest.FixtureRequest):
    """Provide the `function_mock()` stand-in for `get_last_modified_date_from_file`."""
    target = "unstructured.partition.html.partition.get_last_modified_date_from_file"
    return function_mock(request, target)
class FakeResponse:
    """Minimal stand-in for the response object returned by the mocked `requests.get()`.

    Carries only the attributes assigned here: `text`, `status_code`, `ok` and `headers`.

    Args:
        text: Body of the fake response.
        status_code: HTTP status code of the fake response.
        headers: Response headers; an empty dict is used when omitted.
    """

    def __init__(self, text: str, status_code: int, headers: dict[str, str] | None = None):
        self.text = text
        self.status_code = status_code
        # -- any status code below 300 counts as a successful response --
        self.ok = status_code < 300
        # -- build a fresh dict per instance; a `{}` default would be one object shared by all
        # -- instances, so a mutation in one test could leak into another --
        self.headers = {} if headers is None else headers
@pytest.fixture
def opts_args() -> dict[str, Any]:
    """Keyword arguments for `HtmlPartitionerOptions`, all set to default values.

    A test overrides only the entries it cares about before constructing the options object,
    which keeps each test's setup short.
    """
    return dict(
        file=None,
        file_path=None,
        text=None,
        encoding=None,
        url=None,
        headers={},
        ssl_verify=True,
        date_from_file_object=False,
        metadata_last_modified=None,
        skip_headers_and_footers=False,
        detection_origin=None,
    )
@pytest.fixture
def requests_get_(request: pytest.FixtureRequest):
    """Provide the `function_mock()` stand-in for `requests.get` as used by the partitioner."""
    target = "unstructured.partition.html.partition.requests.get"
    return function_mock(request, target)
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
# These tests exercise the components used by `partition_html()` in isolation so that all edge
# cases can be covered.
# ================================================================================================
class DescribeHtmlPartitionerOptions:
    """Unit-test suite for `unstructured.partition.html.partition.HtmlPartitionerOptions`."""

    # -- .detection_origin -----------------------

    @pytest.mark.parametrize("detection_origin", ["html", None])
    def it_knows_the_caller_provided_detection_origin(
        self, detection_origin: str | None, opts_args: dict[str, Any]
    ):
        opts_args.update(detection_origin=detection_origin)

        options = HtmlPartitionerOptions(**opts_args)

        assert options.detection_origin == detection_origin

    # -- .encoding -------------------------------

    @pytest.mark.parametrize("encoding", ["utf-8", None])
    def it_knows_the_caller_provided_encoding(
        self, encoding: str | None, opts_args: dict[str, Any]
    ):
        opts_args.update(encoding=encoding)

        options = HtmlPartitionerOptions(**opts_args)

        assert options.encoding == encoding

    # -- .html_text ------------------------------

    def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]):
        path = example_doc_path("example-10k-1p.html")
        opts_args.update(file_path=path)

        loaded_html = HtmlPartitionerOptions(**opts_args).html_text

        assert isinstance(loaded_html, str)
        # -- second item of the `read_txt_file()` result is the text to compare against --
        assert loaded_html == read_txt_file(path)[1]

    def and_it_gets_the_HTML_from_the_file_like_object_when_one_is_provided(
        self, opts_args: dict[str, Any]
    ):
        path = example_doc_path("example-10k-1p.html")
        # -- hand over an in-memory copy so the options object works on a file-like object --
        with open(path, "rb") as f:
            in_memory_file = io.BytesIO(f.read())
        opts_args.update(file=in_memory_file)

        loaded_html = HtmlPartitionerOptions(**opts_args).html_text

        assert isinstance(loaded_html, str)
        assert loaded_html == read_txt_file(path)[1]

    def and_it_uses_the_HTML_in_the_text_argument_when_that_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(text="<html><body><p>Hello World!</p></body></html>")

        options = HtmlPartitionerOptions(**opts_args)

        assert options.html_text == "<html><body><p>Hello World!</p></body></html>"

    def and_it_gets_the_HTML_from_the_url_when_one_is_provided(
        self, requests_get_: Mock, opts_args: dict[str, Any]
    ):
        # -- canned response served by the mocked `requests.get()` --
        fake_response = FakeResponse(
            text="<html><body><p>I just flew over the internet!</p></body></html>",
            status_code=200,
            headers={"Content-Type": "text/html"},
        )
        requests_get_.return_value = fake_response
        opts_args.update(url="https://insta.tweet.face.org")

        options = HtmlPartitionerOptions(**opts_args)

        assert (
            options.html_text == "<html><body><p>I just flew over the internet!</p></body></html>"
        )

    def but_it_raises_when_no_path_or_file_or_text_or_url_was_provided(
        self, opts_args: dict[str, Any]
    ):
        options = HtmlPartitionerOptions(**opts_args)
        expected_message = "Exactly one of filename, file, text, or url must be"

        # -- the error surfaces on property access, not on construction --
        with pytest.raises(ValueError, match=expected_message):
            options.html_text

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(metadata_last_modified="2024-03-05T17:02:53")

        options = HtmlPartitionerOptions(**opts_args)

        assert options.last_modified == "2024-03-05T17:02:53"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
        opts_args.update(file_path="a/b/document.html")
        options = HtmlPartitionerOptions(**opts_args)

        observed = options.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/document.html")
        assert observed == "2024-04-02T20:32:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        in_memory_file = io.BytesIO(b"abcdefg")
        opts_args.update(file=in_memory_file, date_from_file_object=True)
        options = HtmlPartitionerOptions(**opts_args)

        observed = options.last_modified

        get_last_modified_date_from_file_.assert_called_once_with(in_memory_file)
        assert observed == "2024-04-02T20:42:07"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        in_memory_file = io.BytesIO(b"abcdefg")
        opts_args.update(file=in_memory_file, date_from_file_object=False)
        options = HtmlPartitionerOptions(**opts_args)

        observed = options.last_modified

        # -- the file must not even be consulted when the flag is off --
        get_last_modified_date_from_file_.assert_not_called()
        assert observed is None

    # -- .skip_headers_and_footers ---------------

    @pytest.mark.parametrize("skip_headers_and_footers", [True, False])
    def it_knows_the_caller_provided_skip_headers_and_footers_setting(
        self, skip_headers_and_footers: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(skip_headers_and_footers=skip_headers_and_footers)

        options = HtmlPartitionerOptions(**opts_args)

        assert options.skip_headers_and_footers is skip_headers_and_footers

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
        target = "unstructured.partition.html.partition.get_last_modified_date"
        return function_mock(request, target)

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest):
        target = "unstructured.partition.html.partition.get_last_modified_date_from_file"
        return function_mock(request, target)
class Describe_HtmlPartitioner:
"""Unit-test suite for `unstructured.partition.html.partition._HtmlPartitioner`."""
# -- ._main ----------------------------------
def it_can_find_the_main_element_in_the_document(self, opts_args: dict[str, Any]):
    """`._main` is the `<main>` element when the document contains one."""
    html = (
        "<body>\n"
        " <header></header>\n"
        " <p>Lots preamble stuff yada yada yada</p>\n"
        " <main>\n"
        " <h2>A Wonderful Section!</h2>\n"
        " <p>Look at this amazing section!</p>\n"
        " </main>\n"
        "</body>\n"
    )
    opts_args.update(text=html)

    main_element = _HtmlPartitioner(HtmlPartitionerOptions(**opts_args))._main

    assert main_element.tag == "main"
def and_it_falls_back_to_the_body_when_there_is_no_main(self, opts_args: dict[str, Any]):
    """And there is always a <body>, the parser adds one if there's not one in the HTML."""
    html = (
        "<body>\n"
        " <header></header>\n"
        " <p>Lots preamble stuff yada yada yada</p>\n"
        " <h2>A Wonderful Section!</h2>\n"
        " <p>Look at this amazing section!</p>\n"
        "</body>\n"
    )
    opts_args.update(text=html)

    main_element = _HtmlPartitioner(HtmlPartitionerOptions(**opts_args))._main

    assert main_element.tag == "body"
# -- ElementCls selection behaviors -----------------
def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title(
    self, opts_args: dict[str, Any]
):
    """Paragraph text of no particular type comes out as a single plain `Text` element."""
    opts_args.update(text="<p>NO PARTICULAR TYPE.</p>")
    options = HtmlPartitionerOptions(**opts_args)

    # -- unpacking fails unless exactly one element was produced --
    [only_element] = list(_HtmlPartitioner.iter_elements(options))

    assert only_element == Text("NO PARTICULAR TYPE.")
def it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_character(
    self, opts_args: dict[str, Any]
):
    """Bullet-prefixed paragraph text comes out as one `ListItem` without the bullet."""
    opts_args.update(text="<p>● An excellent point!</p>")
    options = HtmlPartitionerOptions(**opts_args)

    # -- unpacking fails unless exactly one element was produced --
    [only_element] = list(_HtmlPartitioner.iter_elements(options))

    assert only_element == ListItem("An excellent point!")
def but_not_when_the_tag_contains_only_a_bullet_character_and_no_text(
    self, opts_args: dict[str, Any]
):
    """A paragraph holding nothing but a bullet character produces no elements."""
    opts_args.update(text="<p>●</p>")
    options = HtmlPartitionerOptions(**opts_args)

    produced = list(_HtmlPartitioner.iter_elements(options))

    assert produced == []
def it_produces_no_element_when_the_tag_has_no_content(self, opts_args: dict[str, Any]):
    """An empty paragraph produces no elements."""
    opts_args.update(text="<p></p>")
    options = HtmlPartitionerOptions(**opts_args)

    produced = list(_HtmlPartitioner.iter_elements(options))

    assert produced == []
def and_it_produces_no_element_when_the_tag_contains_only_a_stub(
    self, opts_args: dict[str, Any]
):
    """A paragraph holding only the stub text `$` produces no elements."""
    opts_args.update(text="<p>$</p>")
    options = HtmlPartitionerOptions(**opts_args)

    produced = list(_HtmlPartitioner.iter_elements(options))

    assert produced == []