# pyright: reportPrivateUsage=false """Test suite for `unstructured.partition.html.partition` module.""" from __future__ import annotations import io import pathlib from typing import Any import pytest from lxml import etree from test_unstructured.unit_utils import ( FixtureRequest, Mock, assert_round_trips_through_JSON, example_doc_path, example_doc_text, function_mock, ) from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ( Address, CompositeElement, ListItem, NarrativeText, Table, TableChunk, Text, Title, ) from unstructured.file_utils.encoding import read_txt_file from unstructured.partition.html import partition_html from unstructured.partition.html.partition import HtmlPartitionerOptions, _HtmlPartitioner # ================================================================================================ # SOURCE HTML LOADING BEHAVIORS # ================================================================================================ # -- document-source (filename, file, text, url) ------------------------------------------------- def test_partition_html_accepts_a_file_path(tmp_path: pathlib.Path): file_path = str(tmp_path / "sample-doc.html") with open(file_path, "w") as f: f.write( "\n" " \n" "

A Great and Glorious Section

\n" "

Dear Leader is the best. He is such a wonderful engineer!

\n" "

Another Magnificent paragraph

\n" "

The prior element is a title based on its capitalization patterns!

\n" " \n" " \n" " \n" " \n" " \n" " \n" "

I'm in a table

\n" "

A New Beginning

\n" "

Here is the start of a new page.

\n" " \n" "\n" ) elements = partition_html(file_path) assert len(elements) == 7 assert elements == [ Title("A Great and Glorious Section"), NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"), Title("Another Magnificent paragraph"), NarrativeText("The prior element is a title based on its capitalization patterns!"), Table("I'm in a table"), Title("A New Beginning"), NarrativeText("Here is the start of a new page."), ] assert all(e.metadata.filename == "sample-doc.html" for e in elements) def test_user_without_file_write_permission_can_partition_html(tmp_path: pathlib.Path): read_only_file_path = tmp_path / "example-10k-readonly.html" read_only_file_path.write_text(example_doc_text("example-10k-1p.html")) read_only_file_path.chmod(0o444) elements = partition_html(filename=str(read_only_file_path.resolve())) assert len(elements) > 0 def test_partition_html_accepts_a_file_like_object(): with open(example_doc_path("example-10k-1p.html"), "rb") as f: elements = partition_html(file=f) assert len(elements) > 0 assert all(e.metadata.filename is None for e in elements) def test_partition_html_accepts_an_html_str(): elements = partition_html(text=example_doc_text("example-10k-1p.html")) assert len(elements) > 0 def test_partition_html_accepts_a_url_to_an_HTML_document(requests_get_: Mock): requests_get_.return_value = FakeResponse( text=example_doc_text("example-10k-1p.html"), status_code=200, headers={"Content-Type": "text/html"}, ) elements = partition_html(url="https://fake.url") requests_get_.assert_called_once_with("https://fake.url", headers={}, verify=True) assert len(elements) > 0 def test_partition_html_raises_when_no_path_or_file_or_text_or_url_is_specified(): with pytest.raises(ValueError, match="Exactly one of filename, file, text, or url must be sp"): partition_html() # -- encoding for filename, file, and text ------------------------------------------------------- @pytest.mark.parametrize( "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"] ) def test_partition_html_from_filename_raises_when_explicit_encoding_is_wrong(filename: str): with pytest.raises(UnicodeDecodeError): with open(example_doc_path(filename), "rb") as f: partition_html(file=f, encoding="utf-8") @pytest.mark.parametrize( "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"], ) def test_partition_html_from_filename_default_encoding(filename: str): elements = partition_html(example_doc_path(filename)) assert len(elements) > 0 assert all(e.metadata.filename == filename for e in elements) if filename == "fake-html-lang-de.html": assert elements == EXPECTED_OUTPUT_LANGUAGE_DE @pytest.mark.parametrize( "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"] ) def test_partition_html_from_file_raises_encoding_error(filename: str): with open(example_doc_path(filename), "rb") as f: file = io.BytesIO(f.read()) with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte 0xff in posi"): partition_html(file=file, encoding="utf-8") @pytest.mark.parametrize( "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"], ) def test_partition_html_from_file_default_encoding(filename: str): with open(example_doc_path(filename), "rb") as f: elements = partition_html(file=f) assert len(elements) > 0 if filename == "fake-html-lang-de.html": assert elements == EXPECTED_OUTPUT_LANGUAGE_DE @pytest.mark.parametrize( "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"] ) def test_partition_html_from_file_rb_raises_encoding_error(filename: str): with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte 0xff in posi"): with open(example_doc_path(filename), "rb") as f: partition_html(file=f, encoding="utf-8") @pytest.mark.parametrize( "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"], ) def test_partition_html_from_file_rb_default_encoding(filename: str): with open(example_doc_path(filename), "rb") as f: elements = partition_html(file=f) assert len(elements) > 0 if filename == "fake-html-lang-de.html": assert elements == EXPECTED_OUTPUT_LANGUAGE_DE def test_partition_html_processes_chinese_chracters(): html_text = "

每日新闻

" elements = partition_html(text=html_text) assert elements[0].text == "每日新闻" def test_emoji_appears_with_emoji_utf8_code(): assert partition_html(text='

Hello 😀

') == [ Title("Hello 😀") ] # -- partition_html() from URL ------------------------------------------------------------------- def test_partition_html_from_url_raises_on_failure_response_status_code(requests_get_: Mock): requests_get_.return_value = FakeResponse( text=example_doc_text("example-10k-1p.html"), status_code=500, headers={"Content-Type": "text/html"}, ) with pytest.raises(ValueError, match="Error status code on GET of provided URL: 500"): partition_html(url="https://fake.url") def test_partition_html_from_url_raises_on_response_of_wrong_content_type(requests_get_: Mock): requests_get_.return_value = FakeResponse( text=example_doc_text("example-10k-1p.html"), status_code=200, headers={"Content-Type": "application/json"}, ) with pytest.raises(ValueError, match="Expected content type text/html. Got application/json."): partition_html(url="https://fake.url") def test_partition_from_url_includes_provided_headers_in_request(requests_get_: Mock): requests_get_.return_value = FakeResponse( text="

What do I know? Who needs to know it?

", status_code=200, headers={"Content-Type": "text/html"}, ) partition_html(url="https://example.com", headers={"User-Agent": "test"}) requests_get_.assert_called_once_with( "https://example.com", headers={"User-Agent": "test"}, verify=True ) # ================================================================================================ # PARSING TESTS # ================================================================================================ def test_partition_html_on_ideas_page(): elements = partition_html(example_doc_path("ideas-page.html")) assert len(elements) == 1 e = elements[0] assert e == Table( "January 2023 ( Someone fed my essays into GPT to make something that could answer" "\nquestions based on them, then asked it where good ideas come from. The" "\nanswer was ok, but not what I would have said. This is what I would have said.)" " The way to get new ideas is to notice anomalies: what seems strange," "\nor missing, or broken? You can see anomalies in everyday life (much" "\nof standup comedy is based on this), but the best place to look for" "\nthem is at the frontiers of knowledge. Knowledge grows fractally." "\nFrom a distance its edges look smooth, but when you learn enough" "\nto get close to one, you'll notice it's full of gaps. These gaps" "\nwill seem obvious; it will seem inexplicable that no one has tried" "\nx or wondered about y. In the best case, exploring such gaps yields" "\nwhole new fractal buds.", ) assert e.metadata.emphasized_text_contents is None assert e.metadata.link_urls is None assert e.metadata.text_as_html is not None # -- element-suppression behaviors --------------------------------------------------------------- def test_it_does_not_extract_text_in_script_tags(): elements = partition_html(example_doc_path("example-with-scripts.html")) assert all("function (" not in e.text for e in elements) def test_it_does_not_extract_text_in_style_tags(): html_text = ( "\n" "\n" "

Lorem ipsum dolor

\n" "\n" "" ) (element,) = partition_html(text=html_text) assert isinstance(element, Text) assert element.text == "Lorem ipsum dolor" # -- table parsing behaviors --------------------------------------------------------------------- def test_it_can_parse_a_bare_bones_table_to_a_Table_element(): """Bare-bones means no ``, ``, or `` elements.""" html_text = ( "\n" "\n" " \n" " \n" " \n" "

Lorem	Ipsum
Ut enim non	ad minim\nveniam quis

\n" "\n" "" ) (element,) = partition_html(text=html_text) assert isinstance(element, Table) # -- table text is joined into a single string; no row or cell boundaries are represented -- assert element.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis" # -- An HTML representation is also available that is longer but represents table structure. assert element.metadata.text_as_html == ( "" "" "" "

Lorem	Ipsum
Ut enim non	ad minim veniam quis

" ) def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements(): """Cells within a `table/thead` element are included in the text and html. The presence of a `` element in the original also determines whether a `` element appears in `.text_as_html` or whether the first row of cells is simply in the body. """ html_text = ( "\n" "\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "

Lorem	Ipsum
Lorem ipsum	dolor sit amet nulla
Ut enim non	ad minim\nveniam quis
Dolor	Equis

\n" "\n" "" ) (element,) = partition_html(text=html_text) assert isinstance(element, Table) assert element.metadata.text_as_html == ( "" "" "" "" "" "

Lorem	Ipsum
Lorem ipsum	dolor sit amet nulla
Ut enim non	ad minim veniam quis
Dolor	Equis

" ) def test_it_does_not_emit_a_Table_element_for_a_table_with_no_text(): html_text = ( "\n" "\n" " \n" " \n" " \n" "

\n" "\n" "" ) assert partition_html(text=html_text) == [] def test_it_provides_parseable_HTML_in_text_as_html(): html_text = ( "\n" "\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "

Lorem	Ipsum
Lorem ipsum	dolor sit amet nulla
Ut enim non	ad minim\nveniam quis
Dolor	Equis

\n" "\n" "" ) (element,) = partition_html(text=html_text) text_as_html = element.metadata.text_as_html assert text_as_html is not None html = etree.fromstring(text_as_html, etree.HTMLParser()) assert html is not None # -- lxml adds the container, that's not present in `.text_as_html` -- assert etree.tostring(html, encoding=str) == ( "" "" "" "" "" "" "

Lorem	Ipsum
Lorem ipsum	dolor sit amet nulla
Ut enim non	ad minim veniam quis
Dolor	Equis

" "" ) @pytest.mark.parametrize( ("tag", "expected_text_as_html"), [ ("thead", "

Header 1

Header 2

"), ("tfoot", "

Header 1

Header 2

"), ], ) def test_partition_html_parses_table_without_tbody(tag: str, expected_text_as_html: str): elements = partition_html( text=( f"\n" f" <{tag}>\n" f" \n" f" \n" f"

Header 1	Header 2

" ) ) assert elements[0].metadata.text_as_html == expected_text_as_html def test_partition_html_reduces_a_nested_table_to_its_text_placed_in_the_cell_that_contains_it(): html_text = ( "\n" " \n" " \n" " \n" " \n" "

\n" " \n" " \n" " \n" "

foo	bar
baz	bng

\n" "

\n" " \n" " \n" "

fizz

bang

\n" "

" ) (element,) = partition_html(text=html_text) assert element == Table("foo bar baz bng fizz bang") assert element.metadata.text_as_html == ( "

foo bar baz bng

fizz bang

" ) def test_partition_html_accommodates_tds_with_child_elements(): """Like this example from an SEC 10k filing.""" html_text = ( "\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "


\n" " \n" " \n" ' \n' " ☒\n" " \n" " \n" " \n" "	\n" " \n" " ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE" " ACT OF 1934\n" " \n" "

\n" ) (element,) = partition_html(text=html_text) assert element == Table( "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934" ) assert element.metadata.text_as_html == ( "" "" "" "


☒	ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES" " EXCHANGE ACT OF 1934

" ) # -- other element-specific behaviors ------------------------------------------------------------ def test_partition_html_recognizes_h1_to_h6_as_Title_with_category_depth(): html_text = ( "

This is narrative text, it's long, flows and has meaning

\n" "

This heading is a title, even though it's long, flows and has meaning

\n" "

A heading that is at the second level

\n" "

Finally, the third heading

\n" "

December 1-17, 2017

\n" "

email@example.com

\n" "

* bullet point

\n" "

- invalidly nested list item

\n" ) elements = partition_html(text=html_text) assert len(elements) == 8 e = elements[0] assert e == NarrativeText("This is narrative text, it's long, flows and has meaning") assert e.metadata.category_depth is None e = elements[1] assert e == Title("This heading is a title, even though it's long, flows and has meaning") assert e.metadata.category_depth == 0 e = elements[2] assert e == Title("A heading that is at the second level") assert e.metadata.category_depth == 1 e = elements[3] assert e == Title("Finally, the third heading") assert e.metadata.category_depth == 2 e = elements[4] assert e == Title("December 1-17, 2017") assert e.metadata.category_depth == 3 e = elements[5] assert e == Title("email@example.com") assert e.metadata.category_depth == 4 e = elements[6] assert e == Title("* bullet point") assert e.metadata.category_depth == 5 e = elements[7] assert e == ListItem("- invalidly nested list item") assert e.metadata.category_depth == 0 def test_partition_html_with_widely_encompassing_pre_tag(): elements = partition_html(example_doc_path("fake-html-pre.htm")) print(f"{len(elements)=}") assert len(elements) > 0 assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]") assert isinstance(elements[0], NarrativeText) assert elements[0].metadata.filetype == "text/html" assert elements[0].metadata.filename == "fake-html-pre.htm" def test_pre_tag_parsing_respects_order(): assert partition_html( text=( "

The Big Brown Bear

\n" "

The big brown bear is growling.

\n" "

The big brown bear is sleeping.

\n" "

The Big Blue Bear

\n" ) ) == [ Title("The Big Brown Bear"), NarrativeText("The big brown bear is growling."), NarrativeText("The big brown bear is sleeping."), Title("The Big Blue Bear"), ] def test_partition_html_br_tag_parsing(): html_text = ( "\n" "\n" "\n" "

\n" "

Header 1

\n" "

Text

\n" "

Header 2

\n" "

\n"
        "    Param1 = Y
Param2 = 1
Param3 = 2
Param4 = A\n"
        "    
Param5 = A,B,C,D,E
Param6 = 7
Param7 = Five
\n"
        "

\n" "

\n" "\n" "\n" ) elements = partition_html(text=html_text) assert elements == [ Title("Header 1"), Title("Text"), Title("Header 2"), Text( " Param1 = Y\nParam2 = 1\nParam3 = 2\nParam4 = A\n \nParam5 = A,B,C,D,E\n" "Param6 = 7\nParam7 = Five\n\n " ), ] e = elements[3] assert e.metadata.emphasized_text_contents == [ "Param1", "Param2", "Param3", "Param4", "Param5", "Param6", "Param7", ] assert e.metadata.emphasized_text_tags == ["b", "b", "b", "b", "b", "b", "b"] def test_partition_html_tag_tail_parsing(): html_text = ( "\n" "\n" "

\n" " Head\n" "

Nested

\n" " Tail\n" "

\n" "\n" "\n" ) elements = partition_html(text=html_text) assert elements == [Title("Head"), Title("Nested"), Title("Tail")] # -- parsing edge cases -------------------------------------------------------------------------- def test_partition_html_from_text_works_with_empty_string(): assert partition_html(text="") == [] def test_partition_html_accommodates_block_item_nested_inside_phrasing_element(): html_text = """

We start out normally and then add a citation

But whoa, this is a paragraph inside a phrasing element.

so we close the first element at the start of the block element and emit it, then we emit the block element, and then start a new element for the tail and whatever phrasing follows it.

""" elements = partition_html(text=html_text) assert elements == [ NarrativeText("We start out normally and then add a citation"), NarrativeText("But whoa, this is a paragraph inside a phrasing element."), NarrativeText( "so we close the first element at the start of the block element and emit it," " then we emit the block element," " and then start a new element for the tail and whatever phrasing follows it." ), ] assert elements[2].metadata.emphasized_text_contents == ["emit"] assert elements[2].metadata.emphasized_text_tags == ["b"] def test_partition_html_handles_anchor_with_nested_block_item(): html_text = """

O Deep Thought computer, he said,

The task we have designed you to perform is this.

We want you to tell us.... he paused,

""" elements = partition_html(text=html_text) assert [e.text for e in elements] == [ "O Deep Thought computer, he said,", "The task we have designed you to perform is this.", "We want you to tell us.... he paused,", ] link_annotated_element = elements[0] assert link_annotated_element.metadata.link_texts == ["computer, he said,"] assert link_annotated_element.metadata.link_urls == ["http://eie.io"] assert all(e.metadata.link_texts is None for e in elements[1:]) assert all(e.metadata.link_urls is None for e in elements[1:]) def test_containers_with_text_are_processed(): html_text = ( '

Hi All,\n' "

\n" "

Get excited for our first annual family day!

\n" '

Best.
\n' "

\n" " --
\n" '

\n' '

Dino the Datasaur

\n' " Unstructured Technologies
\n" "

Data Scientist

\n" "

Doylestown, PA 18901

\n" "

\n" " See you there!\n" "

\n" ) elements = partition_html(text=html_text) assert elements == [ Text("Hi All,"), NarrativeText("Get excited for our first annual family day!"), Title("Best."), Text("--"), Title("Dino the Datasaur"), Title("Unstructured Technologies"), Title("Data Scientist"), Address("Doylestown, PA 18901"), NarrativeText("See you there!"), ] def test_html_grabs_bulleted_text_in_tags(): html_text = ( "\n" " \n" "

Happy Groundhog's day!
Looks like six more weeks of winter ...

\n" " \n" "\n" ) elements = partition_html(text=html_text) assert elements == [ ListItem("Happy Groundhog's day!"), ListItem("Looks like six more weeks of winter ..."), ] def test_html_grabs_bulleted_text_in_paras(): html_text = ( "\n" " \n" "

\n" " • Happy Groundhog's day!\n" "

\n" "

\n" " • Looks like six more weeks of winter ...\n" "

\n" " \n" "\n" ) elements = partition_html(text=html_text) # -- bullet characters are removed -- assert elements == [ ListItem("Happy Groundhog's day!"), ListItem("Looks like six more weeks of winter ..."), ] def test_joins_tag_text_correctly(): elements = partition_html(text="

Hello again peet magical

") assert elements == [Title("Hello again peet magical")] def test_sample_doc_with_emoji(): elements = partition_html(text='\n

Hello again 😀

\n') assert elements == [NarrativeText("Hello again 😀")] def test_only_text_and_no_elements_in_body(): elements = partition_html(text="Hello") assert elements == [Title("Hello")] def test_text_before_elements_in_body(): elements = partition_html(text="Hello

World

") assert elements == [Title("Hello"), Title("World")] def test_line_break_in_container(): elements = partition_html(text="

Hello
World

") assert elements == [Title("Hello World")] @pytest.mark.parametrize("tag", ["del", "form", "noscript"]) def test_exclude_tag_types(tag: str): html_text = f"\n <{tag}>\n There is some text here.\n \n\n" elements = partition_html(text=html_text) assert elements == [] # ================================================================================================ # OTHER ARGS # ================================================================================================ # -- `chunking_strategy` arg --------------------------------------------------------------------- def test_partition_html_can_chunk_while_partitioning(): file_path = example_doc_path("example-10k-1p.html") chunks = partition_html(file_path, chunking_strategy="by_title") chunks_2 = chunk_by_title(partition_html(file_path)) assert all(isinstance(c, (CompositeElement, Table, TableChunk)) for c in chunks) assert chunks == chunks_2 # -- `skip_headers_and_footers` arg -------------------------------------------------------------- def test_partition_html_can_skip_headers_and_footers(): assert partition_html( text=( "\n" "

\n" "

Header

\n" "

\n" " \n" "

My First Heading

\n" "

It was a dark and stormy night. No one was around.

\n" " \n" " \n" "\n" ), skip_headers_and_footers=True, ) == [ Title("My First Heading"), NarrativeText("It was a dark and stormy night. No one was around."), ] # -- `unique_element_ids` arg -------------------------------------------------------------------- def test_all_element_ids_are_unique(): ids = [e.id for e in partition_html(example_doc_path("fake-html-with-duplicate-elements.html"))] assert len(ids) == len(set(ids)) def test_element_ids_are_deterministic(): ids = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")] ids_2 = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")] assert ids == ids_2 # ================================================================================================ # METADATA BEHAVIORS # ================================================================================================ # -- .metadata.category_depth + parent_id -------------------------------------------------------- def test_partition_html_records_hierarchy_metadata(): elements = partition_html( text=( "\n" "

Preamble gets no category_depth or parent_id

\n" "

Heading gets category_depth but no parent_id

\n" "

Body paragraph gets parent_id but no category_depth

\n" "

List item gets category_depth and parent_id
Second list item gets category_depth and parent_id

\n" "

Body paragraph after list gets parent_id but no category_depth

\n" "\n" ) ) assert len(elements) == 6 e = elements[0] assert isinstance(e, NarrativeText) assert e.text == "Preamble gets no category_depth or parent_id" assert e.metadata.category_depth is None assert e.metadata.parent_id is None e = elements[1] assert isinstance(e, Title) assert e.text == "Heading gets category_depth but no parent_id" assert e.metadata.category_depth == 0 assert e.metadata.parent_id is None e = elements[2] assert isinstance(e, NarrativeText) assert e.text == "Body paragraph gets parent_id but no category_depth" assert e.metadata.category_depth is None assert e.metadata.parent_id == elements[1].id e = elements[3] assert isinstance(e, ListItem) assert e.text == "List item gets category_depth and parent_id" assert e.metadata.category_depth == 1 assert e.metadata.parent_id == elements[1].id e = elements[4] assert isinstance(e, ListItem) assert e.text == "Second list item gets category_depth and parent_id" assert e.metadata.category_depth == 1 assert e.metadata.parent_id == elements[1].id e = elements[5] assert isinstance(e, NarrativeText) assert e.text == "Body paragraph after list gets parent_id but no category_depth" assert e.metadata.category_depth is None assert e.metadata.parent_id == elements[1].id # -- .metadata.emphasis -------------------------------------------------------------------------- def test_partition_html_grabs_emphasized_texts(): elements = partition_html( text=( "\n" "

Hello there I am a very important text!

\n" "

Here is a list of my favorite things

\n" "

Parrots
Dogs

\n" " A lone span text!\n" "\n" ) ) e = elements[0] assert e == NarrativeText("Hello there I am a very important text!") assert e.metadata.emphasized_text_contents == ["important"] assert e.metadata.emphasized_text_tags == ["b"] e = elements[1] assert e == NarrativeText("Here is a list of my favorite things") assert e.metadata.emphasized_text_contents == ["my", "favorite", "things"] assert e.metadata.emphasized_text_tags == ["b", "bi", "b"] e = elements[2] assert e == ListItem("Parrots") assert e.metadata.emphasized_text_contents == ["Parrots"] assert e.metadata.emphasized_text_tags == ["i"] e = elements[3] assert e == ListItem("Dogs") assert e.metadata.emphasized_text_contents is None assert e.metadata.emphasized_text_tags is None e = elements[4] assert e == Title("A lone span text!") assert e.metadata.emphasized_text_contents is None assert e.metadata.emphasized_text_tags is None # -- .metadata.filename -------------------------------------------------------------------------- def test_partition_html_from_filename_uses_source_filename_for_metadata_by_default(): elements = partition_html(example_doc_path("example-10k-1p.html")) assert len(elements) > 0 assert all(e.metadata.filename == "example-10k-1p.html" for e in elements) assert all(e.metadata.file_directory == example_doc_path("") for e in elements) def test_partition_html_from_filename_prefers_metadata_filename(): elements = partition_html(example_doc_path("example-10k-1p.html"), metadata_filename="test") assert len(elements) > 0 assert all(element.metadata.filename == "test" for element in elements) def test_partition_html_from_file_prefers_metadata_filename(): with open(example_doc_path("example-10k-1p.html"), "rb") as f: elements = partition_html(file=f, metadata_filename="test") assert len(elements) > 0 assert all(e.metadata.filename == "test" for e in elements) # -- .metadata.languages ------------------------------------------------------------------------- def test_partition_html_element_metadata_has_languages(): elements = partition_html(example_doc_path("example-10k-1p.html")) assert elements[0].metadata.languages == ["eng"] def test_partition_html_respects_detect_language_per_element(): elements = partition_html( example_doc_path("language-docs/eng_spa_mult.html"), detect_language_per_element=True ) assert [e.metadata.languages for e in elements] == [ ["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"], ] # -- .metadata.last_modified --------------------------------------------------------------------- def test_partition_html_from_filename_pulls_last_modified_from_filesystem(request: FixtureRequest): get_last_modified_date_ = function_mock( request, "unstructured.partition.html.partition.get_last_modified_date", return_value="2024-06-17T22:22:20", ) file_path = example_doc_path("fake-html.html") elements = partition_html(file_path) get_last_modified_date_.assert_called_once_with(file_path) assert elements assert all(e.metadata.last_modified == "2024-06-17T22:22:20" for e in elements) def test_partition_html_from_filename_prefers_metadata_last_modified(): elements = partition_html( example_doc_path("fake-html.html"), metadata_last_modified="2023-07-05T09:24:28" ) assert isinstance(elements[0], Title) assert all(e.metadata.last_modified == "2023-07-05T09:24:28" for e in elements) # -- .metadata.link_texts and .link_urls --------------------------------------------------------- def test_partition_html_grabs_links(): html_text = ( "\n" '

Hello there I am a very important link!

\n' "

Here is a list of my favorite things

\n" "

Parrots
Dogs

\n" ' A lone link!\n' "\n" ) elements = partition_html(text=html_text) e = elements[0] assert e == NarrativeText("Hello there I am a very important link!") assert e.metadata.link_urls == ["/link"] assert e.metadata.link_texts == ["very important link!"] e = elements[1] assert e == NarrativeText("Here is a list of my favorite things") assert e.metadata.link_urls is None assert e.metadata.link_texts is None e = elements[2] assert e == ListItem("Parrots") assert e.metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"] assert e.metadata.link_texts == ["Parrots"] e = elements[3] assert e == ListItem("Dogs") assert e.metadata.link_urls is None assert e.metadata.link_texts is None e = elements[4] assert e == Title("A lone link!") assert e.metadata.link_urls == ["/loner"] assert e.metadata.link_texts == ["A lone link!"] def test_partition_html_links(): html_text = ( "\n" ' A lone link!\n' '

Hello link!

\n' '

\n Hello link!

\n' '

Parrots and Dogs

\n' "\n" ) elements = partition_html(text=html_text) e = elements[0] assert e.metadata.link_texts == ["A lone link!"] assert e.metadata.link_urls == ["/loner"] e = elements[1] assert e.metadata.link_texts == ["link!"] assert e.metadata.link_urls == ["/link"] e = elements[2] assert e.metadata.link_texts == ["link!"] assert e.metadata.link_urls == ["/link"] e = elements[3] assert e.metadata.link_texts == ["Parrots", "Dogs"] assert e.metadata.link_urls == ["/wiki/parrots", "/wiki/dogs"] # -- .metadata.text_as_html ---------------------------------------------------------------------- @pytest.mark.parametrize( ("html_text", "expected_value"), [ ( "

Header 1	Header 2

", "

Header 1

Header 2

", ), ( "" "" "" "

Dimensions	Weight
4'-6\" x 1'	18 kg

", # ---------- "" "" "" "

Dimensions	Weight
4'-6" x 1'	18 kg

", ), ], ) def test_partition_html_applies_text_as_html_metadata_for_tables( html_text: str, expected_value: str ): elements = partition_html(text=html_text) assert len(elements) == 1 assert elements[0].metadata.text_as_html == expected_value # -- .metadata.url ------------------------------------------------------------------------------- def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock): requests_get_.return_value = FakeResponse( text=example_doc_text("example-10k-1p.html"), status_code=200, headers={"Content-Type": "text/html"}, ) elements = partition_html(url="https://trusttheforceluke.com") requests_get_.assert_called_once_with("https://trusttheforceluke.com", headers={}, verify=True) assert len(elements) > 0 assert all(e.metadata.url == "https://trusttheforceluke.com" for e in elements) # ================================================================================================ # SERIALIZATION BEHAVIORS # ================================================================================================ def test_partition_html_round_trips_through_json(): elements = partition_html(example_doc_path("example-10k-1p.html")) assert_round_trips_through_JSON(elements) # ================================================================================================ # MODULE-LEVEL FIXTURES # ================================================================================================ EXPECTED_OUTPUT_LANGUAGE_DE = [ Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"), ] class FakeResponse: def __init__(self, text: str, status_code: int, headers: dict[str, str] = {}): self.text = text self.status_code = status_code self.ok = status_code < 300 self.headers = headers @pytest.fixture def opts_args() -> dict[str, Any]: """All default arguments for `HtmlPartitionerOptions`. Individual argument values can be changed to suit each test. Makes construction of opts more compact for testing purposes. """ return { "file": None, "file_path": None, "text": None, "encoding": None, "url": None, "headers": {}, "ssl_verify": True, "skip_headers_and_footers": False, "detection_origin": None, } @pytest.fixture def requests_get_(request: pytest.FixtureRequest): return function_mock(request, "unstructured.partition.html.partition.requests.get") # ================================================================================================ # ISOLATED UNIT TESTS # ================================================================================================ # These test components used by `partition_html()` in isolation such that all edge cases can be # exercised. # ================================================================================================ class DescribeHtmlPartitionerOptions: """Unit-test suite for `unstructured.partition.html.partition.HtmlPartitionerOptions`.""" # -- .detection_origin ----------------------- @pytest.mark.parametrize("detection_origin", ["html", None]) def it_knows_the_caller_provided_detection_origin( self, detection_origin: str | None, opts_args: dict[str, Any] ): opts_args["detection_origin"] = detection_origin opts = HtmlPartitionerOptions(**opts_args) assert opts.detection_origin == detection_origin # -- .html_text ------------------------------ def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]): file_path = example_doc_path("example-10k-1p.html") opts_args["file_path"] = file_path opts = HtmlPartitionerOptions(**opts_args) html_text = opts.html_text assert isinstance(html_text, str) assert html_text == read_txt_file(file_path)[1] def and_it_gets_the_HTML_from_the_file_like_object_when_one_is_provided( self, opts_args: dict[str, Any] ): file_path = example_doc_path("example-10k-1p.html") with open(file_path, "rb") as f: file = io.BytesIO(f.read()) opts_args["file"] = file opts = HtmlPartitionerOptions(**opts_args) html_text = opts.html_text assert isinstance(html_text, str) assert html_text == read_txt_file(file_path)[1] def and_it_uses_the_HTML_in_the_text_argument_when_that_is_provided( self, opts_args: dict[str, Any] ): opts_args["text"] = "

Hello World!

" opts = HtmlPartitionerOptions(**opts_args) assert opts.html_text == "

Hello World!

" def and_it_gets_the_HTML_from_the_url_when_one_is_provided( self, requests_get_: Mock, opts_args: dict[str, Any] ): requests_get_.return_value = FakeResponse( text="

I just flew over the internet!

", status_code=200, headers={"Content-Type": "text/html"}, ) opts_args["url"] = "https://insta.tweet.face.org" opts = HtmlPartitionerOptions(**opts_args) assert opts.html_text == "

I just flew over the internet!

" def but_it_raises_when_no_path_or_file_or_text_or_url_was_provided( self, opts_args: dict[str, Any] ): opts = HtmlPartitionerOptions(**opts_args) with pytest.raises(ValueError, match="Exactly one of filename, file, text, or url must be"): opts.html_text # -- .last_modified -------------------------- def it_gets_last_modified_from_the_filesystem_when_file_path_is_provided( self, opts_args: dict[str, Any], get_last_modified_date_: Mock ): opts_args["file_path"] = "a/b/document.html" get_last_modified_date_.return_value = "2024-04-02T20:32:35" opts = HtmlPartitionerOptions(**opts_args) last_modified = opts.last_modified get_last_modified_date_.assert_called_once_with("a/b/document.html") assert last_modified == "2024-04-02T20:32:35" def but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided( self, opts_args: dict[str, Any] ): file = io.BytesIO(b"abcdefg") opts_args["file"] = file opts = HtmlPartitionerOptions(**opts_args) last_modified = opts.last_modified assert last_modified is None # -- .skip_headers_and_footers --------------- @pytest.mark.parametrize("skip_headers_and_footers", [True, False]) def it_knows_the_caller_provided_skip_headers_and_footers_setting( self, skip_headers_and_footers: bool, opts_args: dict[str, Any] ): opts_args["skip_headers_and_footers"] = skip_headers_and_footers opts = HtmlPartitionerOptions(**opts_args) assert opts.skip_headers_and_footers is skip_headers_and_footers # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() def get_last_modified_date_(self, request: FixtureRequest) -> Mock: return function_mock( request, "unstructured.partition.html.partition.get_last_modified_date" ) class Describe_HtmlPartitioner: """Unit-test suite for `unstructured.partition.html.partition._HtmlPartitioner`.""" # -- ._main ---------------------------------- def it_can_find_the_main_element_in_the_document(self, opts_args: dict[str, Any]): opts_args["text"] = ( "\n" "

\n" "

Lots preamble stuff yada yada yada

\n" "

A Wonderful Section!

\n" "

Look at this amazing section!

\n" "

\n" "\n" ) opts = HtmlPartitionerOptions(**opts_args) partitioner = _HtmlPartitioner(opts) assert partitioner._main.tag == "main" def and_it_falls_back_to_the_body_when_there_is_no_main(self, opts_args: dict[str, Any]): """And there is always a , the parser adds one if there's not one in the HTML.""" opts_args["text"] = ( "\n" "

\n" "

Lots preamble stuff yada yada yada

\n" "

A Wonderful Section!

\n" "

Look at this amazing section!

\n" "\n" ) opts = HtmlPartitionerOptions(**opts_args) partitioner = _HtmlPartitioner(opts) assert partitioner._main.tag == "body" # -- ElementCls selection behaviors ----------------- def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title( self, opts_args: dict[str, Any] ): opts_args["text"] = "

NO PARTICULAR TYPE.

" opts = HtmlPartitionerOptions(**opts_args) (element,) = list(_HtmlPartitioner.iter_elements(opts)) assert element == Text("NO PARTICULAR TYPE.") def it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_character( self, opts_args: dict[str, Any] ): opts_args["text"] = "

● An excellent point!

" opts = HtmlPartitionerOptions(**opts_args) (element,) = list(_HtmlPartitioner.iter_elements(opts)) assert element == ListItem("An excellent point!") def but_not_when_the_tag_contains_only_a_bullet_character_and_no_text( self, opts_args: dict[str, Any] ): opts_args["text"] = "

●

" opts = HtmlPartitionerOptions(**opts_args) assert list(_HtmlPartitioner.iter_elements(opts)) == [] def it_produces_no_element_when_the_tag_has_no_content(self, opts_args: dict[str, Any]): opts_args["text"] = "

" opts = HtmlPartitionerOptions(**opts_args) assert list(_HtmlPartitioner.iter_elements(opts)) == [] def and_it_produces_no_element_when_the_tag_contains_only_a_stub( self, opts_args: dict[str, Any] ): opts_args["text"] = "

" opts = HtmlPartitionerOptions(**opts_args) assert list(_HtmlPartitioner.iter_elements(opts)) == []