A Wonderful Section!
\n" "Look at this amazing section!
\n" "# pyright: reportPrivateUsage=false """Test suite for `unstructured.partition.html.partition` module.""" from __future__ import annotations import io import pathlib from typing import Any import pytest from lxml import etree from test_unstructured.unit_utils import ( FixtureRequest, Mock, assert_round_trips_through_JSON, example_doc_path, example_doc_text, function_mock, ) from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ( Address, CompositeElement, ListItem, NarrativeText, Table, TableChunk, Text, Title, ) from unstructured.file_utils.encoding import read_txt_file from unstructured.partition.html import partition_html from unstructured.partition.html.partition import HtmlPartitionerOptions, _HtmlPartitioner # ================================================================================================ # SOURCE HTML LOADING BEHAVIORS # ================================================================================================ # -- document-source (filename, file, text, url) ------------------------------------------------- def test_partition_html_accepts_a_file_path(tmp_path: pathlib.Path): file_path = str(tmp_path / "sample-doc.html") with open(file_path, "w") as f: f.write( "\n" "
\n" "Dear Leader is the best. He is such a wonderful engineer!
\n" " \n" "Another Magnificent paragraph
\n" "The prior element is a title based on its capitalization patterns!
\n" "I'm in a table | \n"
"
每日新闻
Hello 😀
') == [ Title("Hello 😀") ] # -- partition_html() from URL ------------------------------------------------------------------- def test_partition_html_from_url_raises_on_failure_response_status_code(requests_get_: Mock): requests_get_.return_value = FakeResponse( text=example_doc_text("example-10k-1p.html"), status_code=500, headers={"Content-Type": "text/html"}, ) with pytest.raises(ValueError, match="Error status code on GET of provided URL: 500"): partition_html(url="https://fake.url") def test_partition_html_from_url_raises_on_response_of_wrong_content_type(requests_get_: Mock): requests_get_.return_value = FakeResponse( text=example_doc_text("example-10k-1p.html"), status_code=200, headers={"Content-Type": "application/json"}, ) with pytest.raises(ValueError, match="Expected content type text/html. Got application/json."): partition_html(url="https://fake.url") def test_partition_from_url_includes_provided_headers_in_request(requests_get_: Mock): requests_get_.return_value = FakeResponse( text="What do I know? Who needs to know it?
", status_code=200, headers={"Content-Type": "text/html"}, ) partition_html(url="https://example.com", headers={"User-Agent": "test"}) requests_get_.assert_called_once_with( "https://example.com", headers={"User-Agent": "test"}, verify=True ) # ================================================================================================ # PARSING TESTS # ================================================================================================ def test_partition_html_on_ideas_page(): elements = partition_html(example_doc_path("ideas-page.html")) assert len(elements) == 1 e = elements[0] assert e == Table( "January 2023 ( Someone fed my essays into GPT to make something that could answer" "\nquestions based on them, then asked it where good ideas come from. The" "\nanswer was ok, but not what I would have said. This is what I would have said.)" " The way to get new ideas is to notice anomalies: what seems strange," "\nor missing, or broken? You can see anomalies in everyday life (much" "\nof standup comedy is based on this), but the best place to look for" "\nthem is at the frontiers of knowledge. Knowledge grows fractally." "\nFrom a distance its edges look smooth, but when you learn enough" "\nto get close to one, you'll notice it's full of gaps. These gaps" "\nwill seem obvious; it will seem inexplicable that no one has tried" "\nx or wondered about y. In the best case, exploring such gaps yields" "\nwhole new fractal buds.", ) assert e.metadata.emphasized_text_contents is None assert e.metadata.link_urls is None assert e.metadata.text_as_html is not None # -- element-suppression behaviors --------------------------------------------------------------- def test_it_does_not_extract_text_in_script_tags(): elements = partition_html(example_doc_path("example-with-scripts.html")) assert all("function (" not in e.text for e in elements) def test_it_does_not_extract_text_in_style_tags(): html_text = ( "\n" "\n" "Lorem ipsum dolor
\n" "\n" "" ) (element,) = partition_html(text=html_text) assert isinstance(element, Text) assert element.text == "Lorem ipsum dolor" # -- table parsing behaviors --------------------------------------------------------------------- def test_it_can_parse_a_bare_bones_table_to_a_Table_element(): """Bare-bones means no ``, ``, or `` elements.""" html_text = ( "\n" "\n" "Lorem | Ipsum |
Ut enim non | ad minim\nveniam quis |
Lorem | Ipsum |
Ut enim non | ad minim veniam quis |
Lorem | Ipsum |
---|---|
Lorem ipsum | dolor sit amet nulla |
Ut enim non | ad minim\nveniam quis |
Dolor | Equis |
Lorem | Ipsum |
Lorem ipsum | dolor sit amet nulla |
Ut enim non | ad minim veniam quis |
Dolor | Equis |
Lorem | Ipsum |
---|---|
Lorem ipsum | dolor sit amet nulla |
Ut enim non | ad minim\nveniam quis |
Dolor | Equis |
Lorem | Ipsum |
Lorem ipsum | dolor sit amet nulla |
Ut enim non | ad minim veniam quis |
Dolor | Equis |
Header 1 | Header 2 |
Header 1 | Header 2 |
Header 1 | Header 2 |
---|
\n"
"
| \n"
" \n"
"
| \n"
"
foo bar baz bng | fizz bang |
\n" " | \n" " |
\n"
" \n"
" \n"
' | \n"
" \n"
" \n" " ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE" " ACT OF 1934\n" " \n" " | \n"
"
☒ | ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES" " EXCHANGE ACT OF 1934 |
This is narrative text, it's long, flows and has meaning
\n" "The Big Brown Bear\n" "
The big brown bear is sleeping.\n" "
Text
\n" "\n" " Param1 = Y\n" "
Param2 = 1
Param3 = 2
Param4 = A\n" "
Param5 = A,B,C,D,E
Param6 = 7
Param7 = Five
\n" "
But whoa, this is a paragraph inside a phrasing element.
so we close the first element at the start of the block element and emit it, then we emit the block element, and then start a new element for the tail and whatever phrasing follows it.The task we have designed you to perform is this.
We want you to tell us.... he paused,\n" " • Happy Groundhog's day!\n" "
\n" "\n" " • Looks like six more weeks of winter ...\n" "
\n" " \n" "\n" ) elements = partition_html(text=html_text) # -- bullet characters are removed -- assert elements == [ ListItem("Happy Groundhog's day!"), ListItem("Looks like six more weeks of winter ..."), ] def test_joins_tag_text_correctly(): elements = partition_html(text="Hello again peet magical
") assert elements == [Title("Hello again peet magical")] def test_sample_doc_with_emoji(): elements = partition_html(text='\nHello again 😀
\n') assert elements == [NarrativeText("Hello again 😀")] def test_only_text_and_no_elements_in_body(): elements = partition_html(text="Hello") assert elements == [Title("Hello")] def test_text_before_elements_in_body(): elements = partition_html(text="HelloWorld
") assert elements == [Title("Hello"), Title("World")] def test_line_break_in_container(): elements = partition_html(text="Header
\n" "It was a dark and stormy night. No one was around.
\n" " \n" " \n" "\n" ), skip_headers_and_footers=True, ) == [ Title("My First Heading"), NarrativeText("It was a dark and stormy night. No one was around."), ] # -- `unique_element_ids` arg -------------------------------------------------------------------- def test_all_element_ids_are_unique(): ids = [e.id for e in partition_html(example_doc_path("fake-html-with-duplicate-elements.html"))] assert len(ids) == len(set(ids)) def test_element_ids_are_deterministic(): ids = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")] ids_2 = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")] assert ids == ids_2 # ================================================================================================ # METADATA BEHAVIORS # ================================================================================================ # -- .metadata.category_depth + parent_id -------------------------------------------------------- def test_partition_html_records_hierarchy_metadata(): elements = partition_html( text=( "\n" "Preamble gets no category_depth or parent_id
\n" "Body paragraph gets parent_id but no category_depth
\n" "Body paragraph after list gets parent_id but no category_depth
\n" "\n" ) ) assert len(elements) == 6 e = elements[0] assert isinstance(e, NarrativeText) assert e.text == "Preamble gets no category_depth or parent_id" assert e.metadata.category_depth is None assert e.metadata.parent_id is None e = elements[1] assert isinstance(e, Title) assert e.text == "Heading gets category_depth but no parent_id" assert e.metadata.category_depth == 0 assert e.metadata.parent_id is None e = elements[2] assert isinstance(e, NarrativeText) assert e.text == "Body paragraph gets parent_id but no category_depth" assert e.metadata.category_depth is None assert e.metadata.parent_id == elements[1].id e = elements[3] assert isinstance(e, ListItem) assert e.text == "List item gets category_depth and parent_id" assert e.metadata.category_depth == 1 assert e.metadata.parent_id == elements[1].id e = elements[4] assert isinstance(e, ListItem) assert e.text == "Second list item gets category_depth and parent_id" assert e.metadata.category_depth == 1 assert e.metadata.parent_id == elements[1].id e = elements[5] assert isinstance(e, NarrativeText) assert e.text == "Body paragraph after list gets parent_id but no category_depth" assert e.metadata.category_depth is None assert e.metadata.parent_id == elements[1].id # -- .metadata.emphasis -------------------------------------------------------------------------- def test_partition_html_grabs_emphasized_texts(): elements = partition_html( text=( "\n" "Hello there I am a very important text!
\n" "Here is a list of my favorite things
\n" "Hello there I am a very important link!
\n' "Here is a list of my favorite things
\n" "Hello link!
\n' '\n Hello link!
\n' ' \n' "\n" ) elements = partition_html(text=html_text) e = elements[0] assert e.metadata.link_texts == ["A lone link!"] assert e.metadata.link_urls == ["/loner"] e = elements[1] assert e.metadata.link_texts == ["link!"] assert e.metadata.link_urls == ["/link"] e = elements[2] assert e.metadata.link_texts == ["link!"] assert e.metadata.link_urls == ["/link"] e = elements[3] assert e.metadata.link_texts == ["Parrots", "Dogs"] assert e.metadata.link_urls == ["/wiki/parrots", "/wiki/dogs"] # -- .metadata.text_as_html ---------------------------------------------------------------------- @pytest.mark.parametrize( ("html_text", "expected_value"), [ ( "Header 1 | Header 2 |
---|
Header 1 | Header 2 |
Dimensions | Weight |
4'-6\" x 1' | 18 kg |
Dimensions | Weight |
4'-6" x 1' | 18 kg |
Hello World!
" opts = HtmlPartitionerOptions(**opts_args) assert opts.html_text == "Hello World!
" def and_it_gets_the_HTML_from_the_url_when_one_is_provided( self, requests_get_: Mock, opts_args: dict[str, Any] ): requests_get_.return_value = FakeResponse( text="I just flew over the internet!
", status_code=200, headers={"Content-Type": "text/html"}, ) opts_args["url"] = "https://insta.tweet.face.org" opts = HtmlPartitionerOptions(**opts_args) assert opts.html_text == "I just flew over the internet!
" def but_it_raises_when_no_path_or_file_or_text_or_url_was_provided( self, opts_args: dict[str, Any] ): opts = HtmlPartitionerOptions(**opts_args) with pytest.raises(ValueError, match="Exactly one of filename, file, text, or url must be"): opts.html_text # -- .last_modified -------------------------- def it_gets_last_modified_from_the_filesystem_when_file_path_is_provided( self, opts_args: dict[str, Any], get_last_modified_date_: Mock ): opts_args["file_path"] = "a/b/document.html" get_last_modified_date_.return_value = "2024-04-02T20:32:35" opts = HtmlPartitionerOptions(**opts_args) last_modified = opts.last_modified get_last_modified_date_.assert_called_once_with("a/b/document.html") assert last_modified == "2024-04-02T20:32:35" def but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided( self, opts_args: dict[str, Any] ): file = io.BytesIO(b"abcdefg") opts_args["file"] = file opts = HtmlPartitionerOptions(**opts_args) last_modified = opts.last_modified assert last_modified is None # -- .skip_headers_and_footers --------------- @pytest.mark.parametrize("skip_headers_and_footers", [True, False]) def it_knows_the_caller_provided_skip_headers_and_footers_setting( self, skip_headers_and_footers: bool, opts_args: dict[str, Any] ): opts_args["skip_headers_and_footers"] = skip_headers_and_footers opts = HtmlPartitionerOptions(**opts_args) assert opts.skip_headers_and_footers is skip_headers_and_footers # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() def get_last_modified_date_(self, request: FixtureRequest) -> Mock: return function_mock( request, "unstructured.partition.html.partition.get_last_modified_date" ) class Describe_HtmlPartitioner: """Unit-test suite for `unstructured.partition.html.partition._HtmlPartitioner`.""" # -- ._main ---------------------------------- def it_can_find_the_main_element_in_the_document(self, opts_args: dict[str, Any]): opts_args["text"] = ( "\n" "Lots preamble stuff yada yada yada
\n" "Look at this amazing section!
\n" "Lots preamble stuff yada yada yada
\n" "Look at this amazing section!
\n" "\n" ) opts = HtmlPartitionerOptions(**opts_args) partitioner = _HtmlPartitioner(opts) assert partitioner._main.tag == "body" # -- ElementCls selection behaviors ----------------- def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title( self, opts_args: dict[str, Any] ): opts_args["text"] = "NO PARTICULAR TYPE.
" opts = HtmlPartitionerOptions(**opts_args) (element,) = list(_HtmlPartitioner.iter_elements(opts)) assert element == Text("NO PARTICULAR TYPE.") def it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_character( self, opts_args: dict[str, Any] ): opts_args["text"] = "● An excellent point!
" opts = HtmlPartitionerOptions(**opts_args) (element,) = list(_HtmlPartitioner.iter_elements(opts)) assert element == ListItem("An excellent point!") def but_not_when_the_tag_contains_only_a_bullet_character_and_no_text( self, opts_args: dict[str, Any] ): opts_args["text"] = "●
" opts = HtmlPartitionerOptions(**opts_args) assert list(_HtmlPartitioner.iter_elements(opts)) == [] def it_produces_no_element_when_the_tag_has_no_content(self, opts_args: dict[str, Any]): opts_args["text"] = "" opts = HtmlPartitionerOptions(**opts_args) assert list(_HtmlPartitioner.iter_elements(opts)) == [] def and_it_produces_no_element_when_the_tag_contains_only_a_stub( self, opts_args: dict[str, Any] ): opts_args["text"] = "$
" opts = HtmlPartitionerOptions(**opts_args) assert list(_HtmlPartitioner.iter_elements(opts)) == []