Some important stuff is going on!
Here is a description of that stuff
import os import pathlib from tempfile import SpooledTemporaryFile from unittest.mock import patch import pytest import requests from requests.models import Response from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title from unstructured.documents.html import HTMLTitle from unstructured.partition.html import partition_html DIRECTORY = pathlib.Path(__file__).parent.resolve() EXPECTED_OUTPUT_LANGUAGE_DE = [ Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"), ] def test_partition_html_from_filename(): directory = os.path.join(DIRECTORY, "..", "..", "example-docs") filename = os.path.join(directory, "example-10k.html") elements = partition_html(filename=filename) assert len(elements) > 0 assert "PageBreak" not in [elem.category for elem in elements] assert elements[0].metadata.filename == "example-10k.html" assert elements[0].metadata.file_directory == directory def test_partition_html_from_filename_returns_html_elements(): directory = os.path.join(DIRECTORY, "..", "..", "example-docs") filename = os.path.join(directory, "example-10k.html") elements = partition_html(filename=filename) assert len(elements) > 0 assert isinstance(elements[0], HTMLTitle) def test_partition_html_from_filename_with_metadata_filename(): directory = os.path.join(DIRECTORY, "..", "..", "example-docs") filename = os.path.join(directory, "example-10k.html") elements = partition_html(filename=filename, metadata_filename="test") assert len(elements) > 0 assert all(element.metadata.filename == "test" for element in elements) @pytest.mark.parametrize( ("filename", "encoding", "error"), [ ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError), ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError), ], ) def test_partition_html_from_filename_raises_encoding_error(filename, encoding, error): with pytest.raises(error): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) with open(filename) as f: partition_html(file=f, encoding=encoding) @pytest.mark.parametrize( "filename", [ "example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html", ], ) def test_partition_html_from_filename_default_encoding(filename): filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) elements = partition_html(filename=filename_path) assert len(elements) > 0 for element in elements: assert element.metadata.filename == filename if filename == "fake-html-lang-de.html": assert elements == EXPECTED_OUTPUT_LANGUAGE_DE def test_partition_html_from_filename_metadata_false(): directory = os.path.join(DIRECTORY, "..", "..", "example-docs") filename = os.path.join(directory, "example-10k.html") elements = partition_html(filename=filename, include_metadata=False) metadata_present = any(element.metadata.to_dict() for element in elements) assert not metadata_present def test_partition_html_with_page_breaks(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") elements = partition_html(filename=filename, include_page_breaks=True) assert "PageBreak" in [elem.category for elem in elements] assert len(elements) > 0 for element in elements: assert element.metadata.filename == "example-10k.html" def test_partition_html_from_file(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: elements = partition_html(file=f) assert len(elements) > 0 for element in elements: assert element.metadata.filename is None def test_partition_html_from_file_with_metadata_filename(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: elements = partition_html(file=f, metadata_filename="test") assert len(elements) > 0 for element in elements: assert element.metadata.filename == "test" @pytest.mark.parametrize( ("filename", "encoding", "error"), [ ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError), ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError), ], ) def test_partition_html_from_file_raises_encoding_error(filename, encoding, error): with pytest.raises(error): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) with open(filename) as f, pytest.raises(UnicodeEncodeError): partition_html(file=f, encoding=encoding) @pytest.mark.parametrize( "filename", [ "example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html", ], ) def test_partition_html_from_file_default_encoding(filename): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) with open(filename) as f: elements = partition_html(file=f) assert len(elements) > 0 if filename == "fake-html-lang-de.html": assert elements == EXPECTED_OUTPUT_LANGUAGE_DE @pytest.mark.parametrize( ("filename", "encoding", "error"), [ ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError), ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError), ], ) def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, error): with pytest.raises(error): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) with open(filename, "rb") as f: partition_html(file=f, encoding=encoding) @pytest.mark.parametrize( "filename", [ "example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html", ], ) def test_partition_html_from_file_rb_default_encoding(filename): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) with open(filename, "rb") as f: elements = partition_html(file=f) assert len(elements) > 0 if filename == "fake-html-lang-de.html": assert elements == EXPECTED_OUTPUT_LANGUAGE_DE def test_partition_html_from_text(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: text = f.read() elements = partition_html(text=text) assert len(elements) > 0 def test_partition_html_from_text_works_with_empty_string(): assert partition_html(text="") == [] class MockResponse: def __init__(self, text, status_code, headers={}): self.text = text self.status_code = status_code self.ok = status_code < 300 self.headers = headers def test_partition_html_from_url(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: text = f.read() response = MockResponse( text=text, status_code=200, headers={"Content-Type": "text/html"}, ) with patch.object(requests, "get", return_value=response) as _: elements = partition_html(url="https://fake.url") assert len(elements) > 0 def test_partition_html_from_url_raises_with_bad_status_code(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: text = f.read() response = MockResponse( text=text, status_code=500, headers={"Content-Type": "text/html"}, ) with patch.object(requests, "get", return_value=response) as _, pytest.raises(ValueError): partition_html(url="https://fake.url") def test_partition_html_from_url_raises_with_bad_content_type(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: text = f.read() response = MockResponse( text=text, status_code=200, headers={"Content-Type": "application/json"}, ) with patch.object(requests, "get", return_value=response) as _, pytest.raises(ValueError): partition_html(url="https://fake.url") def test_partition_from_url_uses_headers(mocker): test_url = "https://example.com" test_headers = {"User-Agent": "test"} response = Response() response.status_code = 200 response._content = ( b"
What do i know? Who needs to know it?
" ) response.headers = {"Content-Type": "text/html"} mock_get = mocker.patch("requests.get", return_value=response) partition_html(url=test_url, headers=test_headers) # Check if requests.get was called with the correct arguments mock_get.assert_called_once_with(test_url, headers=test_headers, verify=True) def test_partition_html_raises_with_none_specified(): with pytest.raises(ValueError): partition_html() def test_partition_html_raises_with_too_many_specified(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") with open(filename) as f: text = f.read() with pytest.raises(ValueError): partition_html(filename=filename, text=text) def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"): elements = partition_html(filename=filename) assert len(elements) == 1 assert elements[0] == Table( text="January 2023 ( Someone fed my essays into GPT to make something " "that could answer\nquestions based on them, then asked it where good " "ideas come from. The\nanswer was ok, but not what I would have said. " "This is what I would have said.) The way to get new ideas is to notice " "anomalies: what seems strange,\nor missing, or broken? You can see anomalies" " in everyday life (much\nof standup comedy is based on this), but the best " "place to look for\nthem is at the frontiers of knowledge. Knowledge grows " "fractally.\nFrom a distance its edges look smooth, but when you learn " "enough\nto get close to one, you'll notice it's full of gaps. These " "gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx " "or wondered about y. In the best case, exploring such gaps yields\nwhole " "new fractal buds.", ) assert elements[0].metadata.emphasized_text_contents is None assert elements[0].metadata.link_urls is None assert elements[0].metadata.text_as_html is not None def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch): example_filename = os.path.join( DIRECTORY, "..", "..", "example-docs", "example-10k.html", ) # create a file with no write permissions read_only_file = tmp_path / "example-10k-readonly.html" read_only_file.touch() # set content of read_only_file to be that of example-10k.html with open(example_filename) as f: read_only_file.write_text(f.read()) # set read_only_file to be read only read_only_file.chmod(0o444) # partition html should still work elements = partition_html(filename=read_only_file.resolve()) assert len(elements) > 0 def test_partition_html_processes_chinese_chracters(): html_text = "每日新闻
Hello 😀
""" elements = partition_html(text=html_text) assert elements[0] == Title("Hello 😀") def test_partition_html_can_turn_off_assemble_articles(): html_text = """Here is a description of that stuff
Here is a description of that stuff
TEST
TEST
Hello there I am a very important link!
Here is a list of my favorite things
Header
My first paragraph.
""" elements = partition_html(text=text, skip_headers_and_footers=True) for element in elements: assert "footer" not in element.ancestortags assert "header" not in element.ancestortags def test_partition_html_from_url_with_skip_headers_and_footers(mocker): test_url = "https://example.com" test_headers = {"User-Agent": "test"} response = Response() response.status_code = 200 response._content = b"""Header
My first paragraph.
""" response.headers = {"Content-Type": "text/html"} mocker.patch("requests.get", return_value=response) elements = partition_html(url=test_url, headers=test_headers, skip_headers_and_footers=True) for element in elements: assert "footer" not in element.ancestortags assert "header" not in element.ancestortags def test_partition_html_grabs_emphasized_texts(): html_text = """Hello there I am a very important text!
Here is a list of my favorite things
The Big Brown Bear
The big brown bear is sleeping.
This is a section of narrative text, it's long, flows and has meaning
Header 1 | Header 2 |
Header 1 | Header 2 |
Header 1 | Header 2 |
---|
Text
Param1 = Y
Param2 = 1
Param3 = 2
Param4 = A
Param5 = A,B,C,D,E
Param6 = 7
Param7 = Five
Hello link!
\n Hello link!
""" expected_results = [ [ {"text": "A lone link!", "url": "/loner", "start_index": -1}, ], [ {"text": "link!", "url": "/link", "start_index": 6}, ], [ {"text": "link!", "url": "/link", "start_index": 6}, ], [ {"text": "Parrots", "url": "/wiki/parrots", "start_index": 0}, {"text": "Dogs", "url": "/wiki/dogs", "start_index": 12}, ], ] elements = partition_html(text=html_text) for el_idx, el in enumerate(elements): expected_result = expected_results[el_idx] for link_idx, (text, url, start_index) in enumerate( zip(el.metadata.link_texts, el.metadata.link_urls, el.metadata.link_start_indexes) ): assert text == expected_result[link_idx]["text"] assert url == expected_result[link_idx]["url"] assert start_index == expected_result[link_idx]["start_index"]