2024-06-21 13:59:48 -07:00
|
|
|
# pyright: reportPrivateUsage=false
|
|
|
|
|
|
|
|
"""Test suite for `unstructured.partition.html.partition` module."""
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
import io
|
|
|
|
import pathlib
|
|
|
|
from tempfile import SpooledTemporaryFile
|
2024-06-13 11:19:42 -07:00
|
|
|
from typing import Any
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
import pytest
|
2024-06-21 13:59:48 -07:00
|
|
|
from lxml import etree
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
from test_unstructured.unit_utils import (
|
2024-06-13 11:19:42 -07:00
|
|
|
FixtureRequest,
|
2024-06-05 16:11:58 -07:00
|
|
|
Mock,
|
|
|
|
assert_round_trips_through_JSON,
|
|
|
|
example_doc_path,
|
|
|
|
example_doc_text,
|
|
|
|
function_mock,
|
|
|
|
)
|
|
|
|
from unstructured.chunking.title import chunk_by_title
|
|
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
|
|
|
from unstructured.documents.elements import (
|
2024-06-21 13:59:48 -07:00
|
|
|
Address,
|
2024-06-05 16:11:58 -07:00
|
|
|
CompositeElement,
|
|
|
|
EmailAddress,
|
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
|
|
|
Table,
|
|
|
|
TableChunk,
|
2024-06-21 13:59:48 -07:00
|
|
|
Text,
|
2024-06-05 16:11:58 -07:00
|
|
|
Title,
|
|
|
|
)
|
2024-06-21 13:59:48 -07:00
|
|
|
from unstructured.documents.html import HTMLDocument
|
2024-06-13 11:19:42 -07:00
|
|
|
from unstructured.file_utils.encoding import read_txt_file
|
2024-06-21 13:59:48 -07:00
|
|
|
from unstructured.partition.html import partition_html
|
|
|
|
from unstructured.partition.html.partition import HtmlPartitionerOptions, _HtmlPartitioner
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
# SOURCE HTML LOADING BEHAVIORS
|
|
|
|
# ================================================================================================
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
# -- document-source (filename, file, text, url) -------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_accepts_a_file_path():
    """A path passed positionally yields elements carrying that file's name and directory."""
    doc_name = "example-10k-1p.html"
    expected_directory = example_doc_path("")

    elements = partition_html(example_doc_path(doc_name))

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == doc_name
        assert element.metadata.file_directory == expected_directory
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_user_without_file_write_permission_can_partition_html(tmp_path: pathlib.Path):
    """A file whose mode is 0o444 (no write bits) can still be partitioned by filename."""
    html_path = tmp_path / "example-10k-readonly.html"
    html_path.write_text(example_doc_text("example-10k-1p.html"))
    # -- drop all write permission bits before partitioning --
    html_path.chmod(0o444)
    resolved_path = str(html_path.resolve())

    elements = partition_html(filename=resolved_path)

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
2024-06-05 16:11:58 -07:00
|
|
|
def test_partition_html_accepts_a_file_like_object():
    """An open binary file is accepted via `file=`; no filename is recorded in metadata."""
    with open(example_doc_path("example-10k-1p.html"), "rb") as html_file:
        elements = partition_html(file=html_file)

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_accepts_an_html_str():
    """HTML source supplied as a `str` via `text=` produces at least one element."""
    html_source = example_doc_text("example-10k-1p.html")

    elements = partition_html(text=html_source)

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_accepts_a_url_to_an_HTML_document(requests_get_: Mock):
    """A `url=` argument is fetched once with default headers and the response partitioned."""
    url = "https://fake.url"
    fake_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "text/html"},
    )
    requests_get_.return_value = fake_response

    elements = partition_html(url=url)

    requests_get_.assert_called_once_with(url, headers={}, verify=True)
    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_raises_when_no_path_or_file_or_text_or_url_is_specified():
    """Calling `partition_html()` with no document source raises `ValueError`."""
    expected_message = "Exactly one of filename, file, text, or url must be sp"

    with pytest.raises(ValueError, match=expected_message):
        partition_html()
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
# -- encoding for filename, file, and text -------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"]
)
def test_partition_html_from_filename_raises_when_explicit_encoding_is_wrong(filename: str):
    """Partitioning *by file path* with a wrong explicit encoding raises `UnicodeDecodeError`.

    The document is passed as a path (not an open file object) so this test exercises the
    filename-loading branch its name describes. The open-file (`file=`) case is covered
    separately by `test_partition_html_from_file_rb_raises_encoding_error`.
    """
    with pytest.raises(UnicodeDecodeError):
        partition_html(example_doc_path(filename), encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_filename_default_encoding(filename: str):
    """Without an explicit `encoding`, each example file partitions successfully by path."""
    is_german_doc = filename == "fake-html-lang-de.html"

    elements = partition_html(example_doc_path(filename))

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == filename
    # -- only the German document has a fully-specified expected output --
    if is_german_doc:
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"]
)
def test_partition_html_from_file_raises_encoding_error(filename: str):
    """An in-memory bytes stream decoded with the wrong explicit encoding raises."""
    raw_bytes = pathlib.Path(example_doc_path(filename)).read_bytes()
    in_memory_file = io.BytesIO(raw_bytes)
    expected_message = "'utf-8' codec can't decode byte 0xff in posi"

    with pytest.raises(UnicodeDecodeError, match=expected_message):
        partition_html(file=in_memory_file, encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_file_default_encoding(filename: str):
    """Without an explicit `encoding`, each example file partitions from an open file."""
    is_german_doc = filename == "fake-html-lang-de.html"

    with open(example_doc_path(filename), "rb") as html_file:
        elements = partition_html(file=html_file)

    assert len(elements) > 0
    # -- only the German document has a fully-specified expected output --
    if is_german_doc:
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename", ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"]
)
def test_partition_html_from_file_rb_raises_encoding_error(filename: str):
    """A file opened in "rb" mode and decoded with the wrong explicit encoding raises."""
    expected_message = "'utf-8' codec can't decode byte 0xff in posi"

    # -- `open()` is entered inside the `raises` context, exactly as nested `with` would --
    with pytest.raises(UnicodeDecodeError, match=expected_message), open(
        example_doc_path(filename), "rb"
    ) as html_file:
        partition_html(file=html_file, encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    ["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_file_rb_default_encoding(filename: str):
    """Without an explicit `encoding`, each example file partitions from an "rb" file."""
    doc_path = example_doc_path(filename)

    with open(doc_path, "rb") as binary_file:
        elements = partition_html(file=binary_file)

    assert len(elements) > 0
    # -- only the German document has a fully-specified expected output --
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_processes_chinese_chracters():
    """Chinese characters in a paragraph come through unchanged as the element text."""
    chinese_html = "<html><div><p>每日新闻</p></div></html>"

    first_element = partition_html(text=chinese_html)[0]

    assert first_element.text == "每日新闻"
|
|
|
|
|
|
|
|
|
|
|
|
def test_emoji_appears_with_emoji_utf8_code():
    """An emoji in the source text is preserved in the resulting `Title` element."""
    elements = partition_html(text='<html charset="utf-8"><p>Hello 😀</p></html>')

    assert elements == [Title("Hello 😀")]
|
|
|
|
|
|
|
|
|
2024-06-05 16:11:58 -07:00
|
|
|
# -- partition_html() from URL -------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url_raises_on_failure_response_status_code(requests_get_: Mock):
    """A 500 response to the GET raises `ValueError` naming the status code."""
    failing_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=500,
        headers={"Content-Type": "text/html"},
    )
    requests_get_.return_value = failing_response
    expected_message = "Error status code on GET of provided URL: 500"

    with pytest.raises(ValueError, match=expected_message):
        partition_html(url="https://fake.url")
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url_raises_on_response_of_wrong_content_type(requests_get_: Mock):
    """A 200 response whose Content-Type is not text/html raises `ValueError`."""
    json_response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    requests_get_.return_value = json_response
    expected_message = "Expected content type text/html. Got application/json."

    with pytest.raises(ValueError, match=expected_message):
        partition_html(url="https://fake.url")
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_from_url_includes_provided_headers_in_request(requests_get_: Mock):
    """Caller-supplied `headers` are forwarded unchanged to the GET request."""
    url = "https://example.com"
    request_headers = {"User-Agent": "test"}
    requests_get_.return_value = FakeResponse(
        text="<html><head></head><body><p>What do I know? Who needs to know it?</p></body></html>",
        status_code=200,
        headers={"Content-Type": "text/html"},
    )

    partition_html(url=url, headers=request_headers)

    requests_get_.assert_called_once_with(url, headers={"User-Agent": "test"}, verify=True)
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
# ================================================================================================
|
|
|
|
# PARSING TESTS
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_on_ideas_page():
    """The ideas-page example partitions to a single `Table` with HTML but no links/emphasis."""
    expected_text = (
        "January 2023 ( Someone fed my essays into GPT to make something that could answer"
        "\nquestions based on them, then asked it where good ideas come from. The"
        "\nanswer was ok, but not what I would have said. This is what I would have said.)"
        " The way to get new ideas is to notice anomalies: what seems strange,"
        "\nor missing, or broken? You can see anomalies in everyday life (much"
        "\nof standup comedy is based on this), but the best place to look for"
        "\nthem is at the frontiers of knowledge. Knowledge grows fractally."
        "\nFrom a distance its edges look smooth, but when you learn enough"
        "\nto get close to one, you'll notice it's full of gaps. These gaps"
        "\nwill seem obvious; it will seem inexplicable that no one has tried"
        "\nx or wondered about y. In the best case, exploring such gaps yields"
        "\nwhole new fractal buds."
    )

    elements = partition_html(example_doc_path("ideas-page.html"))

    assert len(elements) == 1
    table = elements[0]
    assert table == Table(expected_text)
    metadata = table.metadata
    assert metadata.emphasized_text_contents is None
    assert metadata.link_urls is None
    assert metadata.text_as_html is not None
|
|
|
|
|
|
|
|
|
|
|
|
# -- element-suppression behaviors ---------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_it_does_not_extract_text_in_script_tags(opts_args: dict[str, Any]):
    """No element of the scripts example contains the JavaScript fragment "function ("."""
    opts_args["file_path"] = example_doc_path("example-with-scripts.html")
    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    for element in html_document.elements:
        assert "function (" not in element.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_it_does_not_extract_text_in_style_tags(opts_args: dict[str, Any]):
    """CSS inside a `<style>` element is omitted; only the paragraph's own text remains."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    # -- unpacking also verifies exactly one element was produced --
    (only_element,) = elements
    assert isinstance(only_element, Text)
    assert only_element.text == "Lorem ipsum dolor"
|
|
|
|
|
|
|
|
|
|
|
|
# -- table parsing behaviors ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_it_can_parse_a_bare_bones_table_to_a_Table_element(opts_args: dict[str, Any]):
    """A table with no `<thead>`, `<tbody>`, or `<tfoot>` wrapper becomes one `Table`."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        " <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )
    opts_args["text"] = html_text

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    # -- unpacking asserts there is exactly one element; it must be a Table --
    (table,) = elements
    assert isinstance(table, Table)
    # -- plain text is one joined string with no row or cell boundaries --
    assert table.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- the HTML form is longer but keeps the row/cell structure --
    assert table.metadata.text_as_html == expected_html
|
|
|
|
|
|
|
|
|
|
|
|
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements(
    opts_args: dict[str, Any]
):
    """Rows inside `<thead>`, `<tbody>` and `<tfoot>` all appear in `.text_as_html`.

    The expected HTML contains only plain `<tr>`/`<td>` markup: the section wrappers are
    absent and every `<th>` cell in the input is rendered as a `<td>`.
    """
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )
    opts_args["text"] = html_text

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    (table,) = elements
    assert isinstance(table, Table)
    assert table.metadata.text_as_html == expected_html
|
|
|
|
|
|
|
|
|
|
|
|
def test_it_does_not_emit_a_Table_element_for_a_table_with_no_text(opts_args: dict[str, Any]):
    """A table whose cells hold only whitespace produces no elements at all."""
    whitespace_only_table_html = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = whitespace_only_table_html

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements == []
|
|
|
|
|
|
|
|
|
|
|
|
def test_it_provides_parseable_HTML_in_text_as_html(opts_args: dict[str, Any]):
    """`.text_as_html` can be parsed by lxml and serializes back to the expected markup."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    expected_serialization = (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )
    opts_args["text"] = html_text
    opts = HtmlPartitionerOptions(**opts_args)

    (table,) = HTMLDocument.load(opts).elements
    assert isinstance(table, Table)
    table_html = table.metadata.text_as_html
    assert table_html is not None

    root = etree.fromstring(table_html, etree.HTMLParser())

    assert root is not None
    assert etree.tostring(root, encoding=str) == expected_serialization
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("tag", "expected_text_as_html"),
    [
        ("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
        ("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
    ],
)
def test_partition_html_parses_table_without_tbody(tag: str, expected_text_as_html: str):
    """A table whose only row sits in a `<thead>` or `<tfoot>` (no `<tbody>`) is parsed.

    `tag` is the section element wrapping the single header row and `expected_text_as_html`
    is the `.text_as_html` value expected on the first resulting element.
    """
    # -- only the lines that interpolate `tag` are f-strings; the rest are plain literals --
    elements = partition_html(
        text=(
            "<table>\n"
            f" <{tag}>\n"
            " <tr><th>Header 1</th><th>Header 2</th></tr>\n"
            f" </{tag}>\n"
            "</table>"
        )
    )

    assert elements[0].metadata.text_as_html == expected_text_as_html
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_reduces_a_nested_table_to_its_text_placed_in_the_cell_that_contains_it(
    opts_args: dict[str, Any]
):
    """Each `<table>` nested in a `<td>` is replaced by its joined text within that cell."""
    # -- note <table> elements nested in <td> elements --
    nested_table_html = (
        "<table>\n"
        " <tr>\n"
        " <td>\n"
        " <table>\n"
        " <tr><td>foo</td><td>bar</td></tr>\n"
        " <tr><td>baz</td><td>bng</td></tr>\n"
        " </table>\n"
        " </td>\n"
        " <td>\n"
        " <table>\n"
        " <tr><td>fizz</td><td>bang</td></tr>\n"
        " </table>\n"
        " </td>\n"
        " </tr>\n"
        "</table>"
    )
    expected_html = "<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
    document = HTMLDocument(nested_table_html, HtmlPartitionerOptions(**opts_args))
    outer_table_elem = document._main.find(".//table")
    assert outer_table_elem is not None

    table = document._parse_Table_from_table_elem(outer_table_elem)

    assert isinstance(table, Table)
    assert table.text == "foo bar baz bng fizz bang"
    assert table.metadata.text_as_html == expected_html
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_accommodates_tds_with_child_elements(opts_args: dict[str, Any]):
    """Cells whose text sits inside nested child elements are reduced to that text.

    The markup is modeled on an SEC 10-K filing.
    """
    sec_table_html = (
        "<table>\n"
        " <tr>\n"
        " <td></td>\n"
        " <td></td>\n"
        " </tr>\n"
        " <tr>\n"
        " <td>\n"
        " <p>\n"
        " <span>\n"
        ' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
        ' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
        ' format="ixt-sec:boolballotbox">\n'
        " <span>☒</span>\n"
        " </ix:nonNumeric>\n"
        " </span>\n"
        " </p>\n"
        " </td>\n"
        " <td>\n"
        " <p>\n"
        " <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
        " ACT OF 1934</span>\n"
        " </p>\n"
        " </td>\n"
        " </tr>\n"
        "</table>\n"
    )
    expected_text = (
        "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
    )
    expected_html = (
        "<table>"
        "<tr><td></td><td></td></tr>"
        "<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
        " EXCHANGE ACT OF 1934</td></tr>"
        "</table>"
    )
    document = HTMLDocument(sec_table_html, HtmlPartitionerOptions(**opts_args))
    table_elem = document._main.find(".//table")
    assert table_elem is not None

    table = document._parse_Table_from_table_elem(table_elem)

    assert isinstance(table, Table)
    assert table.text == expected_text
    assert table.metadata.text_as_html == expected_html
|
|
|
|
|
|
|
|
|
|
|
|
# -- other element-specific behaviors ------------------------------------------------------------
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_recognizes_h1_to_h3_as_Title_except_in_edge_cases():
    """`<h1>`-`<h3>` become `Title`, except an email-address heading and a heading with `<li>`."""
    html_text = (
        "<p>This is a section of narrative text, it's long, flows and has meaning</p>\n"
        "<h1>This heading is a title, even though it's long, flows and has meaning</h1>\n"
        "<h2>A heading that is at the second level</h2>\n"
        "<h3>Finally, the third heading</h3>\n"
        "<h2>December 1-17, 2017</h2>\n"
        "<h3>email@example.com</h3>\n"
        "<h3><li>- bulleted item</li></h3>\n"
    )
    expected_elements = [
        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
        Title("This heading is a title, even though it's long, flows and has meaning"),
        Title("A heading that is at the second level"),
        Title("Finally, the third heading"),
        Title("December 1-17, 2017"),
        EmailAddress("email@example.com"),
        ListItem("- bulleted item"),
    ]

    elements = partition_html(text=html_text)

    assert elements == expected_elements
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_with_pre_tag():
    """The `<pre>` example yields no page breaks and starts with a `NarrativeText` element."""
    doc_name = "fake-html-pre.htm"

    elements = partition_html(example_doc_path(doc_name))

    assert len(elements) > 0
    for element in elements:
        assert element.category != "PageBreak"

    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    assert first.metadata.filetype == "text/html"
    assert first.metadata.filename == doc_name
|
|
|
|
|
|
|
|
|
|
|
|
def test_pre_tag_parsing_respects_order():
    """Elements from interleaved `<pre>` and `<div>` blocks come out in document order."""
    html_text = (
        "<pre>The Big Brown Bear</pre>\n"
        "<div>The big brown bear is growling.</div>\n"
        "<pre>The big brown bear is sleeping.</pre>\n"
        "<div>The Big Blue Bear</div>\n"
    )
    expected_elements = [
        Title("The Big Brown Bear"),
        NarrativeText("The big brown bear is growling."),
        NarrativeText("The big brown bear is sleeping."),
        Title("The Big Blue Bear"),
    ]

    assert partition_html(text=html_text) == expected_elements
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_b_tag_parsing():
    """`<b>`-labelled lines separated by `<br>` inside `<pre>` each become their own element."""
    html_text = (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<body>\n"
        "<div>\n"
        " <h1>Header 1</h1>\n"
        " <p>Text</p>\n"
        " <h2>Header 2</h2>\n"
        " <pre>\n"
        " <b>Param1</b> = Y<br><b>Param2</b> = 1<br><b>Param3</b> = 2<br><b>Param4</b> = A\n"
        " <br><b>Param5</b> = A,B,C,D,E<br><b>Param6</b> = 7<br><b>Param7</b> = Five<br>\n"
        " </pre>\n"
        "</div>\n"
        "</body>\n"
        "</html>\n"
    )
    expected_joined_text = (
        "Header 1|Text|Header 2|Param1 = Y|Param2 = 1|Param3 = 2|Param4 = A|"
        "Param5 = A,B,C,D,E|Param6 = 7|Param7 = Five"
    )

    elements = partition_html(text=html_text)

    element_texts = [element.text for element in elements]
    assert "|".join(element_texts) == expected_joined_text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_tag_tail_parsing():
    """Text before and after a nested `<div>` is captured alongside the nested text."""
    html_text = (
        "<html>\n"
        "<body>\n"
        "<div>\n"
        " Head\n"
        " <div><span>Nested</span></div>\n"
        " Tail\n"
        "</div>\n"
        "</body>\n"
        "</html>\n"
    )

    elements = partition_html(text=html_text)

    stripped_texts = [str(element).strip() for element in elements]
    assert "|".join(stripped_texts) == "Head|Nested|Tail"
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
# -- parsing edge cases --------------------------------------------------------------------------
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_partition_html_from_text_works_with_empty_string():
    """An empty `text` argument yields an empty element list rather than an error."""
    elements = partition_html(text="")

    assert elements == []
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_nested_text_tags(opts_args: dict[str, Any]):
    """Text inside an `<a>` nested in a `<p>` produces exactly one element."""
    html_text = (
        "<body>\n"
        " <p>\n"
        " <a>\n"
        " There is some text here.\n"
        " </a>\n"
        " </p>\n"
        "</body>\n"
    )
    opts_args["text"] = html_text

    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(html_document.elements) == 1
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_containers_with_text_are_processed(opts_args: dict[str, Any]):
    """Text placed directly inside nested `<div>` containers is emitted in document order."""
    html_text = (
        '<div dir=3D"ltr">Hi All,<div><br></div>\n'
        " <div>Get excited for our first annual family day!</div>\n"
        ' <div>Best.<br clear=3D"all">\n'
        " <div><br></div>\n"
        " -- <br>\n"
        ' <div dir=3D"ltr">\n'
        ' <div dir=3D"ltr">Dino the Datasaur<div>\n'
        " Unstructured Technologies<br>\n"
        " <div>Data Scientist</div>\n"
        " <div>Doylestown, PA 18901</div>\n"
        " <div><br></div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        "</div>\n"
    )
    expected_elements = [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Text(text="\n -- "),
        Title(text="Dino the Datasaur"),
        Title(text="\n Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]
    opts_args["text"] = html_text

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements == expected_elements
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_html_grabs_bulleted_text_in_tags(opts_args: dict[str, Any]):
    """Each `<li>` of an `<ol>` becomes a `ListItem` element."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <ol>\n"
        " <li>Happy Groundhog's day!</li>\n"
        " <li>Looks like six more weeks of winter ...</li>\n"
        " </ol>\n"
        " </body>\n"
        "</html>\n"
    )
    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    opts_args["text"] = html_text

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements == expected_elements
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_html_grabs_bulleted_text_in_paras(opts_args: dict[str, Any]):
    """A paragraph starting with a bullet character becomes a `ListItem` without the bullet."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <p>\n"
        " <span>• Happy Groundhog's day!</span>\n"
        " </p>\n"
        " <p>\n"
        " <span>• Looks like six more weeks of winter ...</span>\n"
        " </p>\n"
        " </body>\n"
        "</html>\n"
    )
    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    opts_args["text"] = html_text

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements == expected_elements
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_joins_tag_text_correctly(opts_args: dict[str, Any]):
    """An inline `<i>` in the middle of a word does not introduce whitespace into the text."""
    opts_args["text"] = "<p>Hello again peet mag<i>ic</i>al</p>"

    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    first_element = html_document.elements[0]
    assert first_element.text == "Hello again peet magical"
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_sample_doc_with_emoji(opts_args: dict[str, Any]):
    """Emoji text is accepted in either of the two forms observed across environments."""
    opts_args["text"] = '<html charset="unicode">\n<p>Hello again 😀</p>\n</html>'

    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    acceptable_texts = ("Hello again ð\x9f\x98\x80", "Hello again 😀")
    assert html_document.elements[0].text in acceptable_texts
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_only_plain_text_in_body(opts_args: dict[str, Any]):
    """Bare text directly inside `<body>` is captured as the first element."""
    opts_args["text"] = "<body>Hello</body>"

    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    first_element = html_document.elements[0]
    assert first_element.text == "Hello"
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_plain_text_before_anything_in_body(opts_args: dict[str, Any]):
    """Bare text preceding a `<p>` in `<body>` is emitted before the paragraph's text."""
    opts_args["text"] = "<body>Hello<p>World</p></body>"

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
def test_line_break_in_container(opts_args: dict[str, Any]):
    """A `<br/>` inside a `<div>` splits the surrounding text into separate elements."""
    opts_args["text"] = "<div>Hello<br/>World</div>"

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
@pytest.mark.parametrize("tag", ["del", "form", "noscript"])
def test_exclude_tag_types(tag: str, opts_args: dict[str, Any]):
    """Text enclosed in a `<del>`, `<form>` or `<noscript>` element produces no elements."""
    html_text = f"<body>\n <{tag}>\n There is some text here.\n </{tag}>\n</body>\n"
    opts_args["text"] = html_text

    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(html_document.elements) == 0
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
# OTHER ARGS
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
# -- `chunking_strategy` arg ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_can_chunk_while_partitioning():
    """`chunking_strategy="by_title"` gives the same chunks as chunking after partitioning."""
    doc_path = example_doc_path("example-10k-1p.html")
    chunk_types = (CompositeElement, Table, TableChunk)

    inline_chunks = partition_html(doc_path, chunking_strategy="by_title")
    separately_chunked = chunk_by_title(partition_html(doc_path))

    for chunk in inline_chunks:
        assert isinstance(chunk, chunk_types)
    assert inline_chunks == separately_chunked
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
# -- `include_metadata` arg ----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_filename_can_suppress_metadata():
    """With `include_metadata=False`, every element's metadata serializes to an empty dict."""
    doc_path = example_doc_path("example-10k-1p.html")

    elements = partition_html(doc_path, include_metadata=False)

    for element in elements:
        assert element.metadata.to_dict() == {}
|
|
|
|
|
|
|
|
|
|
|
|
# -- `skip_headers_and_footers` arg --------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_partition_html_can_skip_headers_and_footers():
    """With `skip_headers_and_footers=True`, `<header>` and `<footer>` text is left out."""
    html_text = (
        "<html>\n"
        " <header>\n"
        " <p>Header</p>\n"
        " </header>\n"
        " <body>\n"
        " <h1>My First Heading</h1>\n"
        " <p>It was a dark and stormy night. No one was around.</p>\n"
        " </body>\n"
        " <footer>\n"
        " <p>Footer</p>\n"
        " </footer>\n"
        "</html>\n"
    )
    expected_elements = [
        Title("My First Heading"),
        NarrativeText("It was a dark and stormy night. No one was around."),
    ]

    elements = partition_html(text=html_text, skip_headers_and_footers=True)

    assert elements == expected_elements
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
# -- `unique_element_ids` arg --------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_all_element_ids_are_unique():
    """No two elements share an id, even in a document containing duplicate elements."""
    elements = partition_html(example_doc_path("fake-html-with-duplicate-elements.html"))

    element_ids = [element.id for element in elements]

    # -- de-duplicating the ids must not shrink the collection --
    assert len(element_ids) == len(set(element_ids))
|
|
|
|
|
|
|
|
|
|
|
|
def test_element_ids_are_deterministic():
    """Partitioning the same document twice produces the same element ids in the same order."""
    # -- resolve the document with `example_doc_path()`, as the other tests in this module do,
    # -- rather than with a path relative to the current working directory, so the test does not
    # -- depend on where pytest was launched from.
    file_path = example_doc_path("fake-html-with-duplicate-elements.html")

    ids = [e.id for e in partition_html(file_path)]
    ids_2 = [e.id for e in partition_html(file_path)]

    assert ids == ids_2
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
# ================================================================================================
|
|
|
|
# METADATA BEHAVIORS
|
|
|
|
# ================================================================================================
|
|
|
|
|
2024-06-11 13:54:11 -07:00
|
|
|
# -- .metadata.category_depth + parent_id --------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_records_hierarchy_metadata():
    """`category_depth` and `parent_id` are assigned according to each element's role."""
    html_text = (
        "<html>\n"
        " <p>Preamble gets no category_depth or parent_id</p>\n"
        " <h1>Heading gets category_depth but no parent_id</h1>\n"
        " <p>Body paragraph gets parent_id but no category_depth</p>\n"
        " <ul>\n"
        " <li>List item gets category_depth and parent_id</li>\n"
        " <li>Second list item gets category_depth and parent_id</li>\n"
        " </ul>\n"
        " <p>Body paragraph after list gets parent_id but no category_depth</p>\n"
        "</html>\n"
    )

    elements = partition_html(text=html_text)

    assert len(elements) == 6
    preamble, heading, body, first_item, second_item, closing = elements

    # -- paragraph appearing before the heading --
    assert isinstance(preamble, NarrativeText)
    assert preamble.text == "Preamble gets no category_depth or parent_id"
    assert preamble.metadata.category_depth is None
    assert preamble.metadata.parent_id is None

    # -- the heading itself --
    assert isinstance(heading, Title)
    assert heading.text == "Heading gets category_depth but no parent_id"
    assert heading.metadata.category_depth == 0
    assert heading.metadata.parent_id is None

    # -- paragraph following the heading --
    assert isinstance(body, NarrativeText)
    assert body.text == "Body paragraph gets parent_id but no category_depth"
    assert body.metadata.category_depth is None
    assert body.metadata.parent_id == heading.id

    # -- list items following the heading --
    assert isinstance(first_item, ListItem)
    assert first_item.text == "List item gets category_depth and parent_id"
    assert first_item.metadata.category_depth == 1
    assert first_item.metadata.parent_id == heading.id

    assert isinstance(second_item, ListItem)
    assert second_item.text == "Second list item gets category_depth and parent_id"
    assert second_item.metadata.category_depth == 1
    assert second_item.metadata.parent_id == heading.id

    # -- paragraph after the list still refers to the heading --
    assert isinstance(closing, NarrativeText)
    assert closing.text == "Body paragraph after list gets parent_id but no category_depth"
    assert closing.metadata.category_depth is None
    assert closing.metadata.parent_id == heading.id
|
|
|
|
|
|
|
|
|
2024-06-05 16:11:58 -07:00
|
|
|
# -- .metadata.emphasis --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_grabs_emphasized_texts():
    """Emphasized text runs and their tags are recorded in each element's metadata."""
    html_text = (
        "<html>\n"
        " <p>Hello there I am a very <strong>important</strong> text!</p>\n"
        " <p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>\n"
        " <ul>\n"
        " <li><em>Parrots</em></li>\n"
        " <li>Dogs</li>\n"
        " </ul>\n"
        " <span>A lone span text!</span>\n"
        "</html>\n"
    )

    elements = partition_html(text=html_text)

    # -- only the first five elements are inspected --
    important, favorites, parrots, dogs, lone_span = elements[:5]

    assert important == NarrativeText("Hello there I am a very important text!")
    assert important.metadata.emphasized_text_contents == ["important"]
    assert important.metadata.emphasized_text_tags == ["strong"]

    # -- nested emphasis is reported for the outer tag and the inner tag --
    assert favorites == NarrativeText("Here is a list of my favorite things")
    assert favorites.metadata.emphasized_text_contents == [
        "list",
        "my favorite things",
        "favorite",
    ]
    assert favorites.metadata.emphasized_text_tags == ["span", "b", "i"]

    assert parrots == ListItem("Parrots")
    assert parrots.metadata.emphasized_text_contents == ["Parrots"]
    assert parrots.metadata.emphasized_text_tags == ["em"]

    # -- an element without emphasis gets `None`, not an empty list --
    assert dogs == ListItem("Dogs")
    assert dogs.metadata.emphasized_text_contents is None
    assert dogs.metadata.emphasized_text_tags is None

    assert lone_span == Title("A lone span text!")
    assert lone_span.metadata.emphasized_text_contents == ["A lone span text!"]
    assert lone_span.metadata.emphasized_text_tags == ["span"]
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.filename --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-11 13:54:11 -07:00
|
|
|
def test_partition_html_from_filename_uses_source_filename_for_metadata_by_default():
    """Filename and directory metadata are taken from the path of the partitioned file."""
    filename = "example-10k-1p.html"
    directory = example_doc_path("")

    elements = partition_html(example_doc_path(filename))

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == filename
        assert element.metadata.file_directory == directory
|
|
|
|
|
|
|
|
|
2024-06-05 16:11:58 -07:00
|
|
|
def test_partition_html_from_filename_prefers_metadata_filename():
    """A `metadata_filename` argument overrides the name of the file being partitioned."""
    html_path = example_doc_path("example-10k-1p.html")

    elements = partition_html(html_path, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_prefers_metadata_filename():
    """A `metadata_filename` argument supplies the filename when partitioning an open file."""
    html_path = example_doc_path("example-10k-1p.html")

    with open(html_path, "rb") as html_file:
        elements = partition_html(file=html_file, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.languages -------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_element_metadata_has_languages():
    """The first element of the English example document is tagged with language "eng"."""
    html_path = example_doc_path("example-10k-1p.html")

    first_element = partition_html(html_path)[0]

    assert first_element.metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_respects_detect_language_per_element():
    """Languages are reported element-by-element when `detect_language_per_element` is True."""
    html_path = example_doc_path("language-docs/eng_spa_mult.html")
    # -- one entry per element, in document order --
    expected_languages = [
        ["eng"],
        ["spa", "eng"],
        ["eng"],
        ["eng"],
        ["spa"],
    ]

    elements = partition_html(html_path, detect_language_per_element=True)

    detected_languages = [element.metadata.languages for element in elements]
    assert detected_languages == expected_languages
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.last_modified ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_filename_pulls_last_modified_from_filesystem(
    get_last_modified_date_: Mock,
):
    """The date returned by the (mocked) filesystem lookup becomes `last_modified` metadata."""
    filesystem_date = "2023-07-05T09:24:28"
    get_last_modified_date_.return_value = filesystem_date

    first_element = partition_html(example_doc_path("fake-html.html"))[0]

    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == filesystem_date
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_filename_prefers_metadata_last_modified(
    get_last_modified_date_: Mock,
):
    """A caller-supplied `metadata_last_modified` wins over the date found on the filesystem."""
    caller_date = "2023-07-05T09:24:28"
    # -- the (mocked) filesystem reports a different date, which must be ignored --
    get_last_modified_date_.return_value = "2024-06-04T09:24:28"
    html_path = example_doc_path("fake-html.html")

    elements = partition_html(html_path, metadata_last_modified=caller_date)

    assert isinstance(elements[0], Title)
    for element in elements:
        assert element.metadata.last_modified == caller_date
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_does_not_assign_last_modified_metadata_by_default(
    get_last_modified_date_from_file_: Mock,
):
    """Without `date_from_file_object`, an open file contributes no `last_modified` value."""
    # -- a date is available from the (mocked) file lookup, but must not be used --
    get_last_modified_date_from_file_.return_value = "2029-07-05T09:24:28"

    with open(example_doc_path("fake-html.html"), "rb") as html_file:
        elements = partition_html(file=html_file)

    first_element = elements[0]
    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_pulls_last_modified_from_file_like_object_when_so_instructed(
    get_last_modified_date_from_file_: Mock,
):
    """With `date_from_file_object=True`, the date looked up from the file object is used."""
    file_object_date = "2024-06-04T09:24:28"
    get_last_modified_date_from_file_.return_value = file_object_date

    with open(example_doc_path("fake-html.html"), "rb") as html_file:
        elements = partition_html(file=html_file, date_from_file_object=True)

    assert isinstance(elements[0], Title)
    for element in elements:
        assert element.metadata.last_modified == "2024-06-04T09:24:28"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_assigns_no_last_modified_metadata_when_file_has_none():
    """No `last_modified` is assigned when none can be determined from the file-like object.

    The document is copied into a `SpooledTemporaryFile` and partitioned from there with
    `date_from_file_object=True`; every resulting element must still have `last_modified` of
    `None`.
    """
    # -- both files are managed by the `with` statement so the temporary file is closed (and its
    # -- resources released) even when partitioning raises.
    with open(example_doc_path("fake-html.html"), "rb") as f, SpooledTemporaryFile() as sf:
        sf.write(f.read())
        # -- rewind so the partitioner reads from the start of the copied document --
        sf.seek(0)
        elements = partition_html(file=sf, date_from_file_object=True)

    assert all(e.metadata.last_modified is None for e in elements)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_prefers_metadata_last_modified(
    get_last_modified_date_from_file_: Mock,
):
    """A caller-supplied `metadata_last_modified` wins over the date from the file object."""
    caller_date = "2023-07-05T09:24:28"
    # -- the (mocked) file lookup reports a different date, which must be ignored --
    get_last_modified_date_from_file_.return_value = "2024-06-04T09:24:28"

    with open(example_doc_path("fake-html.html"), "rb") as html_file:
        elements = partition_html(file=html_file, metadata_last_modified=caller_date)

    assert isinstance(elements[0], Title)
    for element in elements:
        assert element.metadata.last_modified == caller_date
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_text_assigns_no_last_modified_metadata():
    """HTML passed as text yields elements without a `last_modified` value."""
    html_text = "<html><div><p>TEST</p></div></html>"

    first_element = partition_html(text=html_text)[0]

    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_text_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is applied to HTML passed as text."""
    caller_date = "2023-07-05T09:24:28"
    html_text = "<html><div><p>TEST</p></div></html>"

    first_element = partition_html(text=html_text, metadata_last_modified=caller_date)[0]

    assert isinstance(first_element, Title)
    assert first_element.metadata.last_modified == caller_date
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.link* -----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_grabs_links():
    """Link URLs and link texts are recorded in the metadata of the element containing them."""
    html_text = (
        "<html>\n"
        ' <p>Hello there I am a <a href="/link">very important link!</a></p>\n'
        " <p>Here is a list of my favorite things</p>\n"
        " <ul>\n"
        ' <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>\n'
        " <li>Dogs</li>\n"
        " </ul>\n"
        ' <a href="/loner">A lone link!</a>\n'
        "</html>\n"
    )

    elements = partition_html(text=html_text)

    # -- only the first five elements are inspected --
    linked_para, plain_para, linked_item, plain_item, lone_link = elements[:5]

    assert linked_para == NarrativeText("Hello there I am a very important link!")
    assert linked_para.metadata.link_urls == ["/link"]
    assert linked_para.metadata.link_texts == ["very important link!"]

    # -- an element without links gets `None`, not an empty list --
    assert plain_para == NarrativeText("Here is a list of my favorite things")
    assert plain_para.metadata.link_urls is None
    assert plain_para.metadata.link_texts is None

    assert linked_item == ListItem("Parrots")
    assert linked_item.metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
    assert linked_item.metadata.link_texts == ["Parrots"]

    assert plain_item == ListItem("Dogs")
    assert plain_item.metadata.link_urls is None
    assert plain_item.metadata.link_texts is None

    assert lone_link == Title("A lone link!")
    assert lone_link.metadata.link_urls == ["/loner"]
    assert lone_link.metadata.link_texts == ["A lone link!"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_links():
    """Link texts, URLs and start indexes are recorded for each element, in document order."""
    html_text = (
        "<html>\n"
        ' <a href="/loner">A lone link!</a>\n'
        ' <p>Hello <a href="/link">link!</a></p>\n'
        ' <p>\n Hello <a href="/link">link!</a></p>\n'
        ' <p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>\n'
        "</html>\n"
    )
    # -- (link_texts, link_urls, link_start_indexes) expected for elements 0 through 3 --
    expected_link_metadata = [
        (["A lone link!"], ["/loner"], [-1]),
        (["link!"], ["/link"], [6]),
        (["link!"], ["/link"], [6]),
        (["Parrots", "Dogs"], ["/wiki/parrots", "/wiki/dogs"], [0, 12]),
    ]

    elements = partition_html(text=html_text)

    for idx, (link_texts, link_urls, link_start_indexes) in enumerate(expected_link_metadata):
        metadata = elements[idx].metadata
        assert metadata.link_texts == link_texts
        assert metadata.link_urls == link_urls
        assert metadata.link_start_indexes == link_start_indexes
|
|
|
|
|
|
|
|
|
2024-06-11 13:54:11 -07:00
|
|
|
# -- .metadata.text_as_html ----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("html_str", "expected_value"),
    [
        # -- header cells (`<th>`) appear as plain `<td>` cells in `text_as_html` --
        (
            "<table><tr><th>Header 1</th><th>Header 2</th></tr></table>",
            "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>",
        ),
        # -- quote characters in cell text appear HTML-escaped in `text_as_html` --
        (
            "<table>"
            "<tr><td>Dimensions</td><td>Weight</td></tr>"
            "<tr><td>4'-6\" x 1'</td><td>18 kg</td></tr>"
            "</table>",
            # ----------
            # NOTE(review): the entity spellings below (`&#x27;` for `'`, `&quot;` for `"`) are
            # those produced by `html.escape()` - confirm they match the partitioner's output.
            "<table>"
            "<tr><td>Dimensions</td><td>Weight</td></tr>"
            "<tr><td>4&#x27;-6&quot; x 1&#x27;</td><td>18 kg</td></tr>"
            "</table>",
        ),
    ],
)
def test_partition_html_applies_text_as_html_metadata_for_tables(
    html_str: str, expected_value: str
):
    """A lone `<table>` becomes a single element whose `text_as_html` is `expected_value`."""
    elements = partition_html(text=html_str)

    assert len(elements) == 1
    assert elements[0].metadata.text_as_html == expected_value
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.url -------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock):
    """The source URL is fetched exactly once and recorded in every element's metadata."""
    url = "https://trusttheforceluke.com"
    response = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "text/html"},
    )
    requests_get_.return_value = response

    elements = partition_html(url=url)

    requests_get_.assert_called_once_with("https://trusttheforceluke.com", headers={}, verify=True)
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.url == "https://trusttheforceluke.com"
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
# ================================================================================================
|
|
|
|
# SERIALIZATION BEHAVIORS
|
|
|
|
# ================================================================================================
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_round_trips_through_json():
    """Elements produced by `partition_html()` pass the JSON round-trip check."""
    html_path = example_doc_path("example-10k-1p.html")

    assert_round_trips_through_JSON(partition_html(html_path))
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
# ================================================================================================
|
|
|
|
# MODULE-LEVEL FIXTURES
|
|
|
|
# ================================================================================================
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
# -- single `Title` element carrying a German-language heading --
EXPECTED_OUTPUT_LANGUAGE_DE = [
    Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020")
]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def get_last_modified_date_(request: pytest.FixtureRequest):
    """Mock of `get_last_modified_date()` as referenced from the HTML partitioner module."""
    target = "unstructured.partition.html.partition.get_last_modified_date"
    return function_mock(request, target)
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def get_last_modified_date_from_file_(request: pytest.FixtureRequest):
    """Mock of `get_last_modified_date_from_file()` as referenced from the HTML partitioner."""
    target = "unstructured.partition.html.partition.get_last_modified_date_from_file"
    return function_mock(request, target)
|
2024-06-05 16:11:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
class FakeResponse:
    """Minimal response object used as the return value of the mocked `requests.get()`.

    Args:
        text: Body of the response.
        status_code: HTTP status code of the response.
        headers: Response headers. When omitted, the instance gets its own empty dict.

    Attributes:
        ok: True when `status_code` is below 300.
    """

    def __init__(self, text: str, status_code: int, headers: dict[str, str] | None = None):
        self.text = text
        self.status_code = status_code
        self.ok = status_code < 300
        # -- a fresh dict per instance; a `{}` default value would be a single dict shared by
        # -- every instance created without headers, leaking mutations between tests.
        self.headers = {} if headers is None else headers
|
|
|
|
|
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
@pytest.fixture
def opts_args() -> dict[str, Any]:
    """Keyword arguments for constructing `HtmlPartitionerOptions`, all at default values.

    A test overrides only the entries it cares about and then splats the dict into the
    constructor, which keeps option construction compact.
    """
    return dict(
        file=None,
        file_path=None,
        text=None,
        encoding=None,
        url=None,
        headers={},
        ssl_verify=True,
        date_from_file_object=False,
        metadata_last_modified=None,
        skip_headers_and_footers=False,
        detection_origin=None,
    )
|
|
|
|
|
|
|
|
|
2024-06-05 16:11:58 -07:00
|
|
|
@pytest.fixture
def requests_get_(request: pytest.FixtureRequest):
    """Mock of `requests.get()` as referenced from the HTML partitioner module."""
    target = "unstructured.partition.html.partition.requests.get"
    return function_mock(request, target)
|
2024-06-13 11:19:42 -07:00
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
# ISOLATED UNIT TESTS
|
|
|
|
# ================================================================================================
|
|
|
|
# These test components used by `partition_html()` in isolation such that all edge cases can be
|
|
|
|
# exercised.
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
class DescribeHtmlPartitionerOptions:
    """Unit-test suite for `unstructured.partition.html.partition.HtmlPartitionerOptions`.

    Each test starts from the all-defaults `opts_args` fixture and overrides only the
    argument(s) relevant to the behavior under test.
    """

    # -- .detection_origin -----------------------

    @pytest.mark.parametrize("detection_origin", ["html", None])
    def it_knows_the_caller_provided_detection_origin(
        self, detection_origin: str | None, opts_args: dict[str, Any]
    ):
        opts_args.update(detection_origin=detection_origin)

        options = HtmlPartitionerOptions(**opts_args)

        assert options.detection_origin == detection_origin

    # -- .encoding -------------------------------

    @pytest.mark.parametrize("encoding", ["utf-8", None])
    def it_knows_the_caller_provided_encoding(
        self, encoding: str | None, opts_args: dict[str, Any]
    ):
        opts_args.update(encoding=encoding)

        options = HtmlPartitionerOptions(**opts_args)

        assert options.encoding == encoding

    # -- .html_text ------------------------------

    def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]):
        html_path = example_doc_path("example-10k-1p.html")
        opts_args.update(file_path=html_path)
        options = HtmlPartitionerOptions(**opts_args)

        loaded_html = options.html_text

        assert isinstance(loaded_html, str)
        # -- second item of the `read_txt_file()` result is the text of the file --
        assert loaded_html == read_txt_file(html_path)[1]

    def and_it_gets_the_HTML_from_the_file_like_object_when_one_is_provided(
        self, opts_args: dict[str, Any]
    ):
        html_path = example_doc_path("example-10k-1p.html")
        # -- hand the partitioner an in-memory copy of the document --
        with open(html_path, "rb") as source:
            html_file = io.BytesIO(source.read())
        opts_args.update(file=html_file)
        options = HtmlPartitionerOptions(**opts_args)

        loaded_html = options.html_text

        assert isinstance(loaded_html, str)
        assert loaded_html == read_txt_file(html_path)[1]

    def and_it_uses_the_HTML_in_the_text_argument_when_that_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(text="<html><body><p>Hello World!</p></body></html>")

        options = HtmlPartitionerOptions(**opts_args)

        assert options.html_text == "<html><body><p>Hello World!</p></body></html>"

    def and_it_gets_the_HTML_from_the_url_when_one_is_provided(
        self, requests_get_: Mock, opts_args: dict[str, Any]
    ):
        response = FakeResponse(
            text="<html><body><p>I just flew over the internet!</p></body></html>",
            status_code=200,
            headers={"Content-Type": "text/html"},
        )
        requests_get_.return_value = response
        opts_args.update(url="https://insta.tweet.face.org")

        options = HtmlPartitionerOptions(**opts_args)

        assert (
            options.html_text == "<html><body><p>I just flew over the internet!</p></body></html>"
        )

    def but_it_raises_when_no_path_or_file_or_text_or_url_was_provided(
        self, opts_args: dict[str, Any]
    ):
        # -- every document-source argument is left at its `None` default --
        options = HtmlPartitionerOptions(**opts_args)

        with pytest.raises(ValueError, match="Exactly one of filename, file, text, or url must be"):
            _ = options.html_text

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(metadata_last_modified="2024-03-05T17:02:53")

        options = HtmlPartitionerOptions(**opts_args)

        assert options.last_modified == "2024-03-05T17:02:53"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
        opts_args.update(file_path="a/b/document.html")
        options = HtmlPartitionerOptions(**opts_args)

        reported_date = options.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/document.html")
        assert reported_date == "2024-04-02T20:32:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        file_like = io.BytesIO(b"abcdefg")
        opts_args.update(file=file_like, date_from_file_object=True)
        options = HtmlPartitionerOptions(**opts_args)

        reported_date = options.last_modified

        # -- the lookup receives the very file-like object given to the options --
        get_last_modified_date_from_file_.assert_called_once_with(file_like)
        assert reported_date == "2024-04-02T20:42:07"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        file_like = io.BytesIO(b"abcdefg")
        opts_args.update(file=file_like, date_from_file_object=False)
        options = HtmlPartitionerOptions(**opts_args)

        reported_date = options.last_modified

        # -- the file object is not even consulted for a date --
        get_last_modified_date_from_file_.assert_not_called()
        assert reported_date is None

    # -- .skip_headers_and_footers ---------------

    @pytest.mark.parametrize("skip_headers_and_footers", [True, False])
    def it_knows_the_caller_provided_skip_headers_and_footers_setting(
        self, skip_headers_and_footers: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(skip_headers_and_footers=skip_headers_and_footers)

        options = HtmlPartitionerOptions(**opts_args)

        assert options.skip_headers_and_footers is skip_headers_and_footers

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
        """Mock of `get_last_modified_date()` as referenced from the HTML partitioner module."""
        target = "unstructured.partition.html.partition.get_last_modified_date"
        return function_mock(request, target)

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest):
        """Mock of `get_last_modified_date_from_file()` as referenced from the partitioner."""
        target = "unstructured.partition.html.partition.get_last_modified_date_from_file"
        return function_mock(request, target)
|
|
|
|
|
|
|
|
|
|
|
|
class Describe_HtmlPartitioner:
|
|
|
|
"""Unit-test suite for `unstructured.partition.html.partition._HtmlPartitioner`."""
|
|
|
|
|
|
|
|
# -- ._main ----------------------------------
|
|
|
|
|
|
|
|
def it_can_find_the_main_element_in_the_document(self, opts_args: dict[str, Any]):
|
|
|
|
opts_args["text"] = (
|
|
|
|
"<body>\n"
|
|
|
|
" <header></header>\n"
|
|
|
|
" <p>Lots preamble stuff yada yada yada</p>\n"
|
|
|
|
" <main>\n"
|
|
|
|
" <h2>A Wonderful Section!</h2>\n"
|
|
|
|
" <p>Look at this amazing section!</p>\n"
|
|
|
|
" </main>\n"
|
|
|
|
"</body>\n"
|
|
|
|
)
|
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
|
|
|
|
|
|
|
partitioner = _HtmlPartitioner(opts)
|
|
|
|
|
|
|
|
assert partitioner._main.tag == "main"
|
|
|
|
|
|
|
|
def and_it_falls_back_to_the_body_when_there_is_no_main(self, opts_args: dict[str, Any]):
|
|
|
|
"""And there is always a <body>, the parser adds one if there's not one in the HTML."""
|
|
|
|
opts_args["text"] = (
|
|
|
|
"<body>\n"
|
|
|
|
" <header></header>\n"
|
|
|
|
" <p>Lots preamble stuff yada yada yada</p>\n"
|
|
|
|
" <h2>A Wonderful Section!</h2>\n"
|
|
|
|
" <p>Look at this amazing section!</p>\n"
|
|
|
|
"</body>\n"
|
2024-06-13 11:19:42 -07:00
|
|
|
)
|
2024-06-21 13:59:48 -07:00
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
|
|
|
|
|
|
|
partitioner = _HtmlPartitioner(opts)
|
|
|
|
|
|
|
|
assert partitioner._main.tag == "body"
|
|
|
|
|
|
|
|
# -- ElementCls selection behaviors -----------------
|
|
|
|
|
|
|
|
def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title(
|
|
|
|
self, opts_args: dict[str, Any]
|
|
|
|
):
|
|
|
|
opts_args["text"] = "<p>NO PARTICULAR TYPE.</p>"
|
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
|
|
|
|
|
|
|
(element,) = list(_HtmlPartitioner.iter_elements(opts))
|
|
|
|
|
|
|
|
assert element == Text("NO PARTICULAR TYPE.")
|
|
|
|
|
|
|
|
def it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_character(
|
|
|
|
self, opts_args: dict[str, Any]
|
|
|
|
):
|
|
|
|
opts_args["text"] = "<p>● An excellent point!</p>"
|
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
|
|
|
|
|
|
|
(element,) = list(_HtmlPartitioner.iter_elements(opts))
|
|
|
|
|
|
|
|
assert element == ListItem("An excellent point!")
|
|
|
|
|
|
|
|
def but_not_when_the_tag_contains_only_a_bullet_character_and_no_text(
|
|
|
|
self, opts_args: dict[str, Any]
|
|
|
|
):
|
|
|
|
opts_args["text"] = "<p>●</p>"
|
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
|
|
|
|
|
|
|
assert list(_HtmlPartitioner.iter_elements(opts)) == []
|
|
|
|
|
|
|
|
def it_produces_no_element_when_the_tag_has_no_content(self, opts_args: dict[str, Any]):
|
|
|
|
opts_args["text"] = "<p></p>"
|
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
|
|
|
|
|
|
|
assert list(_HtmlPartitioner.iter_elements(opts)) == []
|
|
|
|
|
|
|
|
def and_it_produces_no_element_when_the_tag_contains_only_a_stub(
|
|
|
|
self, opts_args: dict[str, Any]
|
|
|
|
):
|
|
|
|
opts_args["text"] = "<p>$</p>"
|
|
|
|
opts = HtmlPartitionerOptions(**opts_args)
|
2024-06-13 11:19:42 -07:00
|
|
|
|
2024-06-21 13:59:48 -07:00
|
|
|
assert list(_HtmlPartitioner.iter_elements(opts)) == []
|