2023-11-17 09:56:38 -08:00
|
|
|
# pyright: reportPrivateUsage=false
|
2024-06-06 14:21:33 -07:00
|
|
|
# pyright: reportUnusedFunction=false
|
2023-11-17 09:56:38 -08:00
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
"""Test suite for `unstructured.documents.html` module."""
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
2023-02-28 23:24:24 +01:00
|
|
|
import pathlib
|
2024-06-13 11:19:42 -07:00
|
|
|
from typing import Any
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
import pytest
|
2023-02-27 17:30:54 +01:00
|
|
|
from lxml import etree
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
from test_unstructured.unit_utils import (
|
|
|
|
FixtureRequest,
|
|
|
|
Mock,
|
|
|
|
example_doc_path,
|
|
|
|
function_mock,
|
|
|
|
property_mock,
|
|
|
|
)
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured.documents import html
|
|
|
|
from unstructured.documents.elements import (
|
2024-06-14 17:14:22 -07:00
|
|
|
Address,
|
2023-02-27 17:30:54 +01:00
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
2023-09-11 11:14:11 -07:00
|
|
|
Table,
|
2023-02-27 17:30:54 +01:00
|
|
|
Text,
|
|
|
|
Title,
|
|
|
|
)
|
2024-06-14 17:14:22 -07:00
|
|
|
from unstructured.documents.html import HTMLDocument, HtmlPartitionerOptions
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
# -- Tag names used by this suite, written as one markup-like literal. Trimming the outermost
# -- "<" and ">" and cutting on each "><" seam leaves a plain list of bare tag names.
TAGS = (
    "<a><abbr><acronym><address><applet><area><article><aside><audio><b><base><basefont><bdi>"
    "<bdo><big><blockquote><body><br><button><canvas><caption><center><cite><code><col>"
    "<colgroup><data><datalist><dd><del><details><dfn><dialog><dir><div><dl><dt><em><embed>"
    "<fieldset><figcaption><figure><font><footer><form><frame><frameset><h1><h2><h3><h4><h5>"
    "<h6><head><header><hr><html><i><iframe><img><input><ins><kbd><label><legend><li><link>"
    "<main><map><mark><meta><meter><nav><noframes><noscript><object><ol><optgroup><option>"
    "<output><p><param><picture><pre><progress><q><rp><rt><ruby><s><samp><script><section>"
    "<select><small><source><span><strike><strong><style><sub><summary><sup><table><tbody><td>"
    "<template><textarea><tfoot><th><thead><time><title><tr><track><tt><u><ul><var><video><wbr>"
).strip("<>").split("><")
|
|
|
|
|
2023-11-20 08:29:32 -08:00
|
|
|
# -- Tag names listed here as "void"; same literal-to-list conversion as used for TAGS: trim
# -- the outermost angle brackets, then cut on each "><" seam.
VOID_TAGS = (
    "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
).strip("<>").split("><")
|
2023-10-03 12:17:51 +08:00
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
# -- concatenation of the text, heading, list-item and section tag lists of the `html` module --
INCLUDED_TAGS = html.TEXT_TAGS + html.HEADING_TAGS + html.LIST_ITEM_TAGS + html.SECTION_TAGS

# -- tags that must NOT land in EXCLUDED_TAGS: included tags, table tags, void tags and the
# -- three document-skeleton tags --
_NON_EXCLUDED_TAGS = INCLUDED_TAGS + html.TABLE_TAGS + VOID_TAGS + ["html", "head", "body"]

# -- every remaining name in TAGS, in TAGS order --
EXCLUDED_TAGS = [name for name in TAGS if name not in _NON_EXCLUDED_TAGS]
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
|
2023-11-20 08:29:32 -08:00
|
|
|
# -- table-extraction behaviors ------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def test_it_can_parse_a_bare_bones_table_to_a_Table_element(opts_args: dict[str, Any]):
    """A `<table>` without `<thead>`, `<tbody>`, or `<tfoot>` children yields one `Table`."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        " <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # -- unpacking fails unless exactly one element was produced --
    [table] = document.elements
    assert isinstance(table, Table)
    # -- `.text` is all cell text in one string; row and cell boundaries are not marked --
    assert table.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- `.metadata.text_as_html` is longer but keeps the row/cell structure --
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )
    assert table.metadata.text_as_html == expected_html
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements(
    opts_args: dict[str, Any],
):
    """Rows wrapped in `<thead>`, `<tbody>` and `<tfoot>` all reach `.text_as_html`.

    The expected HTML pinned below is a flat run of `<tr>` rows directly under `<table>`, in
    source order, and every source `<th>` cell shows up as a `<td>`.
    """
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # -- unpacking fails unless exactly one element was produced --
    [table] = document.elements
    assert isinstance(table, Table)
    expected_html = (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )
    assert table.metadata.text_as_html == expected_html
|
|
|
|
|
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def test_it_does_not_emit_a_Table_element_for_a_table_with_no_text(opts_args: dict[str, Any]):
    """A table whose cells hold only whitespace produces no elements at all."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert document.elements == []
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_it_grabs_bulleted_text_in_tables_as_ListItem_elements(opts_args: dict[str, Any]):
    """A table whose rows pair a bullet cell with a text cell yields one `ListItem` per row."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <table>\n"
        " <tbody>\n"
        " <tr>\n"
        " <td>•</td>\n"
        " <td><p>Happy Groundhog's day!</p></td>\n"
        " </tr>\n"
        " <tr>\n"
        " <td>•</td>\n"
        " <td><p>Looks like six more weeks of winter ...</p></td>\n"
        " </tr>\n"
        " </tbody>\n"
        " </table>\n"
        " </body>\n"
        "</html>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert document.elements == expected_elements
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table(opts_args: dict[str, Any]):
    """`_is_bulleted_table()` is False for a table whose cells hold only whitespace."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text
    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # -- locate the <table> in the parsed tree; both lookups must succeed before the check --
    root = document._document_tree
    assert root is not None
    table_elem = root.find(".//table")
    assert table_elem is not None

    assert document._is_bulleted_table(table_elem) is False
|
2023-11-20 08:29:32 -08:00
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_it_provides_parseable_HTML_in_text_as_html(opts_args: dict[str, Any]):
    """`.metadata.text_as_html` of a `Table` parses with lxml and round-trips unchanged."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text
    opts = HtmlPartitionerOptions(**opts_args)
    document = HTMLDocument.load(opts)

    # -- unpacking fails unless exactly one element was produced --
    [table] = document.elements
    assert isinstance(table, Table)
    table_html = table.metadata.text_as_html
    assert table_html is not None

    # -- named `reparsed` so the module-level `html` import is not shadowed in this scope --
    reparsed = etree.fromstring(table_html, etree.HTMLParser())

    assert reparsed is not None
    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    serialized = etree.tostring(reparsed, encoding=str)
    assert serialized == (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )
|
|
|
|
|
|
|
|
|
2023-11-20 20:22:13 -08:00
|
|
|
# -- element-suppression behaviors ---------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_it_does_not_extract_text_in_script_tags(opts_args: dict[str, Any]):
    """No element loaded from "example-with-scripts.html" contains the text "function ("."""
    opts_args["file_path"] = example_doc_path("example-with-scripts.html")

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert not any("function (" in elem.text for elem in document.elements)
|
2023-11-20 20:22:13 -08:00
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_it_does_not_extract_text_in_style_tags(opts_args: dict[str, Any]):
    """CSS inside a `<style>` nested in a `<p>` is left out of the paragraph's text."""
    html_text = (
        "<html>\n"
        "<body>\n"
        " <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    # -- unpacking fails unless exactly one element was produced --
    [paragraph] = document.elements
    assert isinstance(paragraph, Text)
    assert paragraph.text == "Lorem ipsum dolor"
|
|
|
|
|
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
# -- HTMLDocument.from_file() --------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_read_html_doc(tmp_path: pathlib.Path, opts_args: dict[str, Any]):
    """A document written to disk and loaded via `file_path` yields the expected elements."""
    doc_path = tmp_path / "sample-doc.html"
    doc_path.write_text(
        "<html>\n"
        " <body>\n"
        " <h1>A Great and Glorious Section</h1>\n"
        " <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n"
        " <p></p>\n"
        " <p>Another Magnificent Title</p>\n"
        " <p><b>The prior element is a title based on its capitalization patterns!</b></p>\n"
        " <table>\n"
        " <tbody>\n"
        " <tr>\n"
        " <td><p>I'm in a table</p></td>\n"
        " </tr>\n"
        " </tbody>\n"
        " </table>\n"
        " <h2>A New Beginning</h2>\n"
        " <div>Here is the start of a new page.</div>\n"
        " </body>\n"
        "</html>\n"
    )
    # -- the options receive the location as a plain `str` --
    opts_args["file_path"] = str(doc_path)

    loaded = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    expected_elements = [
        Title("A Great and Glorious Section"),
        NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
        Title("Another Magnificent Title"),
        NarrativeText("The prior element is a title based on its capitalization patterns!"),
        Table("I'm in a table"),
        Title("A New Beginning"),
        NarrativeText("Here is the start of a new page."),
    ]
    assert len(loaded) == 7
    assert loaded == expected_elements
|
|
|
|
|
|
|
|
|
|
|
|
# -- HTMLDocument.elements -----------------------------------------------------------------------
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_nested_text_tags(opts_args: dict[str, Any]):
    """Text inside an `<a>` nested in a `<p>` produces exactly one element."""
    html_text = (
        "<body>\n"
        " <p>\n"
        " <a>\n"
        " There is some text here.\n"
        " </a>\n"
        " </p>\n"
        "</body>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(document.elements) == 1
|
2024-06-06 11:16:02 -07:00
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_containers_with_text_are_processed(opts_args: dict[str, Any]):
    """Text carried by nested `<div>` containers comes out as the elements listed below."""
    html_text = (
        '<div dir=3D"ltr">Hi All,<div><br></div>\n'
        " <div>Get excited for our first annual family day!</div>\n"
        ' <div>Best.<br clear=3D"all">\n'
        " <div><br></div>\n"
        " -- <br>\n"
        ' <div dir=3D"ltr">\n'
        ' <div dir=3D"ltr">Dino the Datasaur<div>\n'
        " Unstructured Technologies<br>\n"
        " <div>Data Scientist</div>\n"
        " <div>Doylestown, PA 18901</div>\n"
        " <div><br></div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        "</div>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    expected_elements = [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Text(text="\n -- "),
        Title(text="Dino the Datasaur"),
        Title(text="\n Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]
    assert document.elements == expected_elements
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_html_grabs_bulleted_text_in_tags(opts_args: dict[str, Any]):
    """Each `<li>` of an `<ol>` becomes a `ListItem` carrying the item text."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <ol>\n"
        " <li>Happy Groundhog's day!</li>\n"
        " <li>Looks like six more weeks of winter ...</li>\n"
        " </ol>\n"
        " </body>\n"
        "</html>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert document.elements == expected_elements
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_html_grabs_bulleted_text_in_paras(opts_args: dict[str, Any]):
    """A paragraph whose text starts with a bullet becomes a `ListItem` without the bullet."""
    html_text = (
        "<html>\n"
        " <body>\n"
        " <p>\n"
        " <span>• Happy Groundhog's day!</span>\n"
        " </p>\n"
        " <p>\n"
        " <span>• Looks like six more weeks of winter ...</span>\n"
        " </p>\n"
        " </body>\n"
        "</html>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    expected_elements = [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]
    assert document.elements == expected_elements
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_joins_tag_text_correctly(opts_args: dict[str, Any]):
    """Inline markup in the middle of a word ("mag<i>ic</i>al") adds no space to the text."""
    opts_args["text"] = "<p>Hello again peet mag<i>ic</i>al</p>"

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    first = elements[0]
    assert first.text == "Hello again peet magical"
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_sample_doc_with_emoji(opts_args: dict[str, Any]):
    """Paragraph text containing an emoji is extracted in one of two accepted spellings."""
    opts_args["text"] = '<html charset="unicode">\n<p>Hello again 😀</p>\n</html>'

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    accepted_texts = ("Hello again ð\x9f\x98\x80", "Hello again 😀")
    assert elements[0].text in accepted_texts
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_only_plain_text_in_body(opts_args: dict[str, Any]):
    """Bare text placed directly inside `<body>` is the text of the first element."""
    opts_args["text"] = "<body>Hello</body>"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    first = document.elements[0]
    assert first.text == "Hello"
|
2024-06-06 11:16:02 -07:00
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_plain_text_before_anything_in_body(opts_args: dict[str, Any]):
    """Bare body text preceding a `<p>` comes first; the paragraph text comes second."""
    opts_args["text"] = "<body>Hello<p>World</p></body>"

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def test_line_break_in_container(opts_args: dict[str, Any]):
    """A `<br/>` inside a `<div>` splits its text into two consecutive elements."""
    opts_args["text"] = "<div>Hello<br/>World</div>"

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("tag", html.TEXT_TAGS)
def test_line_break_in_text_tag(tag: str, opts_args: dict[str, Any]):
    """For every tag in `html.TEXT_TAGS`, a `<br/>` splits the text into two elements."""
    opts_args["text"] = f"<{tag}>Hello<br/>World</{tag}>"

    elements = HTMLDocument.load(HtmlPartitionerOptions(**opts_args)).elements

    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
@pytest.mark.parametrize(
    "tag", [name for name in html.TEXT_TAGS if name not in html.TABLE_TAGS]
)
def test_tag_types(tag: str, opts_args: dict[str, Any]):
    """Text inside any non-table tag from `html.TEXT_TAGS` yields exactly one element."""
    opts_args["text"] = f"<body>\n <{tag}>\n There is some text here.\n </{tag}>\n</body>\n"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(document.elements) == 1
|
2024-06-06 11:16:02 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("tag", EXCLUDED_TAGS)
def test_exclude_tag_types(tag: str, opts_args: dict[str, Any]):
    """Text inside any tag listed in `EXCLUDED_TAGS` yields no elements."""
    opts_args["text"] = f"<body>\n <{tag}>\n There is some text here.\n </{tag}>\n</body>\n"

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    assert len(document.elements) == 0
|
2024-06-06 11:16:02 -07:00
|
|
|
|
|
|
|
|
|
|
|
# -- _construct_text() ---------------------------------------------------------------------------
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("doc", "expected"),
    [
        # NOTE(review): `<b>` is closed with a second `</i>` in this fragment -- confirm
        # whether the malformed markup is intentional.
        (
            "<p>Hi there <span>my name is</span> <b><i>Matt</i></i></p>",
            "Hi there my name is Matt",
        ),
        ("<p>I have a</p> tail", "I have a tail"),
    ],
)
def test_construct_text(doc: str, expected: str):
    """`_construct_text()` returns `expected` for the first `<p>` parsed from `doc`."""
    tree = etree.fromstring(doc, etree.HTMLParser())
    paragraph = tree.find(".//p")
    assert paragraph is not None

    assert html._construct_text(paragraph) == expected
|
|
|
|
|
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
# -- _get_emphasized_texts_from_tag() ------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        (
            "<p>Hello <strong>there</strong> I <em>am</em> a <b>very</b> <i>important</i> text</p>",
            "p",
            (["there", "am", "very", "important"], ["strong", "em", "b", "i"]),
        ),
        (
            "<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>",
            "p",
            (["list", "my favorite things", "favorite"], ["span", "b", "i"]),
        ),
        ("<strong>A lone strong text!</strong>", "strong", (["A lone strong text!"], ["strong"])),
        ("<span>I have a</span> tail", "span", (["I have a"], ["span"])),
    ],
)
def test_get_emphasized_texts_from_tag(
    doc: str, root: str, expected: tuple[list[str], list[str]]
):
    """`_get_emphasized_texts_from_tag()` returns the `(texts, tags)` pair in `expected`.

    Args:
        doc: HTML fragment to parse with lxml's `HTMLParser`.
        root: Tag name of the element to inspect; the first match of `.//{root}` in the
            parsed tree is used.
        expected: Two parallel lists: the emphasized texts and the tag name each text came
            from. (Annotated as a tuple of lists because that is what every case above
            supplies.)
    """
    document_tree = etree.fromstring(doc, etree.HTMLParser())
    el = document_tree.find(f".//{root}")
    # -- guard: a missing element would otherwise surface as an obscure failure below --
    assert el is not None

    emphasized_texts = html._get_emphasized_texts_from_tag(el)

    assert emphasized_texts == expected
|
|
|
|
|
|
|
|
|
|
|
|
# -- _get_links_from_tag() -----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        (
            "<a href='/loner'>A lone link!</a>",
            "a",
            (["A lone link!"], ["/loner"], [-1]),
        ),
        (
            "<ul><li><a href='/wiki/Parrot'>Parrots</a></li><li>Dogs</li></ul>",
            "ul",
            (["Parrots"], ["/wiki/Parrot"], [0]),
        ),
        (
            "<ul><li><a href='/parrot'>Parrots</a></li><li><a href='/dog'>Dogs</a></li></ul>",
            "ul",
            (["Parrots", "Dogs"], ["/parrot", "/dog"], [0, 7]),
        ),
        (
            "<div>Here is <p>P tag</p> tail text. <a href='/link'>link!</a></div>",
            "div",
            (["link!"], ["/link"], [25]),
        ),
        (
            "<div>Here is <p>P tag</p><a href='/link'>link!</a></div>",
            "div",
            (["link!"], ["/link"], [13]),
        ),
    ],
)
def test_get_links_from_tag(
    doc: str, root: str, expected: tuple[list[str], list[str], list[int]]
):
    """`_get_links_from_tag()` returns the three parallel lists given in `expected`.

    Args:
        doc: HTML fragment to parse with lxml's `HTMLParser`.
        root: Tag name of the element to inspect; the first match of `.//{root}` in the
            parsed tree is used.
        expected: A 3-tuple of parallel lists: link texts, link `href` values, and one
            integer per link (presumably the link's start index within the element text --
            TODO confirm against `_get_links_from_tag()`). (Annotated as a tuple of lists
            because that is what every case above supplies.)
    """
    document_tree = etree.fromstring(doc, etree.HTMLParser())
    el = document_tree.find(f".//{root}")
    # -- guard: a missing element would otherwise surface as an obscure failure below --
    assert el is not None

    links = html._get_links_from_tag(el)

    assert links == expected
|
|
|
|
|
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
# -- _is_text_tag() ------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def test_adjacent_spans_are_text_tags(opts_args: dict[str, Any]):
    """`_is_text_tag()` is True for a `<div>` holding only two adjacent `<span>` elements."""
    markup = "<div><span>•</span><span>A bullet!</span></div>"
    document = HTMLDocument(markup, HtmlPartitionerOptions(**opts_args))

    div = document._main.find(".//div")
    assert div is not None

    assert document._is_text_tag(div) is True
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
|
2024-01-04 13:53:19 -08:00
|
|
|
# -- unit-level tests ----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-10 15:07:21 -07:00
|
|
|
class DescribeHTMLDocument:
|
|
|
|
"""Unit-test suite for `unstructured.documents.html.HTMLDocument`."""
|
|
|
|
|
|
|
|
# -- ._main ----------------------------------
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def it_can_find_the_main_element_in_the_document(self, opts_args: dict[str, Any]):
    """`._main` is the `<main>` element when the document contains one."""
    html_text = (
        "<header></header>\n"
        "<body>\n"
        " <p>Lots preamble stuff yada yada yada</p>\n"
        " <main>\n"
        " <article>\n"
        " <section>\n"
        " <h2>A Wonderful Section!</h2>\n"
        " <p>Look at this amazing section!</p>\n"
        " </section>\n"
        " <section>\n"
        " <h2>Another Wonderful Section!</h2>\n"
        " <p>Look at this other amazing section!</p>\n"
        " </section>\n"
        " </article>\n"
        " </main>\n"
        "</body>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    main_elem = document._main
    assert main_elem.tag == "main"
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
def but_it_returns_the_root_when_no_main_element_is_present(self, opts_args: dict[str, Any]):
    """`._main` is the `html` root element when the document has no `<main>` element."""
    html_text = (
        "<header></header>\n"
        "<body>\n"
        " <p>Lots preamble stuff yada yada yada</p>\n"
        " <article>\n"
        " <section>\n"
        " <h2>A Wonderful Section!</h2>\n"
        " <p>Look at this amazing section!</p>\n"
        " </section>\n"
        " <section>\n"
        " <h2>Another Wonderful Section!</h2>\n"
        " <p>Look at this other amazing section!</p>\n"
        " </section>\n"
        " </article>\n"
        "</body>\n"
    )
    opts_args["text"] = html_text

    document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))

    main_elem = document._main
    assert main_elem.tag == "html"
|
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
# -- ._parse_bulleted_text_from_table() ------
|
|
|
|
|
|
|
|
def it_extracts_bullet_text_from_table_as_ListItem_elements(self, opts_args: dict[str, Any]):
    """`_parse_bulleted_text_from_table()` generates one `ListItem` per bulleted row."""
    # -- same text as the triple-quoted fixture, spelled as adjacent literals --
    markup = (
        "\n"
        "<html>\n"
        "<body>\n"
        "<table>\n"
        "<tbody>\n"
        "<tr>\n"
        "<td>•</td>\n"
        "<td><p>Happy Groundhog's day!</p></td>\n"
        "</tr>\n"
        "<tr>\n"
        "<td>•</td>\n"
        "<td><p>Looks like six more weeks of winter ...</p></td>\n"
        "</tr>\n"
        "</tbody>\n"
        "</table>\n"
        "</body>\n"
        "</html>\n"
    )
    document = HTMLDocument(markup, HtmlPartitionerOptions(**opts_args))
    table = document._main.find(".//table")
    assert table is not None

    list_items = document._parse_bulleted_text_from_table(table)

    assert next(list_items) == ListItem(text="Happy Groundhog's day!")
    assert next(list_items) == ListItem(text="Looks like six more weeks of winter ...")
    # -- the iterator is exhausted after the two rows --
    with pytest.raises(StopIteration):
        next(list_items)
|
2024-06-10 15:07:21 -07:00
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
# -- ._parse_Table_from_table_elem() ---------
|
2024-01-04 13:53:19 -08:00
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def it_produces_one_cell_for_each_original_table_cell(self, opts_args: dict[str, Any]):
    """Each source `<td>` maps to one `<td>` in the `.text_as_html` of the parsed `Table`."""
    markup = (
        # -- include formatting whitespace to make sure it is removed --
        "<table>\n"
        " <tr>\n"
        " <td>foo</td>\n"
        " <td>bar</td>\n"
        " </tr>\n"
        "</table>"
    )
    document = HTMLDocument(markup, HtmlPartitionerOptions(**opts_args))
    table = document._main.find(".//table")
    assert table is not None

    parsed = document._parse_Table_from_table_elem(table)

    assert isinstance(parsed, Table)
    assert parsed.text == "foo bar"
    expected_html = "<table><tr><td>foo</td><td>bar</td></tr></table>"
    assert parsed.metadata.text_as_html == expected_html
|
2024-01-04 13:53:19 -08:00
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def it_accommodates_tds_with_child_elements(self, opts_args: dict[str, Any]):
    """Cells with nested child elements reduce to their text, like this SEC 10k example."""
    markup = (
        "<table>\n"
        " <tr>\n"
        " <td></td>\n"
        " <td></td>\n"
        " </tr>\n"
        " <tr>\n"
        " <td>\n"
        " <p>\n"
        " <span>\n"
        ' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
        ' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
        ' format="ixt-sec:boolballotbox">\n'
        " <span>☒</span>\n"
        " </ix:nonNumeric>\n"
        " </span>\n"
        " </p>\n"
        " </td>\n"
        " <td>\n"
        " <p>\n"
        " <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
        " ACT OF 1934</span>\n"
        " </p>\n"
        " </td>\n"
        " </tr>\n"
        "</table>\n"
    )
    document = HTMLDocument(markup, HtmlPartitionerOptions(**opts_args))
    table = document._main.find(".//table")
    assert table is not None

    parsed = document._parse_Table_from_table_elem(table)

    assert isinstance(parsed, Table)
    expected_text = (
        "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
    )
    assert parsed.text == expected_text
    # -- empty source cells stay as empty <td></td> pairs; nested markup is gone --
    expected_html = (
        "<table>"
        "<tr><td></td><td></td></tr>"
        "<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
        " EXCHANGE ACT OF 1934</td></tr>"
        "</table>"
    )
    assert parsed.metadata.text_as_html == expected_html
|
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(
    self, opts_args: dict[str, Any]
):
    """A `<table>` nested inside a `<td>` is flattened to its text within that cell."""
    # -- note <table> elements nested in <td> elements --
    html_str = (
        "<table>\n"
        " <tr>\n"
        " <td>\n"
        " <table>\n"
        " <tr><td>foo</td><td>bar</td></tr>\n"
        " <tr><td>baz</td><td>bng</td></tr>\n"
        " </table>\n"
        " </td>\n"
        " <td>\n"
        " <table>\n"
        " <tr><td>fizz</td><td>bang</td></tr>\n"
        " </table>\n"
        " </td>\n"
        " </tr>\n"
        "</table>"
    )
    expected_html = "<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
    document = HTMLDocument(html_str, HtmlPartitionerOptions(**opts_args))

    # -- `.find()` returns the first match, which is the outermost table --
    outer_table = document._main.find(".//table")
    assert outer_table is not None

    parsed_table = document._parse_Table_from_table_elem(outer_table)

    assert isinstance(parsed_table, Table)
    assert parsed_table.text == "foo bar baz bng fizz bang"
    assert parsed_table.metadata.text_as_html == expected_html
|
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
# -- ._parse_tag() ---------------------------
|
2024-01-04 13:53:19 -08:00
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title(
    self,
    is_possible_title_: Mock,
    opts_args: dict[str, Any],
):
    """`._parse_tag()` yields a plain `Text` element when the title check reports False."""
    # -- force the title check to report "not a title" --
    is_possible_title_.return_value = False
    document = HTMLDocument(
        "<p>NO PARTICULAR TYPE.</p>", HtmlPartitionerOptions(**opts_args)
    )

    paragraph = document._main.find(".//p")
    assert paragraph is not None

    element = document._parse_tag(paragraph)

    assert element == Text(text="NO PARTICULAR TYPE.")
|
2023-11-20 08:29:32 -08:00
|
|
|
|
2024-06-14 17:14:22 -07:00
|
|
|
def it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_character(
    self, opts_args: dict[str, Any]
):
    """A paragraph that opens with a bullet character parses to a `ListItem` without it."""
    document = HTMLDocument(
        "<p>● An excellent point!</p>", HtmlPartitionerOptions(**opts_args)
    )

    paragraph = document._main.find(".//p")
    assert paragraph is not None

    element = document._parse_tag(paragraph)

    assert element == ListItem("An excellent point!")
|
|
|
|
|
|
|
|
def but_not_when_the_tag_contains_only_a_bullet_character_and_no_text(
    self, opts_args: dict[str, Any]
):
    """A paragraph holding nothing but a bullet character produces no element."""
    document = HTMLDocument("<p>●</p>", HtmlPartitionerOptions(**opts_args))

    paragraph = document._main.find(".//p")
    assert paragraph is not None

    element = document._parse_tag(paragraph)

    assert element is None
|
|
|
|
|
|
|
|
def it_returns_None_when_the_tag_has_no_content(self, opts_args: dict[str, Any]):
    """An empty `<p>` produces no element."""
    document = HTMLDocument("<p></p>", HtmlPartitionerOptions(**opts_args))

    paragraph = document._main.find(".//p")
    assert paragraph is not None

    element = document._parse_tag(paragraph)

    assert element is None
|
|
|
|
|
|
|
|
def and_it_returns_None_when_the_tag_contains_only_a_stub(self, opts_args: dict[str, Any]):
    """A paragraph whose only content is `$` produces no element."""
    document = HTMLDocument("<p>$</p>", HtmlPartitionerOptions(**opts_args))

    paragraph = document._main.find(".//p")
    assert paragraph is not None

    element = document._parse_tag(paragraph)

    assert element is None
|
|
|
|
|
|
|
|
# -- ._process_list_item() -------------------------------------------------------------------
|
|
|
|
|
|
|
|
def it_continues_processing_elements_after_a_bulleted_list(self, opts_args: dict[str, Any]):
    """Text in the div after a div of bullet-only paragraphs becomes the `ListItem`."""
    html_str = (
        "\n"
        "<div>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "</div>\n"
        "<div>\n"
        "<p>An excellent point!</p>\n"
        "</div>\n"
    )
    document = HTMLDocument(html_str, HtmlPartitionerOptions(**opts_args))

    # -- `.find()` returns the first <div>, the one containing only bullets --
    bullets_div = document._main.find(".//div")
    assert bullets_div is not None

    list_item, _ = document._process_list_item(bullets_div, max_predecessor_len=10)

    assert list_item == ListItem(text="An excellent point!")
|
|
|
|
|
|
|
|
def but_it_returns_None_when_no_element_follows_empty_bullets(self, opts_args: dict[str, Any]):
    """With nothing after the div of bullet-only paragraphs, no element is produced."""
    html_str = (
        "\n"
        "<div>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "</div>\n"
    )
    document = HTMLDocument(html_str, HtmlPartitionerOptions(**opts_args))

    bullets_div = document._main.find(".//div")
    assert bullets_div is not None

    list_item, _ = document._process_list_item(bullets_div)

    assert list_item is None
|
|
|
|
|
|
|
|
def and_it_returns_None_when_following_element_contains_no_text(
    self, opts_args: dict[str, Any]
):
    """An empty div after the div of bullet-only paragraphs yields no element."""
    html_str = (
        "\n"
        "<div>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "</div>\n"
        "<div>\n"
        "</div>\n"
    )
    document = HTMLDocument(html_str, HtmlPartitionerOptions(**opts_args))

    bullets_div = document._main.find(".//div")
    assert bullets_div is not None

    list_item, _ = document._process_list_item(bullets_div)

    assert list_item is None
|
|
|
|
|
|
|
|
def it_ignores_deep_divs_when_so_instructed(self, opts_args: dict[str, Any]):
    """With `max_predecessor_len=2`, a div of five bullet-only paragraphs yields no element."""
    html_str = (
        "\n"
        "<div>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "<p>●</p>\n"
        "</div>\n"
        "<div>\n"
        "<p>An excellent point!</p>\n"
        "</div>\n"
    )
    document = HTMLDocument(html_str, HtmlPartitionerOptions(**opts_args))

    bullets_div = document._main.find(".//div")
    assert bullets_div is not None

    list_item, _ = document._process_list_item(bullets_div, max_predecessor_len=2)

    assert list_item is None
|
|
|
|
|
|
|
|
|
|
|
|
# -- module-level fixtures -----------------------------------------------------------------------
|
2024-06-06 11:16:02 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def is_possible_narrative_text_(request: FixtureRequest):
    """Result of `function_mock()` targeting `is_possible_narrative_text` in the html module."""
    target = "unstructured.documents.html.is_possible_narrative_text"
    return function_mock(request, target)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def is_possible_title_(request: FixtureRequest):
    """Result of `function_mock()` targeting `is_possible_title` in the html module."""
    target = "unstructured.documents.html.is_possible_title"
    return function_mock(request, target)
|
|
|
|
|
|
|
|
|
2024-06-13 11:19:42 -07:00
|
|
|
@pytest.fixture
def opts_args() -> dict[str, Any]:
    """Keyword arguments for constructing `HtmlPartitionerOptions`, all set to defaults.

    A test overrides only the entries it cares about before splatting the dict into the
    constructor, which keeps building `opts` compact.
    """
    default_args: dict[str, Any] = {
        "file": None,
        "file_path": None,
        "text": None,
        "encoding": None,
        "url": None,
        "headers": {},
        "ssl_verify": True,
        "date_from_file_object": False,
        "metadata_last_modified": None,
        "skip_headers_and_footers": False,
        "detection_origin": None,
    }
    return default_args
|
|
|
|
|
|
|
|
|
2024-06-06 11:16:02 -07:00
|
|
|
@pytest.fixture
def pages_prop_(request: FixtureRequest):
    """Result of `property_mock()` targeting the `pages` property of `HTMLDocument`."""
    pages_mock = property_mock(request, HTMLDocument, "pages")
    return pages_mock
|