rfctr(html): prepare for new html parser (#3257)

**Summary**
Extract as much of the mechanical refactoring from the HTML parser
change-over into this PR as possible. This leaves the next PR focused on
installing the new parser and handling the ingest-test impact.

**Reviewers:** Commits are well groomed and reviewing commit-by-commit
is probably easier.

**Additional Context**
This PR introduces the rewritten HTML parser. Its general design is
recursive, consistent with the recursive structure of HTML (a tree of
elements). It also adds the unit tests for that parser, but it does not
_install_ the parser, so the behavior of `partition_html()` is unchanged
by this PR. The next PR in this series will install it and handle the
ingest and other unit-test changes required to reflect the dozen or so
bug-fixes the new parser provides.
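
To make the recursive design concrete, here is a minimal sketch of the pattern the parser follows (an illustrative sketch only; the `PHRASING` subset and function names here are assumptions, not the committed parser's API):

```python
from lxml import etree

PHRASING = {"a", "b", "code", "em", "i", "span", "strong", "u"}  # illustrative subset

def phrasing_text(e: etree._Element) -> str:
    """Flatten a phrasing (inline) element and its descendants to plain text."""
    return (e.text or "") + "".join(phrasing_text(c) + (c.tail or "") for c in e)

def iter_paragraphs(e: etree._Element):
    """Recursively yield one whitespace-normalized "paragraph" per block item."""
    texts = [e.text or ""]
    for child in e:
        if child.tag in PHRASING:
            texts.append(phrasing_text(child))  # inline content joins the paragraph
        else:
            if "".join(texts).strip():
                yield " ".join("".join(texts).split())  # normalize whitespace
            texts = []
            yield from iter_paragraphs(child)  # a block item starts new paragraphs
        texts.append(child.tail or "")
    if "".join(texts).strip():
        yield " ".join("".join(texts).split())
```

The committed parser expresses this same recursive walk as `lxml` custom element classes (see `parser.py` later in the diff) rather than free functions.
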
Steve Canny 2024-06-21 13:59:48 -07:00 committed by GitHub
parent e1b75539f7
commit 6fe1c9980e
20 changed files with 2783 additions and 833 deletions


@@ -1,4 +1,4 @@
## 0.14.8-dev0
## 0.14.8-dev1
### Enhancements


@@ -317,7 +317,7 @@ test-no-extras:
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest \
test_${PACKAGE_NAME}/partition/test_text.py \
test_${PACKAGE_NAME}/partition/test_email.py \
test_${PACKAGE_NAME}/partition/test_html.py \
test_${PACKAGE_NAME}/partition/html/test_partition.py \
test_${PACKAGE_NAME}/partition/test_xml_partition.py
.PHONY: test-extra-csv


@@ -14,20 +14,19 @@ from lxml import etree
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
example_doc_path,
function_mock,
property_mock,
)
from unstructured.documents import html
from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
from unstructured.documents.html import HTMLDocument, HtmlPartitionerOptions
from unstructured.documents.html import HTMLDocument
from unstructured.partition.html.partition import HtmlPartitionerOptions
TAGS = (
(
@@ -59,212 +58,6 @@ EXCLUDED_TAGS = [
]
# -- table-extraction behaviors ------------------------------------------------------------------
def test_it_can_parse_a_bare_bones_table_to_a_Table_element(opts_args: dict[str, Any]):
"""Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements."""
opts_args["text"] = (
"<html>\n"
"<body>\n"
" <table>\n"
" <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
" <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
" </table>\n"
"</body>\n"
"</html>"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
# -- there is exactly one element and it's a Table instance --
(element,) = html_document.elements
assert isinstance(element, Table)
# -- table text is joined into a single string; no row or cell boundaries are represented --
assert element.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
# -- An HTML representation is also available that is longer but represents table structure.
assert element.metadata.text_as_html == (
"<table>"
"<tr><td>Lorem</td><td>Ipsum</td></tr>"
"<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
"</table>"
)
def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements(
opts_args: dict[str, Any]
):
"""Cells within a `table/thead` element are included in the text and html.
The presence of a `<thead>` element in the original also determines whether a `<thead>` element
appears in `.text_as_html` or whether the first row of cells is simply in the body.
"""
opts_args["text"] = (
"<html>\n"
"<body>\n"
" <table>\n"
" <thead>\n"
" <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
" </thead>\n"
" <tbody>\n"
" <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
" <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
" </tbody>\n"
" <tfoot>\n"
" <tr><th>Dolor</th><td>Equis</td></tr>\n"
" </tfoot>\n"
" </table>\n"
"</body>\n"
"</html>"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
(element,) = html_document.elements
assert isinstance(element, Table)
assert element.metadata.text_as_html == (
"<table>"
"<tr><td>Lorem</td><td>Ipsum</td></tr>"
"<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
"<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
"<tr><td>Dolor</td><td>Equis</td></tr>"
"</table>"
)
def test_it_does_not_emit_a_Table_element_for_a_table_with_no_text(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
"<body>\n"
" <table>\n"
" <tr><td> </td><td> </td></tr>\n"
" <tr><td> </td><td> </td></tr>\n"
" </table>\n"
"</body>\n"
"</html>"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
assert html_document.elements == []
def test_it_grabs_bulleted_text_in_tables_as_ListItem_elements(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
" <body>\n"
" <table>\n"
" <tbody>\n"
" <tr>\n"
" <td>&#8226;</td>\n"
" <td><p>Happy Groundhog's day!</p></td>\n"
" </tr>\n"
" <tr>\n"
" <td>&#8226;</td>\n"
" <td><p>Looks like six more weeks of winter ...</p></td>\n"
" </tr>\n"
" </tbody>\n"
" </table>\n"
" </body>\n"
"</html>\n"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
assert html_document.elements == [
ListItem(text="Happy Groundhog's day!"),
ListItem(text="Looks like six more weeks of winter ..."),
]
def test_it_does_not_consider_an_empty_table_a_bulleted_text_table(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
"<body>\n"
" <table>\n"
" <tr><td> </td><td> </td></tr>\n"
" <tr><td> </td><td> </td></tr>\n"
" </table>\n"
"</body>\n"
"</html>"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
html_elem = html_document._document_tree
assert html_elem is not None
table = html_elem.find(".//table")
assert table is not None
assert html_document._is_bulleted_table(table) is False
def test_it_provides_parseable_HTML_in_text_as_html(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
"<body>\n"
" <table>\n"
" <thead>\n"
" <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
" </thead>\n"
" <tbody>\n"
" <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
" <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
" </tbody>\n"
" <tfoot>\n"
" <tr><th>Dolor</th><td>Equis</td></tr>\n"
" </tfoot>\n"
" </table>\n"
"</body>\n"
"</html>"
)
html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))
(element,) = html_document.elements
assert isinstance(element, Table)
text_as_html = element.metadata.text_as_html
assert text_as_html is not None
html = etree.fromstring(text_as_html, etree.HTMLParser())
assert html is not None
# -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
assert etree.tostring(html, encoding=str) == (
"<html><body>"
"<table>"
"<tr><td>Lorem</td><td>Ipsum</td></tr>"
"<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
"<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
"<tr><td>Dolor</td><td>Equis</td></tr>"
"</table>"
"</body></html>"
)
# -- element-suppression behaviors ---------------------------------------------------------------
def test_it_does_not_extract_text_in_script_tags(opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("example-with-scripts.html")
opts = HtmlPartitionerOptions(**opts_args)
doc = HTMLDocument.load(opts)
assert all("function (" not in element.text for element in doc.elements)
def test_it_does_not_extract_text_in_style_tags(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
"<body>\n"
" <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
"</body>\n"
"</html>"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
(element,) = html_document.elements
assert isinstance(element, Text)
assert element.text == "Lorem ipsum dolor"
# -- HTMLDocument.from_file() --------------------------------------------------------------------
@@ -311,132 +104,6 @@ def test_read_html_doc(tmp_path: pathlib.Path, opts_args: dict[str, Any]):
# -- HTMLDocument.elements -----------------------------------------------------------------------
def test_nested_text_tags(opts_args: dict[str, Any]):
opts_args["text"] = (
"<body>\n"
" <p>\n"
" <a>\n"
" There is some text here.\n"
" </a>\n"
" </p>\n"
"</body>\n"
)
opts = HtmlPartitionerOptions(**opts_args)
elements = HTMLDocument.load(opts).elements
assert len(elements) == 1
def test_containers_with_text_are_processed(opts_args: dict[str, Any]):
opts_args["text"] = (
'<div dir=3D"ltr">Hi All,<div><br></div>\n'
" <div>Get excited for our first annual family day!</div>\n"
' <div>Best.<br clear=3D"all">\n'
" <div><br></div>\n"
" -- <br>\n"
' <div dir=3D"ltr">\n'
' <div dir=3D"ltr">Dino the Datasaur<div>\n'
" Unstructured Technologies<br>\n"
" <div>Data Scientist</div>\n"
" <div>Doylestown, PA 18901</div>\n"
" <div><br></div>\n"
" </div>\n"
" </div>\n"
" </div>\n"
" </div>\n"
"</div>\n"
)
opts = HtmlPartitionerOptions(**opts_args)
html_document = HTMLDocument.load(opts)
assert html_document.elements == [
Text(text="Hi All,"),
NarrativeText(text="Get excited for our first annual family day!"),
Title(text="Best."),
Text(text="\n -- "),
Title(text="Dino the Datasaur"),
Title(text="\n Unstructured Technologies"),
Title(text="Data Scientist"),
Address(text="Doylestown, PA 18901"),
]
def test_html_grabs_bulleted_text_in_tags(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
" <body>\n"
" <ol>\n"
" <li>Happy Groundhog's day!</li>\n"
" <li>Looks like six more weeks of winter ...</li>\n"
" </ol>\n"
" </body>\n"
"</html>\n"
)
opts = HtmlPartitionerOptions(**opts_args)
assert HTMLDocument.load(opts).elements == [
ListItem(text="Happy Groundhog's day!"),
ListItem(text="Looks like six more weeks of winter ..."),
]
def test_html_grabs_bulleted_text_in_paras(opts_args: dict[str, Any]):
opts_args["text"] = (
"<html>\n"
" <body>\n"
" <p>\n"
" <span>&#8226; Happy Groundhog's day!</span>\n"
" </p>\n"
" <p>\n"
" <span>&#8226; Looks like six more weeks of winter ...</span>\n"
" </p>\n"
" </body>\n"
"</html>\n"
)
opts = HtmlPartitionerOptions(**opts_args)
assert HTMLDocument.load(opts).elements == [
ListItem(text="Happy Groundhog's day!"),
ListItem(text="Looks like six more weeks of winter ..."),
]
def test_joins_tag_text_correctly(opts_args: dict[str, Any]):
opts_args["text"] = "<p>Hello again peet mag<i>ic</i>al</p>"
opts = HtmlPartitionerOptions(**opts_args)
doc = HTMLDocument.load(opts)
assert doc.elements[0].text == "Hello again peet magical"
def test_sample_doc_with_emoji(opts_args: dict[str, Any]):
opts_args["text"] = '<html charset="unicode">\n<p>Hello again 😀</p>\n</html>'
opts = HtmlPartitionerOptions(**opts_args)
doc = HTMLDocument.load(opts)
# NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
# and the byte string representation when running locally on mac
assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"]
def test_only_plain_text_in_body(opts_args: dict[str, Any]):
opts_args["text"] = "<body>Hello</body>"
opts = HtmlPartitionerOptions(**opts_args)
assert HTMLDocument.load(opts).elements[0].text == "Hello"
def test_plain_text_before_anything_in_body(opts_args: dict[str, Any]):
opts_args["text"] = "<body>Hello<p>World</p></body>"
opts = HtmlPartitionerOptions(**opts_args)
doc = HTMLDocument.load(opts)
assert doc.elements[0].text == "Hello"
assert doc.elements[1].text == "World"
def test_line_break_in_container(opts_args: dict[str, Any]):
opts_args["text"] = "<div>Hello<br/>World</div>"
opts = HtmlPartitionerOptions(**opts_args)
doc = HTMLDocument.load(opts)
assert doc.elements[0].text == "Hello"
assert doc.elements[1].text == "World"
@pytest.mark.parametrize("tag", html.TEXT_TAGS)
def test_line_break_in_text_tag(tag: str, opts_args: dict[str, Any]):
opts_args["text"] = f"<{tag}>Hello<br/>World</{tag}>"
@@ -456,16 +123,6 @@ def test_tag_types(tag: str, opts_args: dict[str, Any]):
assert len(elements) == 1
@pytest.mark.parametrize("tag", EXCLUDED_TAGS)
def test_exclude_tag_types(tag: str, opts_args: dict[str, Any]):
opts_args["text"] = f"<body>\n <{tag}>\n There is some text here.\n </{tag}>\n</body>\n"
opts = HtmlPartitionerOptions(**opts_args)
elements = HTMLDocument.load(opts).elements
assert len(elements) == 0
# -- _construct_text() ---------------------------------------------------------------------------
@@ -683,89 +340,6 @@ class DescribeHTMLDocument:
"<table><tr><td>foo</td><td>bar</td></tr></table>"
)
def it_accommodates_tds_with_child_elements(self, opts_args: dict[str, Any]):
"""Like this example from an SEC 10k filing."""
opts = HtmlPartitionerOptions(**opts_args)
html_str = (
"<table>\n"
" <tr>\n"
" <td></td>\n"
" <td></td>\n"
" </tr>\n"
" <tr>\n"
" <td>\n"
" <p>\n"
" <span>\n"
' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
' format="ixt-sec:boolballotbox">\n'
" <span>&#9746;</span>\n"
" </ix:nonNumeric>\n"
" </span>\n"
" </p>\n"
" </td>\n"
" <td>\n"
" <p>\n"
" <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
" ACT OF 1934</span>\n"
" </p>\n"
" </td>\n"
" </tr>\n"
"</table>\n"
)
html_document = HTMLDocument(html_str, opts)
table_elem = html_document._main.find(".//table")
assert table_elem is not None
html_table = html_document._parse_Table_from_table_elem(table_elem)
assert isinstance(html_table, Table)
assert html_table.text == (
"☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
)
assert html_table.metadata.text_as_html == (
"<table>"
"<tr><td></td><td></td></tr>"
"<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
" EXCHANGE ACT OF 1934</td></tr>"
"</table>"
)
def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(
self, opts_args: dict[str, Any]
):
"""Recursively ..."""
opts = HtmlPartitionerOptions(**opts_args)
# -- note <table> elements nested in <td> elements --
html_str = (
"<table>\n"
" <tr>\n"
" <td>\n"
" <table>\n"
" <tr><td>foo</td><td>bar</td></tr>\n"
" <tr><td>baz</td><td>bng</td></tr>\n"
" </table>\n"
" </td>\n"
" <td>\n"
" <table>\n"
" <tr><td>fizz</td><td>bang</td></tr>\n"
" </table>\n"
" </td>\n"
" </tr>\n"
"</table>"
)
html_document = HTMLDocument(html_str, opts)
table_elem = html_document._main.find(".//table")
assert table_elem is not None
html_table = html_document._parse_Table_from_table_elem(table_elem)
assert isinstance(html_table, Table)
assert html_table.text == "foo bar baz bng fizz bang"
assert html_table.metadata.text_as_html == (
"<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
)
# -- ._parse_tag() ---------------------------
def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title(


@@ -0,0 +1,785 @@
# pyright: reportPrivateUsage=false
# pyright: reportUnknownArgumentType=false
"""Test suite for `unstructured.partition.html.parser` module."""
from __future__ import annotations
from collections import deque
import pytest
from lxml import etree
from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title
from unstructured.partition.html.parser import (
Anchor,
Annotation,
DefaultElement,
Flow,
Phrasing,
RemovedPhrasing,
TextSegment,
_consolidate_annotations,
_normalize_text,
html_parser,
)
# -- MODULE-LEVEL FUNCTIONS ----------------------------------------------------------------------
# -- _consolidate_annotations() ------------------
def it_gathers_annotations_from_text_segments():
text_segments = [
TextSegment(
" Ford Prefect ",
{
"link_texts": "Ford Prefect",
"link_url": "https://wikipedia/Ford_Prefect",
"emphasized_text_contents": "Ford Prefect",
"emphasized_text_tags": "b",
},
),
TextSegment(
" alien encounter",
{
"emphasized_text_contents": "alien encounter",
"emphasized_text_tags": "bi",
},
),
]
annotations = _consolidate_annotations(text_segments)
assert annotations == {
# -- each distinct key gets a list of values --
"emphasized_text_contents": ["Ford Prefect", "alien encounter"],
"emphasized_text_tags": ["b", "bi"],
# -- even when there is only one value --
"link_texts": ["Ford Prefect"],
"link_url": ["https://wikipedia/Ford_Prefect"],
}
# -- and the annotations mapping is immutable --
with pytest.raises(TypeError, match="object does not support item assignment"):
annotations["new_key"] = "foobar" # pyright: ignore[reportIndexIssue]
# -- (but not its list values unfortunately) --
annotations["emphasized_text_tags"].append("xyz")
assert annotations["emphasized_text_tags"] == ["b", "bi", "xyz"]
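# A plausible implementation sketch for `_consolidate_annotations()`, consistent
# with the assertions above (illustration only, not the committed code): gather
# values per key in document order, then freeze the mapping so item assignment
# raises TypeError while the list values themselves stay mutable.
from collections import defaultdict
from types import MappingProxyType

def _consolidate_annotations_sketch(text_segments):
    merged = defaultdict(list)
    for _, annotation in text_segments:
        for key, value in annotation.items():
            merged[key].extend(value if isinstance(value, list) else [value])
    return MappingProxyType(dict(merged))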
# -- _normalize_text() ---------------------------
@pytest.mark.parametrize(
("text", "expected_value"),
[
# -- already normalized text is left unchanged --
("iterators allow", "iterators allow"),
# -- newlines are treated as whitespace --
("algorithm\nto be", "algorithm to be"),
(" separated\n from ", "separated from"),
("\n container\n details\n ", "container details"),
(
"\n iterators allow \n algorithm to be \nexpressed without container \nnoise",
"iterators allow algorithm to be expressed without container noise",
),
],
)
def test_normalize_text_produces_normalized_text(text: str, expected_value: str):
assert _normalize_text(text) == expected_value
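# A minimal sketch consistent with the cases above (an assumption, not the
# committed code): collapse every whitespace run, including newlines, to a
# single space and trim the ends.
def _normalize_text_sketch(text: str) -> str:
    return " ".join(text.split())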
# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------
class DescribeFlow:
"""Isolated unit-test suite for `unstructured.partition.html.parser.Flow`.
The `Flow` class provides most behaviors for flow (block-level) elements.
"""
# -- .is_phrasing -----------------------------------------------------
def it_knows_it_is_NOT_a_phrasing_element(self):
p = etree.fromstring("<p>Hello</p>", html_parser).xpath(".//p")[0]
assert isinstance(p, Flow)
assert p.is_phrasing is False
# -- .iter_elements() -------------------------------------------------
def it_generates_the_document_elements_from_the_Flow_element(self):
"""Phrasing siblings of child block elements are processed with text or tail.
In the general case, a Flow element can contain text, phrasing content, and child flow
elements.
Each of these five lines in this example is a "paragraph" and gives rise to a distinct
document-element.
"""
html_text = """
<div>
Text of div <b>with <i>hierarchical</i>\nphrasing</b> content before first block item
<p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
tail of block item <b>with <i>hierarchical</i> phrasing </b> content
<p>second block item</p>
tail of block item <b>with <i> hierarchical </i></b> phrasing content
</div>
"""
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
elements = div.iter_elements()
e = next(elements)
assert e == Title("Text of div with hierarchical phrasing content before first block item")
assert e.metadata.to_dict() == {
"category_depth": 0,
"emphasized_text_contents": ["with", "hierarchical", "phrasing"],
"emphasized_text_tags": ["b", "bi", "b"],
}
e = next(elements)
assert e == NarrativeText("Click here to see the blurb for this block item.")
assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
e = next(elements)
assert e == Title("tail of block item with hierarchical phrasing content")
assert e.metadata.to_dict() == {
"category_depth": 0,
"emphasized_text_contents": ["with", "hierarchical", "phrasing"],
"emphasized_text_tags": ["b", "bi", "b"],
}
e = next(elements)
assert e == Title("second block item")
assert e.metadata.to_dict() == {"category_depth": 0}
e = next(elements)
assert e == Title("tail of block item with hierarchical phrasing content")
assert e.metadata.to_dict() == {
"category_depth": 0,
"emphasized_text_contents": ["with", "hierarchical"],
"emphasized_text_tags": ["b", "bi"],
}
with pytest.raises(StopIteration):
e = next(elements)
# -- ._category_depth() -----------------------------------------------
@pytest.mark.parametrize(
("html_text", "tag", "ElementCls", "expected_value"),
[
("<p>Ford... you're turning into a penguin. Stop it.<p>", "p", Text, None),
("<p>* thanks for all the fish.</p>", "p", ListItem, 0),
("<li>thanks for all the fish.</li>", "li", ListItem, 0),
("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
("<dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
("<p>Examples</p>", "p", Title, 0),
("<h1>Examples</h1>", "h1", Title, 0),
("<h2>Examples</h2>", "h2", Title, 1),
("<h3>Examples</h3>", "h3", Title, 2),
("<h4>Examples</h4>", "h4", Title, 3),
("<h5>Examples</h5>", "h5", Title, 4),
("<h6>Examples</h6>", "h6", Title, 5),
],
)
def it_computes_the_category_depth_to_help(
self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None
):
e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0]
assert e._category_depth(ElementCls) == expected_value
# -- ._element_from_text_or_tail() ------------------------------------
def it_assembles_text_and_tail_document_elements_to_help(self):
"""Text and tails and their phrasing content are both processed the same way."""
html_text = "<div>The \n Roman <b>poet <i> Virgil</i> gave</b> his <q>pet</q> fly</div>"
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
elements = div._element_from_text_or_tail(div.text, deque(div), Text)
e = next(elements)
# -- element text is normalized --
assert e == Text("The Roman poet Virgil gave his pet fly")
# -- individual annotations are consolidated --
assert e.metadata.to_dict() == {
"emphasized_text_contents": ["poet", "Virgil", "gave"],
"emphasized_text_tags": ["b", "bi", "b"],
}
def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self):
html_text = "<div> <b> \n <i> \n </i> </b> <q> \n </q> \n </div>"
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
elements = div._element_from_text_or_tail(div.text, deque(div), Text)
with pytest.raises(StopIteration):
next(elements)
def it_uses_the_specified_element_class_to_form_the_document_element(self):
html_text = "<div>\n The line-storm clouds fly tattered and swift\n</div>"
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
elements = div._element_from_text_or_tail(div.text, deque(div), Address)
e = next(elements)
assert e == Address("The line-storm clouds fly tattered and swift")
assert e.metadata.to_dict() == {}
with pytest.raises(StopIteration):
next(elements)
def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self):
html_text = "<div>\n The line-storm clouds fly tattered and swift,\n</div>"
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
elements = div._element_from_text_or_tail(div.text, deque(div))
assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,")
def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self):
html_text = "<div> * </div>"
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
elements = div._element_from_text_or_tail(div.text, deque(div))
with pytest.raises(StopIteration):
next(elements)
# -- ._iter_text_segments() -------------------------------------------
@pytest.mark.parametrize(
("html_text", "expected_value"),
[
( # -- text with no phrasing --
"<p>Ford... you're turning into a penguin.<p>",
[("Ford... you're turning into a penguin.", {})],
),
( # -- text with phrasing --
"<p>Ford... <b>you're turning</b> into\na <i>penguin</i>.<p>",
[
("Ford... ", {}),
(
"you're turning",
{"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"},
),
(" into\na ", {}),
(
"penguin",
{"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"},
),
(".", {}),
],
),
( # -- text with nested phrasing --
"<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>",
[
("Ford... ", {}),
(
"you're ",
{"emphasized_text_contents": "you're", "emphasized_text_tags": "b"},
),
(
"turning",
{"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"},
),
(" into a penguin.", {}),
],
),
],
)
def it_recursively_generates_text_segments_from_text_and_phrasing_to_help(
self, html_text: str, expected_value: list[Annotation]
):
p = etree.fromstring(html_text, html_parser).xpath(".//p")[0]
text_segments = list(p._iter_text_segments(p.text, deque(p)))
assert text_segments == expected_value
class DescribePre:
"""Isolated unit-test suite for `unstructured.partition.html.parser.Pre`.
The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
"""
def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
"""A `<pre>` element can contain only phrasing content."""
html_text = (
"<pre>\n"
" The Answer to the Great Question... Of Life, the Universe and Everything...\n"
" Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
"</pre>\n"
)
pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]
elements = pre.iter_elements()
e = next(elements)
assert e == Text(
" The Answer to the Great Question... Of Life, the Universe and Everything...\n"
" Is... Forty-two, said Deep Thought, with infinite majesty and calm."
)
with pytest.raises(StopIteration):
next(elements)
@pytest.mark.parametrize(
("html_text", "expected_value"),
[
# -- a newline in the 0th position of pre.text is dropped --
("<pre>\n foo </pre>", " foo "),
# -- but not when preceded by any other whitespace --
("<pre> \n foo </pre>", " \n foo "),
# -- and only one is dropped --
("<pre>\n\n foo </pre>", "\n foo "),
# -- a newline in the -1th position is dropped --
("<pre> foo \n</pre>", " foo "),
# -- but not when followed by any other whitespace --
("<pre> foo \n </pre>", " foo \n "),
# -- and only one is dropped --
("<pre> foo \n\n</pre>", " foo \n"),
# -- newlines in both positions are dropped --
("<pre>\n foo \n</pre>", " foo "),
# -- or not when not at the absolute edge --
("<pre> \n foo \n </pre>", " \n foo \n "),
],
)
def but_it_strips_a_single_leading_or_trailing_newline(
self, html_text: str, expected_value: str
):
"""Content starts on next line when opening `<pre>` tag is immediately followed by `\n`"""
pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]
e = next(pre.iter_elements())
assert e.text == expected_value
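# The stripping exercised above matches this minimal sketch (an assumption,
# not the committed code): drop exactly one newline at the absolute start and
# at the absolute end, and nothing else.
def _strip_pre_edges_sketch(text: str) -> str:
    if text.startswith("\n"):
        text = text[1:]
    if text.endswith("\n"):
        text = text[:-1]
    return text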
def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
html_text = '<pre>You\'re <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>'
pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]
e = next(pre.iter_elements())
assert e.text == "You're turning into a penguin."
assert e.metadata.emphasized_text_contents == ["turning"]
assert e.metadata.emphasized_text_tags == ["b"]
assert e.metadata.link_texts == ["penguin"]
assert e.metadata.link_urls == ["http://eie.io"]
class DescribeRemovedBlock:
"""Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.
This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
"""
def it_is_skipped_during_parsing(self):
html_text = """
<div>
<hr/>
<figure>
<img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
<figcaption>An elephant at sunset</figcaption>
</figure>
<p>Content we want.</p>
</div>
"""
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
assert list(div.iter_elements()) == [NarrativeText("Content we want.")]
# -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------
class DescribePhrasing:
"""Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.
The `Phrasing` class provides most behaviors for phrasing (inline) elements.
"""
def it_knows_it_is_a_phrasing_element(self):
b = etree.fromstring("<b>Hello</b>", html_parser).xpath(".//b")[0]
assert isinstance(b, Phrasing)
assert b.is_phrasing is True
@pytest.mark.parametrize(
("html_text", "expected_value"),
[
# -- an empty element produces no text segments --
("<code></code>", []),
# -- element text produces one segment --
("<data> foo </data>", [(" foo ", {})]),
# -- element tail produces one segment --
("<dfn/> bar ", [(" bar ", {})]),
# -- element descendants each produce one segment --
("<kbd><mark>foo <meter>bar</meter></mark></kbd>", [("foo ", {}), ("bar", {})]),
# -- and any combination produces a segment for each text, child, and tail --
(
"<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd>",
[
(" ", {}),
("foo ", {}),
("bar", {}),
(" baz", {}),
(" ", {}),
],
),
],
)
def it_generates_text_segments_for_its_text_and_children_and_tail(
self, html_text: str, expected_value: list[TextSegment]
):
e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0]
assert list(e.iter_text_segments()) == expected_value
def it_forms_its_annotations_from_emphasis(self):
cite = etree.fromstring("<cite> rhombus </cite>", html_parser).xpath(".//cite")[0]
assert cite._annotation(cite.text, "bi") == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "bi",
}
def but_not_when_text_is_empty_or_whitespace(self):
cite = etree.fromstring("<cite> </cite>", html_parser).xpath(".//cite")[0]
assert cite._annotation(cite.text, "bi") == {}
def and_not_when_there_is_no_emphasis(self):
cite = etree.fromstring("<cite>rhombus</cite>", html_parser).xpath(".//cite")[0]
assert cite._annotation(cite.text, "") == {}
def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(self):
abbr = etree.fromstring("<abbr>LLM</abbr>", html_parser).xpath(".//abbr")[0]
assert abbr._inside_emphasis("xyz") == "xyz"
class DescribeBold:
"""Isolated unit-test suite for `unstructured.partition.html.parser.Bold`.
The `Bold` class is used for `<b>` and `<strong>` tags and adds emphasis metadata.
"""
def it_annotates_its_text_segment_with_bold_emphasis(self):
b = etree.fromstring("<b>rhombus</b>", html_parser).xpath(".//b")[0]
text_segments = b.iter_text_segments()
text, annotation = next(text_segments)
assert text == "rhombus"
assert annotation == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "b",
}
def and_its_children_are_also_annotated_with_bold_emphasis(self):
b = etree.fromstring("<b>rhombus <i>pentagon</i></b>", html_parser).xpath(".//b")[0]
text_segments = b.iter_text_segments()
text, annotation = next(text_segments)
assert text == "rhombus "
assert annotation == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "b",
}
text, annotation = next(text_segments)
assert text == "pentagon"
assert annotation == {
"emphasized_text_contents": "pentagon",
"emphasized_text_tags": "bi",
}
def but_not_its_tail(self):
b = etree.fromstring("<b>rhombus</b> pentagon", html_parser).xpath(".//b")[0]
text_segments = b.iter_text_segments()
text, annotation = next(text_segments)
assert text == "rhombus"
assert annotation == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "b",
}
text, annotation = next(text_segments)
assert text == " pentagon"
assert annotation == {}
class DescribeItalic:
"""Isolated unit-test suite for `unstructured.partition.html.parser.Italic`.
The `Italic` class is used for `<i>` and `<em>` tags and adds emphasis metadata.
"""
def it_annotates_its_text_segment_with_italic_emphasis(self):
i = etree.fromstring("<i>rhombus</i>", html_parser).xpath(".//i")[0]
text_segments = i.iter_text_segments()
text, annotation = next(text_segments)
assert text == "rhombus"
assert annotation == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "i",
}
def and_its_children_are_also_annotated_with_italic_emphasis(self):
em = etree.fromstring("<em>rhombus <b>pentagon</b></em>", html_parser).xpath(".//em")[0]
text_segments = em.iter_text_segments()
text, annotation = next(text_segments)
assert text == "rhombus "
assert annotation == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "i",
}
text, annotation = next(text_segments)
assert text == "pentagon"
assert annotation == {
"emphasized_text_contents": "pentagon",
"emphasized_text_tags": "bi",
}
def but_not_its_tail(self):
i = etree.fromstring("<i>rhombus</i> pentagon", html_parser).xpath(".//i")[0]
text_segments = i.iter_text_segments()
text, annotation = next(text_segments)
assert text == "rhombus"
assert annotation == {
"emphasized_text_contents": "rhombus",
"emphasized_text_tags": "i",
}
text, annotation = next(text_segments)
assert text == " pentagon"
assert annotation == {}
class DescribeLineBreak:
"""Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`.
Used for `<br/>` elements. Its only special behavior is to add whitespace so that phrasing
butted up tight on both sides of the `<br/>` element is not joined; e.g. `abc<br/>def` should
become "abc def", not "abcdef".
"""
def it_adds_a_newline_in_its_place(self):
cite = etree.fromstring(
"<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>", html_parser
).xpath(".//cite")[0]
text_segments = cite.iter_text_segments()
texts = [ts.text for ts in text_segments]
assert texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"]
assert _normalize_text("".join(texts)) == "spaceships of the Vogon Constructor Fleet"
class DescribeRemovedPhrasing:
"""Isolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`.
Used for phrasing elements like `<label>` that we want to skip, including any content they
enclose. The tail of such an element is not skipped though.
"""
def it_behaves_like_an_empty_element(self):
label = etree.fromstring(
"<div>\n"
" <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>\n"
" Like vastly, hugely big.\n"
"</div>",
html_parser,
).xpath(".//label")[0]
(text_segment,) = list(label.iter_text_segments())
assert isinstance(label, RemovedPhrasing)
assert label.is_phrasing is True
assert text_segment.text == "\n Like vastly, hugely big.\n"
# -- DUAL-ROLE ELEMENTS --------------------------------------------------------------------------
class DescribeAnchor:
"""Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`.
The `Anchor` class is used for `<a>` tags and provides link metadata.
"""
# -- .is_phrasing -----------------------------------------------------
@pytest.mark.parametrize(
("html_text", "expected_value"),
[
# -- an empty <a> identifies as phrasing --
('<a href="http://eie.io"></a>', True),
# -- an <a> with text but no children identifies as phrasing --
('<a href="http://eie.io">“O Deep Thought computer," he said,</a>', True),
# -- an <a> with no text and only phrasing children identifies as phrasing --
('<a href="http://eie.io"><i>“O Deep Thought computer,"</i></a>', True),
# -- an <a> with both text and phrasing children identifies as phrasing --
('<a href="http://eie.io">“O <b>Deep Thought</b> computer,"</a>', True),
# -- but an <a> with a block-item child does not --
('<a href="http://eie.io"><p>“O Deep Thought computer,"</p></a>', False),
# -- and an <a> with both text and a block-item child does not --
('<a href="http://eie.io">“O Deep Thought computer,"<div>he said,</div></a>', False),
# -- and an <a> with text and both block and phrasing children does not --
('<a href="http://eie.io">“O <b>Deep</b> Thought <div>computer," he</div></a>', False),
],
)
def it_determines_whether_it_is_phrasing_dynamically(
self, html_text: str, expected_value: bool
):
a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
assert isinstance(a, Anchor)
assert a.is_phrasing is expected_value
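# A plausible sketch of the dynamic check (an assumption, not the committed
# code): an <a> identifies as phrasing only when all of its children do.
def _anchor_is_phrasing_sketch(a) -> bool:
    return all(child.is_phrasing for child in a)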
# -- .iter_elements() -------------------------------------------------
def it_can_also_act_as_a_block_item(self):
html_text = """
<div>
<a href="http://eie.io">
O Deep Thought computer, he said,
<div>The task we have designed you to perform is this.</div>
<p>We want you to tell us.... he paused,</p>
</a>
</div>
"""
a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
elements = a.iter_elements()
assert [e.text for e in elements] == [
"O Deep Thought computer, he said,",
"The task we have designed you to perform is this.",
"We want you to tell us.... he paused,",
]
# -- .iter_text_segments() --------------------------------------------
@pytest.mark.parametrize(
("html_text", "expected_value"),
[
# -- produces no text-segment or annotation for anchor.text when there is none --
('<a href="http://abc.com"></a>', []),
# -- but it produces a text-segment for the tail if there is one --
('<a href="http://abc.com"></a> long tail ', [TextSegment(" long tail ", {})]),
# -- produces text-segment but no annotation for anchor.text when it is whitespace --
('<a href="http://abc.com"> </a>', [TextSegment(" ", {})]),
# -- produces text-segment and annotation for anchor text
# -- Note link-texts annotation is whitespace-normalized but text-segment text is not.
(
'<a href="http://abc.com"> click here </a>',
[
TextSegment(
" click here ",
{"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
)
],
),
# -- produces text-segment for both text and tail when present --
(
'<a href="http://abc.com"> click here </a> long tail',
[
TextSegment(
" click here ",
{"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
),
TextSegment(" long tail", {}),
],
),
# -- nested phrasing inside <a> element is handled as expected --
(
'<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>',
[
TextSegment(
"one with the Force",
{
"emphasized_text_contents": ["the"],
"emphasized_text_tags": ["i"],
"link_texts": ["one with the Force"],
"link_urls": ["http://eie.io"],
},
),
TextSegment(".", {}),
],
),
],
)
def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment(
self, html_text: str, expected_value: list[TextSegment]
):
a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
assert list(a.iter_text_segments()) == expected_value
# -- DEFAULT ELEMENT -----------------------------------------------------------------------------
class DescribeDefaultElement:
"""Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.
Used for any element we haven't assigned a custom element-class to. This prominently includes
any non-HTML elements that can be embedded in the HTML.
It identifies as a phrasing element but it can behave as either a block-item or phrasing. Its
behavior is a combination of RemovedBlock and RemovedPhrasing. Namely, it iterates zero elements
and only iterates a text-segment for its tail.
"""
# -- .is_phrasing -----------------------------------------------------
def it_identifies_as_a_phrasing_element(self):
foobar = etree.fromstring("<foobar>Vogon</foobar>", html_parser).xpath(".//foobar")[0]
assert isinstance(foobar, DefaultElement)
assert foobar.is_phrasing is True
# -- .iter_elements() -------------------------------------------------
def it_generates_zero_elements_as_a_block_item(self):
"""Should never be called but belts and suspenders."""
foobar = etree.fromstring(
"<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>",
html_parser,
).xpath(".//foobar")[0]
elements = foobar.iter_elements()
with pytest.raises(StopIteration):
next(elements)
# -- .iter_text_segments() --------------------------------------------
def it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing(self):
foobar = etree.fromstring(
"<div>\n"
" O Deep Thought computer, he said,\n"
" <foobar>Vogon Constructor Fleet</foobar>\n"
" The task we have designed you to perform is this.\n"
" <p>We want you to tell us.... he paused,</p>\n"
"</div>",
html_parser,
).xpath(".//foobar")[0]
texts = [ts.text for ts in foobar.iter_text_segments()]
assert texts == ["\n The task we have designed you to perform is this.\n "]
def and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element(self):
div = etree.fromstring(
"<div>\n"
" O Deep Thought computer, he said,\n"
" <foobar>Vogon Constructor Fleet</foobar>\n"
" The task we have designed you to perform is this.\n"
" <p>We want you to tell us.... he paused,</p>\n"
"</div>",
html_parser,
).xpath(".//div")[0]
texts = [e.text for e in div.iter_elements()]
assert texts == [
"O Deep Thought computer, he said, The task we have designed you to perform is this.",
"We want you to tell us.... he paused,",
]
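# Taken together, the suite drives the parser through lxml directly; the basic
# pattern used throughout the tests above is (illustrative document and output;
# actual element types depend on the text-type heuristics):
from lxml import etree
from unstructured.partition.html.parser import html_parser

div = etree.fromstring("<div><p>Hello <b>world</b></p></div>", html_parser).xpath(".//div")[0]
for element in div.iter_elements():
    print(type(element).__name__, "-", element.text)  # e.g. Title - Hello world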

typings/lxml/_types.pyi

@@ -0,0 +1,29 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from typing import Any, Callable, Collection, Protocol, TypeVar
from typing_extensions import TypeAlias
from .etree import QName, _Element, _ElementTree
_ET = TypeVar("_ET", bound=_Element, default=_Element)
_ET_co = TypeVar("_ET_co", bound=_Element, default=_Element, covariant=True)
_KT_co = TypeVar("_KT_co", covariant=True)
_VT_co = TypeVar("_VT_co", covariant=True)
_AttrName: TypeAlias = str
_ElemPathArg: TypeAlias = str | QName
_ElementOrTree: TypeAlias = _ET | _ElementTree[_ET]
_TagName: TypeAlias = str
_TagSelector: TypeAlias = _TagName | Callable[..., _Element]
_XPathObject = Any
class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...


@@ -0,0 +1,14 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from ._classlookup import ElementBase as ElementBase
from ._classlookup import ElementDefaultClassLookup as ElementDefaultClassLookup
from ._element import _Element as _Element
from ._element import _ElementTree as _ElementTree
from ._module_func import fromstring as fromstring
from ._module_func import tostring as tostring
from ._module_misc import QName as QName
from ._nsclasses import ElementNamespaceClassLookup as ElementNamespaceClassLookup
from ._parser import HTMLParser as HTMLParser
from ._parser import XMLParser as XMLParser


@@ -0,0 +1,75 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from ._element import _Element
class ElementBase(_Element):
"""The public Element class
Original Docstring
------------------
All custom Element classes must inherit from this one.
To create an Element, use the `Element()` factory.
BIG FAT WARNING: Subclasses *must not* override `__init__` or
`__new__` as it is absolutely undefined when these objects will be
created or destroyed. All persistent state of Elements must be
stored in the underlying XML. If you really need to initialize
the object after creation, you can implement an ``_init(self)``
method that will be called directly after object creation.
Subclasses of this class can be instantiated to create a new
Element. By default, the tag name will be the class name and the
namespace will be empty. You can modify this with the following
class attributes:
* TAG - the tag name, possibly containing a namespace in Clark
notation
* NAMESPACE - the default namespace URI, unless provided as part
of the TAG attribute.
* HTML - flag if the class is an HTML tag, as opposed to an XML
tag. This only applies to un-namespaced tags and defaults to
false (i.e. XML).
* PARSER - the parser that provides the configuration for the
newly created document. Providing an HTML parser here will
default to creating an HTML element.
In user code, the latter three are commonly inherited in class
hierarchies that implement a common namespace.
"""
def __init__(
self,
*children: object,
attrib: dict[str, str] | None = None,
**_extra: str,
) -> None: ...
def _init(self) -> None: ...
class ElementClassLookup:
"""Superclass of Element class lookups"""
class ElementDefaultClassLookup(ElementClassLookup):
"""Element class lookup scheme that always returns the default Element
class.
The keyword arguments ``element``, ``comment``, ``pi`` and ``entity``
accept the respective Element classes."""
def __init__(
self,
element: type[ElementBase] | None = None,
) -> None: ...
class FallbackElementClassLookup(ElementClassLookup):
"""Superclass of Element class lookups with additional fallback"""
@property
def fallback(self) -> ElementClassLookup | None: ...
def __init__(self, fallback: ElementClassLookup | None = None) -> None: ...
def set_fallback(self, lookup: ElementClassLookup) -> None:
"""Sets the fallback scheme for this lookup method"""


@@ -0,0 +1,50 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from typing import Collection, Generic, Iterator, TypeVar, overload
from typing_extensions import Self
from .. import _types as _t
_T = TypeVar("_T")
class _Element:
@overload
def __getitem__(self, __x: int) -> Self: ...
@overload
def __getitem__(self, __x: slice) -> list[Self]: ...
def __contains__(self, __o: object) -> bool: ...
def __len__(self) -> int: ...
def __iter__(self) -> Iterator[Self]: ...
def find(self, path: _t._ElemPathArg) -> Self | None: ...
@overload
def get(self, key: _t._AttrName) -> str | None: ...
@overload
def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
def iterancestors(
self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
) -> Iterator[Self]: ...
@overload
def itertext(self, *tags: _t._TagSelector, with_tail: bool = True) -> Iterator[str]: ...
@overload
def itertext(
self,
*,
tag: _t._TagSelector | Collection[_t._TagSelector] | None = None,
with_tail: bool = True,
) -> Iterator[str]: ...
@property
def tag(self) -> str: ...
@property
def tail(self) -> str | None: ...
@property
def text(self) -> str | None: ...
def xpath(
self,
_path: str,
/,
) -> _t._XPathObject: ...
class _ElementTree(Generic[_t._ET_co]): ...


@@ -0,0 +1,19 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from .._types import _ElementOrTree
from ..etree import HTMLParser, XMLParser
from ._element import _Element
def fromstring(text: str | bytes, parser: XMLParser | HTMLParser) -> _Element: ...
# Under XML Canonicalization (C14N) mode, most arguments are ignored,
# some arguments would even raise exception outright if specified.
def tostring(
element_or_tree: _ElementOrTree,
*,
encoding: str | type[str] | None = None,
pretty_print: bool = False,
with_tail: bool = True,
) -> str: ...


@@ -0,0 +1,5 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
class QName: ...


@@ -0,0 +1,31 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from typing import Iterable, Iterator, MutableMapping, TypeVar
from .._types import SupportsLaxedItems
from ._classlookup import ElementBase, ElementClassLookup, FallbackElementClassLookup
_KT = TypeVar("_KT")
_VT = TypeVar("_VT")
class _NamespaceRegistry(MutableMapping[_KT, _VT]):
def __delitem__(self, __key: _KT) -> None: ...
def __getitem__(self, __key: _KT) -> _VT: ...
def __setitem__(self, __key: _KT, __value: _VT) -> None: ...
def __iter__(self) -> Iterator[_KT]: ...
def __len__(self) -> int: ...
def update( # type: ignore[override]
self,
class_dict_iterable: SupportsLaxedItems[_KT, _VT] | Iterable[tuple[_KT, _VT]],
) -> None: ...
def items(self) -> list[tuple[_KT, _VT]]: ... # type: ignore[override]
def iteritems(self) -> Iterator[tuple[_KT, _VT]]: ...
def clear(self) -> None: ...
class _ClassNamespaceRegistry(_NamespaceRegistry[str | None, type[ElementBase]]): ...
class ElementNamespaceClassLookup(FallbackElementClassLookup):
def __init__(self, fallback: ElementClassLookup | None = None) -> None: ...
def get_namespace(self, ns_uri: str | None) -> _ClassNamespaceRegistry: ...


@@ -0,0 +1,41 @@
from __future__ import annotations
from ._classlookup import ElementClassLookup
class HTMLParser:
def __init__(
self,
*,
encoding: str | None = None,
remove_blank_text: bool = False,
remove_comments: bool = False,
remove_pis: bool = False,
strip_cdata: bool = True,
no_network: bool = True,
recover: bool = True,
compact: bool = True,
default_doctype: bool = True,
collect_ids: bool = True,
huge_tree: bool = False,
) -> None: ...
def set_element_class_lookup(self, lookup: ElementClassLookup | None = None) -> None: ...
class XMLParser:
def __init__(
self,
*,
encoding: str | None = None,
attribute_defaults: bool = False,
dtd_validation: bool = False,
load_dtd: bool = False,
no_network: bool = True,
ns_clean: bool = False,
recover: bool = False,
huge_tree: bool = False,
remove_blank_text: bool = False,
remove_comments: bool = False,
remove_pis: bool = False,
strip_cdata: bool = True,
collect_ids: bool = True,
compact: bool = True,
) -> None: ...
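# These stubs cover only the lxml surface the new parser uses. The wiring they
# support is lxml's standard custom element-class pattern, roughly as below
# (standard lxml API; the exact registration code in parser.py may differ):
from lxml import etree

parser = etree.HTMLParser(remove_comments=True)
lookup = etree.ElementNamespaceClassLookup()
parser.set_element_class_lookup(lookup)

class Bold(etree.ElementBase):
    """Custom element-class instantiated for matching tags."""

lookup.get_namespace(None)["b"] = Bold  # parse <b> elements as Bold instances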


@@ -1 +1 @@
__version__ = "0.14.8-dev0" # pragma: no cover
__version__ = "0.14.8-dev1" # pragma: no cover


@@ -2,9 +2,8 @@
from __future__ import annotations
from typing import IO, Final, Iterator, cast
from typing import TYPE_CHECKING, Final, Iterator, cast
import requests
from lxml import etree
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
@@ -19,8 +18,6 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
@@ -29,7 +26,10 @@ from unstructured.partition.text_type import (
is_us_city_state_zip,
)
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
from unstructured.utils import htmlify_matrix_of_cell_texts, is_temp_file_path, lazyproperty
from unstructured.utils import htmlify_matrix_of_cell_texts, lazyproperty
if TYPE_CHECKING:
from unstructured.partition.html.partition import HtmlPartitionerOptions
TEXT_TAGS: Final[list[str]] = ["p", "a", "td", "span", "b", "font"]
LIST_ITEM_TAGS: Final[list[str]] = ["li", "dd"]
@@ -57,7 +57,7 @@ class HTMLDocument:
@classmethod
def load(cls, opts: HtmlPartitionerOptions) -> HTMLDocument:
"""Construct instance from whatever source is specified in `opts`."""
return cls(opts.html_str, opts)
return cls(opts.html_text, opts)
@lazyproperty
def elements(self) -> list[Element]:
@@ -452,108 +452,6 @@ class HTMLDocument:
yield element
class HtmlPartitionerOptions:
"""Encapsulates partitioning option validation, computation, and application of defaults."""
# TODO: this eventually moves to `unstructured.partition.html` but not until `HTMLDocument`
# becomes `_HtmlPartitioner` and moves there with it.
def __init__(
self,
*,
file_path: str | None,
file: IO[bytes] | None,
text: str | None,
encoding: str | None,
url: str | None,
headers: dict[str, str],
ssl_verify: bool,
date_from_file_object: bool,
metadata_last_modified: str | None,
skip_headers_and_footers: bool,
detection_origin: str | None,
):
self._file_path = file_path
self._file = file
self._text = text
self._encoding = encoding
self._url = url
self._headers = headers
self._ssl_verify = ssl_verify
self._date_from_file_object = date_from_file_object
self._metadata_last_modified = metadata_last_modified
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
@lazyproperty
def detection_origin(self) -> str | None:
"""Trace of initial partitioner to be included in metadata for debugging purposes."""
return self._detection_origin
@lazyproperty
def encoding(self) -> str | None:
"""Caller-provided encoding used to store HTML character stream as bytes.
`None` when no encoding was provided and encoding should be auto-detected.
"""
return self._encoding
@lazyproperty
def html_str(self) -> str:
"""The HTML document as a string, loaded from wherever the caller specified."""
if self._file_path:
return read_txt_file(filename=self._file_path, encoding=self._encoding)[1]
if self._file:
return read_txt_file(file=self._file, encoding=self._encoding)[1]
if self._text:
return str(self._text)
if self._url:
response = requests.get(self._url, headers=self._headers, verify=self._ssl_verify)
if not response.ok:
raise ValueError(
f"Error status code on GET of provided URL: {response.status_code}"
)
content_type = response.headers.get("Content-Type", "")
if not content_type.startswith("text/html"):
raise ValueError(f"Expected content type text/html. Got {content_type}.")
return response.text
raise ValueError("Exactly one of filename, file, text, or url must be specified.")
@lazyproperty
def last_modified(self) -> str | None:
"""The best last-modified date available, None if no sources are available."""
# -- Value explicitly specified by caller takes precedence. This is used for example when
# -- this file was converted from another format.
if self._metadata_last_modified:
return self._metadata_last_modified
if self._file_path:
return (
None
if is_temp_file_path(self._file_path)
else get_last_modified_date(self._file_path)
)
if self._file:
return (
get_last_modified_date_from_file(self._file)
if self._date_from_file_object
else None
)
return None
@lazyproperty
def skip_headers_and_footers(self) -> bool:
"""When True, elements located within a header or footer are pruned."""
return self._skip_headers_and_footers
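# As `load()` above shows, an options object is the sole input to partitioning.
# A usage sketch built from the constructor arguments shown in `__init__`
# (imports reflect the new module layout in this PR; values are illustrative):
from unstructured.documents.html import HTMLDocument
from unstructured.partition.html.partition import HtmlPartitionerOptions

opts = HtmlPartitionerOptions(
    file_path=None,
    file=None,
    text="<p>Hello world</p>",
    encoding=None,
    url=None,
    headers={},
    ssl_verify=True,
    date_from_file_object=False,
    metadata_last_modified=None,
    skip_headers_and_footers=False,
    detection_origin=None,
)
elements = HTMLDocument.load(opts).elements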
# -- tag processors ------------------------------------------------------------------------------


@@ -1,105 +0,0 @@
from __future__ import annotations
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument, HtmlPartitionerOptions
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.lang import apply_lang_metadata
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@add_chunking_strategy
def partition_html(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
text: Optional[str] = None,
encoding: Optional[str] = None,
url: Optional[str] = None,
headers: dict[str, str] = {},
ssl_verify: bool = True,
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
languages: Optional[list[str]] = ["auto"],
metadata_last_modified: Optional[str] = None,
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
HTML source parameters
----------------------
The HTML to be partitioned can be specified four different ways:
filename
A string defining the target filename path.
file
A file-like object using "r" mode --> open(filename, "r").
text
The string representation of the HTML document.
url
The URL of a webpage to parse. Only for URLs that return an HTML document.
headers
The HTTP headers to be used in the HTTP request when `url` is specified.
ssl_verify
If the URL parameter is set, determines whether or not SSL verification is performed
on the HTTP request.
date_from_file_object
Applies only when providing the file via the `file` parameter. If this option is True,
attempt to infer last_modified metadata from the file's bytes, otherwise set it to None.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
Other parameters
----------------
include_metadata
Optionally allows for excluding metadata from the output. Primarily intended
for when partition_html is called by other partitioners (like partition_email).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
metadata_last_modified
The last modified date for the document.
skip_headers_and_footers
If True, ignores any content that is within <header> or <footer> tags
source_format
The source of the original html. If None we will return HTMLElements but for example
partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle
"""
# -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url:
return []
opts = HtmlPartitionerOptions(
file_path=filename,
file=file,
text=text,
encoding=encoding,
url=url,
headers=headers,
ssl_verify=ssl_verify,
date_from_file_object=date_from_file_object,
metadata_last_modified=metadata_last_modified,
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
)
document = HTMLDocument.load(opts)
elements = list(
apply_lang_metadata(
document.elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
)
return elements
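# The public `partition_html()` API is unchanged by this PR; a call like the
# following behaves the same before and after, now resolved through the new
# package `__init__` below (illustrative input and output):
from unstructured.partition.html import partition_html

elements = partition_html(text="<h1>Introduction</h1><p>This is a narrative paragraph.</p>")
print([type(e).__name__ for e in elements])  # e.g. ['Title', 'NarrativeText']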


@@ -0,0 +1,3 @@
from unstructured.partition.html.partition import partition_html
__all__ = ["partition_html"]


@@ -0,0 +1,774 @@
# pyright: reportPrivateUsage=false
"""Provides the HTML parser used by `partition_html()`.
The names "flow" and "phrasing" derive from the language of the HTML Standard.
PRINCIPLES
- _Elements are paragraphs._ Each paragraph in the HTML document should become a distinct element.
In particular, a paragraph should not be split into two elements and an element should not contain
more than one paragraph.
- _An empty paragraph is not an Element._ A paragraph which contains no text or contains only
whitespace does not give rise to an Element (is skipped).
- _The browser rendering is the document._ The HTML "source-code" is not the document. The document
is the way that HTML is rendered by a browser (Chrome for a first authority). This foundational
principle gives rise to a few that are more specific.
- _Whitespace is normalized._ Whitespace used for formatting the HTML source is _normalized_ to a
single space between text segments. More specifically:
- Any leading or trailing space on a paragraph is removed.
- All other runs of whitespace in the paragraph are reduced to a single space (" ").
- Whitespace is never added where none existed in the HTML source.
- Whitespace within a `<pre>` element is the exception and is not normalized. Its
whitespace is preserved excepting a leading and/or trailing newline ("\n").
- _Block-items are paragraphs._ Visible content in HTML can be divided into _block-items_ and
_phrasing content_ (aka. _inline content_).
- As an example, a `<p>` element is a block item and a `<b>` element is phrasing.
- A block item starts a new paragraph and so represents an Element boundary.
- A phrasing item affects the appearance of a run of text within a paragraph, like
making it bold or making it into a link.
- Some elements can take either role, depending upon their ancestors and descendants.
- The final authority for whether a particular element is displayed as a block or as
inline "formatting" is the CSS. We do not attempt to interpret the CSS and instead assume
the default role for each element.
Other background
- The parser's design is _recursive_, consistent with the recursive (tree) structure of HTML. The
nodes of the tree are _HTML elements_. Unfortunately this naming sometimes conflicts with
Unstructured _document-elements_. In the parser code the term "document-element" is used when
there may be ambiguity.
- The parser is primarily composed of `lxml` Custom Element Classes. The gist is you write a class
like `Anchor` and then tell the `lxml` parser that all `<a>` elements should be instantiated using
the `Anchor` class. We also provide a default class for any elements that we haven't called out
explicitly.
- _Anatomy of an HTML element._ Some basic terms are important to know to understand the domain
language of the parser code. Consider this example:
```html
<div>
<p>Text <b>bold child</b> tail of child</p>
tail of p
</div>
```
- An element can have _text_.
- All visible content within an HTML document is the text (or tail) of some element.
- The text of the `<p>` element (`p.text`) is "Text ".
- Note the formatting whitespace is included.
- An element can have _child elements_.
- The `<p>` element (`p`) is a child of `div`.
- `b` is a child of `p`.
- An element can have a _tail_.
- Whatever text follows an element, before the next element starts, is the tail of
that element.
- `b.tail` is `" tail of child"`. Note the included whitespace.
- `p.tail` is `"\n tail of p\n"`.
- Tail text is _accessed_ via the element that precedes it but that element does not
_influence_ its tail text. For example, "tail of child" does not appear in a bold
typeface even though it is the tail of `b`.
"""
from __future__ import annotations
import itertools
from collections import defaultdict, deque
from types import MappingProxyType
from typing import Any, Iterable, Iterator, Mapping, NamedTuple, cast
from lxml import etree
from typing_extensions import TypeAlias
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
EmailAddress,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import htmlify_matrix_of_cell_texts
# ------------------------------------------------------------------------------------------------
# DOMAIN MODEL
# ------------------------------------------------------------------------------------------------
Annotation: TypeAlias = Mapping[str, Any]
"""A mapping with zero or more keywords, each represening a noted characteristic.
An annotation can be associated with a text segment or element. In general the keys and value-types
differ between the individual (text-segment) and consolidated (Element) forms.
"""
def _consolidate_annotations(text_segments: Iterable[TextSegment]) -> Annotation:
"""Combine individual text-segment annotations into an element-level annotation.
Sequence is significant.
"""
combined_annotations = cast(defaultdict[str, list[str]], defaultdict(list))
for ts in text_segments:
for k, v in ts.annotation.items():
if isinstance(v, list):
combined_annotations[k].extend(cast(list[Any], v))
else:
combined_annotations[k].append(v)
return MappingProxyType(dict(combined_annotations))
def _normalize_text(text: str) -> str:
"""`text` with normalized whitespace.
- leading and trailing whitespace are removed
- all whitespace segments within text (spacing between words) are reduced to a single space
each.
Produces the empty string when `text` contains only whitespace.
"""
return " ".join(text.strip().split())
class TextSegment(NamedTuple):
"""An annotated string from a Phrasing element.
Annotations are for emphasis and for links. The text includes any leading, trailing, and
inter-word whitespace, just as it occurred in the HTML. The text-segments for a paragraph are
consolidated once the paragraph is fully parsed and whitespace is normalized at that time. It
cannot be normalized prior to that without distorting or losing inter-word spacing.
However, text within annotations, like the text of a link, is normalized since its full extents
are known.
"""
text: str
annotation: Annotation
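A small sketch of how segment-level annotations consolidate at the element level; the keys follow the link annotations used below, the values are hypothetical:

```python
segments = (
    TextSegment("Python", {"link_texts": "Python", "link_urls": "https://python.org"}),
    TextSegment(" is fun", {}),  # un-annotated segments contribute nothing
)
assert dict(_consolidate_annotations(segments)) == {
    "link_texts": ["Python"],
    "link_urls": ["https://python.org"],
}
```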
# ------------------------------------------------------------------------------------------------
# CUSTOM ELEMENT-CLASSES
# ------------------------------------------------------------------------------------------------
# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------
class Flow(etree.ElementBase):
"""Base and default class for elements that act like a div.
These can contain other flow elements or phrasing elements.
"""
# -- by default, choose the element class based on the form of the text --
_ElementCls = None
@property
def is_phrasing(self) -> bool:
return False
def iter_elements(self) -> Iterator[Element]:
"""Generate paragraph string for each block item within."""
# -- place child elements in a queue --
q: deque[Flow | Phrasing] = deque(self)
yield from self._element_from_text_or_tail(self.text or "", q, self._ElementCls)
while q:
assert not q[0].is_phrasing
block_item = cast(Flow, q.popleft())
yield from block_item.iter_elements()
yield from self._element_from_text_or_tail(block_item.tail or "", q)
def _category_depth(self, ElementCls: type[Element]) -> int | None:
"""Not clear on concept. Something to do with hierarchy ..."""
if ElementCls is ListItem:
return (
len([e for e in self.iterancestors() if e.tag in ("dl", "ol", "ul")])
if self.tag in ("li", "dd")
else 0
)
if ElementCls is Title:
return int(self.tag[1]) - 1 if self.tag in ("h1", "h2", "h3", "h4", "h5", "h6") else 0
return None
def _element_from_text_or_tail(
self, text: str, q: deque[Flow | Phrasing], ElementCls: type[Element] | None = None
) -> Iterator[Element]:
"""Generate zero-or-one paragraph formed from text and leading phrasing elements.
Note this mutates `q` by popping phrasing elements off as they are processed.
"""
text_segments = tuple(self._iter_text_segments(text, q))
normalized_text = " ".join("".join(ts.text for ts in text_segments).split())
if not normalized_text:
return
# -- if we don't have a more specific element-class, choose one based on the text --
if ElementCls is None:
ElementCls = derive_element_type_from_text(normalized_text)
# -- normalized text that contains only a bullet character is skipped --
if ElementCls is None:
return
# -- derived ListItem means text starts with a bullet character that needs removing --
if ElementCls is ListItem:
normalized_text = clean_bullets(normalized_text)
if not normalized_text:
return
category_depth = self._category_depth(ElementCls)
yield ElementCls(
normalized_text,
metadata=ElementMetadata(
**_consolidate_annotations(text_segments), category_depth=category_depth
),
)
def _iter_text_segments(self, text: str, q: deque[Flow | Phrasing]) -> Iterator[TextSegment]:
"""Generate zero-or-more `TextSegment`s from text and leading phrasing elements.
This is used to process the text or tail of a flow element. For example, this <div>:
<div>
For a <b>moment, <i>nothing</i> happened.</b>
<p>Then, after a second or so, nothing continued to happen.</p>
The dolphins had always believed that <em>they</em> were far more intelligent.
</div>
Should generate three distinct elements, one for each contained line. This method is
invoked to process the first line beginning "For a" and the third line beginning "The dolphins".
Note this method mutates `q` by popping phrasing elements off as they are processed.
"""
yield TextSegment(text, {})
while q and q[0].is_phrasing:
e = cast(Phrasing, q.popleft())
yield from e.iter_text_segments()
class BlockItem(Flow):
"""Custom element-class for `<p>` element, `<h1>`, and others like it.
These can appear in a flow container like a div but can only contain phrasing content.
"""
# -- Turns out there are no implementation differences so far between Flow and BlockItem, but
# -- maintaining the distinction for now. We may use it to add hierarchy information or
# -- customize how we deal with invalid HTML that places flow items inside one of these.
class Heading(Flow):
"""An `<h1>..<h6>` element.
These are distinguished because they generate a `Title` element.
"""
_ElementCls = Title
class ListBlock(Flow):
"""Either a `<ul>` or `<ol>` element, maybe a `<dl>` element at some point.
The primary reason for distinguishing these is because they increment the hierarchy depth for
lists that are nested inside them.
Can only contain `<li>` elements (ignoring `<script>` and `<template>`). A list nested inside
must actually be a child of one of these `<li>` elements.
"""
# TODO: might want alternate `.iter_elements()` since these can only contain `<li>` elements and
# not text nodes (I believe).
class ListItemBlock(Flow):
"""A `<li>` element.
These are distinguished because they generate a `ListItem` element.
"""
_ElementCls = ListItem
class Pre(BlockItem):
"""Custom element-class for `<pre>` element.
Can only contain phrasing content.
"""
def iter_elements(self) -> Iterator[Element]:
"""Generate zero or one document element for the entire `<pre>` element.
Whitespace is preserved just as it appears in the source HTML.
"""
pre_text = self.text or ""
# -- this is pretty subtle, but in a browser, if the opening `<pre>` is immediately
# -- followed by a newline, that newline is removed from the rendered text.
if pre_text.startswith("\n"):
pre_text = pre_text[1:]
text_segments = tuple(self._iter_text_segments(pre_text, deque(self)))
text = "".join(ts.text for ts in text_segments)
# -- also subtle, but in a browser, if the closing `</pre>` tag is immediately preceded
# -- by a newline (starts in column 1), that preceding newline is removed too.
if text.endswith("\n"):
text = text[:-1]
if not text:
return
ElementCls = derive_element_type_from_text(text)
if not ElementCls:
return
yield ElementCls(text, metadata=ElementMetadata(**_consolidate_annotations(text_segments)))
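A sketch of the trimming behavior, assuming the parser registrations at the bottom of this module are in scope:

```python
root = etree.fromstring("<pre>\ndef f():\n    pass\n</pre>", html_parser)
(element,) = root.find(".//pre").iter_elements()
assert element.text == "def f():\n    pass"  # interior whitespace preserved, hugging newlines dropped
```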
class TableBlock(Flow):
"""Custom element-class for `<table>` element."""
def iter_elements(self) -> Iterator[Table]:
"""Generate paragraph string for each block item within."""
# -- NOTE this algorithm handles a nested table by parsing all of its text into the text
# -- for the _cell_ containing that table (recursively, so a table nested within a cell of
# -- a table that is itself nested within a cell works too).
trs = cast(list[etree._Element], self.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr"))
if not trs:
return
def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
"""Generate the text of each cell in `tr`."""
# -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
tds = cast(list[etree._Element], tr.xpath("./td | ./th"))
for td in tds:
# -- a cell can contain other elements like spans etc. so we can't count on the
# -- text being directly below the `<td>` element. `.itertext()` gets all of it
# -- recursively. Filter out whitespace text nodes resulting from HTML formatting.
stripped_text_nodes = (t.strip() for t in td.itertext())
yield " ".join(t for t in stripped_text_nodes if t)
table_data = [list(iter_cell_texts(tr)) for tr in trs]
html_table = htmlify_matrix_of_cell_texts(table_data)
table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()
if table_text == "":
return
yield Table(table_text, metadata=ElementMetadata(text_as_html=html_table))
class RemovedBlock(Flow):
"""Elements that are to be ignored.
An element may be ignored because it commonly contains boilerplate that would dilute the meaning
extracted rather than contribute to it.
All contents of a removed block item are ignored but its tail is emitted by its container.
"""
def iter_elements(self) -> Iterator[Element]:
"""Don't generate any document-elements."""
return
yield
# -- PHRASING ELEMENTS ---------------------------------------------------------------------------
class Phrasing(etree.ElementBase):
"""Base-class for phrasing (inline/run) elements like bold and italic."""
@property
def is_phrasing(self) -> bool:
return True
def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
"""Generate text segments for text, children, and tail of this element."""
inside_emphasis = self._inside_emphasis(enclosing_emphasis)
yield from self._iter_text_segment(inside_emphasis)
# -- Recurse into any nested tags. All child tags are assumed to also be phrasing tags. --
yield from self._iter_child_text_segments(inside_emphasis)
# -- It is the phrasing element's job to emit its tail when it has one (there is no one
# -- else who can do it). Note that the tail gets the _enclosing-emphasis_, not the
# -- _inside-emphasis_ since the tail occurs after this phrasing element's closing tag.
yield from self._iter_tail_segment(enclosing_emphasis)
def _annotation(self, text: str, emphasis: str) -> Annotation:
"""Emphasis annotations that apply to text inside this element.
No annotations are added when the text contains only whitespace. Otherwise, emphasis
annotations are returned for the text contents, normalized as it will appear in the
document-element.
Emphasis annotations apply to the contents of all elements enclosed by the emphasis element.
Sub-classes like the one for anchor elements that add non-emphasis annotations will need to
override this method.
"""
# -- emphasis annotation is only added when there is both emphasis and non-whitespace text
# -- to apply it to
return MappingProxyType(
{"emphasized_text_contents": normalized_text, "emphasized_text_tags": emphasis}
if (normalized_text := _normalize_text(text)) and emphasis
else {}
)
def _inside_emphasis(self, enclosing_emphasis: str) -> str:
"""By default, the inside emphasis is the same as the outside emphasis.
This method is overridden by sub-classes that annotate particular emphasis types but many
phrasing elements do not contribute to annotations.
"""
return enclosing_emphasis
def _iter_child_text_segments(self, emphasis: str) -> Iterator[TextSegment]:
"""Generate zero-or-more text-segments for phrasing children of this element.
All generated text segments will be annotated with `emphasis` when it is other than the
empty string.
"""
for child in self:
yield from child.iter_text_segments(emphasis)
def _iter_tail_segment(self, emphasis: str) -> Iterator[TextSegment]:
"""Generate zero-or-one text-segment for tail of this element.
No text-segment is generated when this element has no tail node. However a segment _is_
generated for a whitespace-only tail node.
"""
if tail := self.tail:
yield TextSegment(tail, self._annotation(tail, emphasis))
def _iter_text_segment(self, emphasis: str) -> Iterator[TextSegment]:
"""Generate zero-or-one text-segment for text of this element.
No text-segment is generated when this element has no text node. However a segment _is_
generated for a whitespace-only text node.
"""
if text := self.text:
yield TextSegment(text, self._annotation(text, emphasis))
class Bold(Phrasing):
"""Provides annotations for bold/strong text."""
def _inside_emphasis(self, enclosing_emphasis: str) -> str:
"""Emphasis tags that apply to text inside this element.
Formed by adding "b" (for "bold") to the enclosing emphasis, unless it's already there.
The returned emphasis string is sorted to make its form canonical, which eases testing. For
example, "bi" and "ib" are semantically the same but don't directly compare equal in a
test. Sorting gives the string set-like properties.
"""
chars = set(enclosing_emphasis + "b")
return "".join(sorted(chars))
class Italic(Phrasing):
"""Provides annotations for italic/emphasized text."""
def _inside_emphasis(self, enclosing_emphasis: str) -> str:
"""Emphasis tags that apply to text inside this element.
Formed by adding "i" (for "italic") to the enclosing emphasis, unless it's already there.
"""
chars = set(enclosing_emphasis + "i")
return "".join(sorted(chars))
class LineBreak(Phrasing):
"""A `<br/>` line-break element.
Its only special behavior is to add whitespace so that phrasing tight on both sides is not
joined; `abc<br/>def` becomes "abc def", not "abcdef".
"""
def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
"""Generate text segments for text, children, and tail of this element."""
yield TextSegment("\n", {})
yield from self._iter_tail_segment(enclosing_emphasis)
class RemovedPhrasing(Phrasing):
"""Phrasing where we want to skip the content.
- `.is_phrasing` is True so it doesn't break the paragraph like a block.
- `element.text` is discarded
- `element.tail` is preserved
"""
def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
"""Generate text segment for tail only of this element."""
yield from self._iter_tail_segment(enclosing_emphasis)
# -- DUAL-ROLE ELEMENTS --------------------------------------------------------------------------
class Anchor(Phrasing, Flow):
"""Custom element-class for `<a>` element.
Provides link annotations.
"""
@property
def is_phrasing(self) -> bool:
"""False when the `<a>` element contains any block items, True otherwise."""
return all(e.is_phrasing for e in self)
def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
"""Generate text segments for text and tail of this element, when they exist.
The behavior for an anchor element is slightly different because link annotations are only
added to the text, not the tail. Also an anchor can have no children.
"""
# -- the text of the link is everything inside the `<a>` element, text and child text --
text_segments = tuple(
itertools.chain(
self._iter_text_segment(enclosing_emphasis),
self._iter_child_text_segments(enclosing_emphasis),
)
)
link_text = "".join("".join(ts.text for ts in text_segments))
# -- the link_text and link_url annotation refers to the entire text inside the `<a>` --
link_text_segment = TextSegment(
link_text, self._link_annotations(link_text, enclosing_emphasis)
)
# -- but the emphasis annotations must come from the individual text segments within --
consolidated_annotations = _consolidate_annotations((link_text_segment, *text_segments))
# -- generate at most one text-segment for the `<a>` element, the full enclosed text with
# -- consolidated emphasis and link annotations.
if link_text:
yield TextSegment(link_text, consolidated_annotations)
# -- A tail is emitted when present whether anchor itself was or not --
yield from self._iter_tail_segment(enclosing_emphasis)
def _link_annotations(self, text: str, emphasis: str) -> Annotation:
"""Link and emphasis annotations that apply to the text of this anchor.
An anchor element does not add any emphasis but uses any introduced by enclosing elements.
"""
normalized_text = _normalize_text(text)
if not normalized_text:
return {}
def iter_annotation_pairs() -> Iterator[tuple[str, Any]]:
# -- emphasis annotation is only added when there is enclosing emphasis --
if emphasis:
yield "emphasized_text_contents", normalized_text
yield "emphasized_text_tags", emphasis
if href := self.get("href"):
yield "link_texts", normalized_text
yield "link_urls", href
return MappingProxyType(dict(iter_annotation_pairs()))
# -- DEFAULT ELEMENT -----------------------------------------------------------------------------
class DefaultElement(Flow, Phrasing):
"""Custom element-class used for any element without an assigned custom element class.
An unrecognized element is given both Flow (block) and Phrasing (inline) behaviors. It behaves
like a Flow element when nested in a Flow element and like a Phrasing element when nested in a
Phrasing element.
The contents of the element are skipped in either case, but its tail is not skipped when it
behaves as a Phrasing element. The tail is processed by its parent when that parent is a Flow
element.
"""
@property
def is_phrasing(self) -> bool:
"""If asked (by a parent Flow element), identify as a phrasing element.
It's not possible to determine the display intent (block|inline) of an unknown element
(like `<foobar>`) and phrasing is less disruptive, adding the tail of this element to any
text or phrasing content before and after it without starting a new paragraph.
"""
return True
def iter_elements(self) -> Iterator[Element]:
"""Don't generate any document-elements when behaving like a Flow element.
Because the element identifies as phrasing and will always be enclosed by at least a
`<body>` element, this method should never be called. However, it's easier to prove it does
the appropriate thing if it is called than to prove that it can never happen.
"""
return
yield
def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
"""Generate text segment for tail of this element only.
This method is only called on Phrasing elements and their children. In that case, act like a
Phrasing element but don't generate a text segment for this element or any children. Do
however generate a tail text-segment.
"""
# -- It is the phrasing element's job to emit its tail when it has one (there is no one
# -- else who can do it). Note that the tail gets the _enclosing-emphasis_, not the
# -- _inside-emphasis_ since the tail occurs after this phrasing element's closing tag.
yield from self._iter_tail_segment(enclosing_emphasis)
# ------------------------------------------------------------------------------------------------
# TEXT-ELEMENT CLASSIFIER
# ------------------------------------------------------------------------------------------------
def derive_element_type_from_text(text: str) -> type[Text] | None:
"""Produce a document-element of the appropriate sub-type for `text`."""
if is_bulleted_text(text):
return ListItem
if is_us_city_state_zip(text):
return Address
if is_email_address(text):
return EmailAddress
if len(text) < 2:
return None
if is_possible_narrative_text(text):
return NarrativeText
# NOTE (scanny): Classifying short paragraphs as titles produces noise much more frequently
# than it does value. A `Title` element is very consequential in its effect on chunking and
# document hierarchy. Classifying any small paragraph as a heading is frequently wrong and
# throws off these important downstream processes much more than missing the occasional
# heading does. If we want to infer headings, I think we have to be much more intelligent
# about it and consider what elements came before and after to see if the text _behaves_ like
# a heading, maybe whether it is bold and how many text elements follow it before the next
# title and how long since the prior title, whether `h1..h6` are used elsewhere in the
# document, etc.
if is_possible_title(text):
return Title
return Text
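A few examples of the classifier's behavior; the heavy lifting is delegated to the predicates imported from `unstructured.partition.text_type`:

```python
assert derive_element_type_from_text("• Lorem ipsum") is ListItem
assert derive_element_type_from_text("DOYLESTOWN, PA 18901") is Address
assert derive_element_type_from_text("hello@example.com") is EmailAddress
assert derive_element_type_from_text("x") is None  # under two characters, skipped
```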
# ------------------------------------------------------------------------------------------------
# HTML PARSER
# ------------------------------------------------------------------------------------------------
html_parser = etree.HTMLParser(remove_comments=True)
# -- elements that don't have a registered class get DefaultElement --
fallback = etree.ElementDefaultClassLookup(element=DefaultElement)
# -- elements that do have a registered class are assigned that class via lookup --
element_class_lookup = etree.ElementNamespaceClassLookup(fallback)
html_parser.set_element_class_lookup(element_class_lookup)
# -- register classes --
element_class_lookup.get_namespace(None).update(
{
# -- flow/containers --
"address": Flow,
"article": Flow,
"aside": Flow,
"blockquote": Flow,
"body": Flow,
"center": Flow,
"div": Flow,
"footer": Flow,
"header": Flow,
"hgroup": Flow,
"main": Flow,
"section": Flow,
# -- block items --
"h1": Heading,
"h2": Heading,
"h3": Heading,
"h4": Heading,
"h5": Heading,
"h6": Heading,
"p": BlockItem,
"pre": Pre,
# -- list blocks --
"ol": ListBlock,
"ul": ListBlock,
"li": ListItemBlock,
# -- table --
"table": TableBlock,
# -- annotated phrasing --
"a": Anchor,
"b": Bold,
"em": Italic,
"i": Italic,
"strong": Bold,
# -- transparent phrasing --
"abbr": Phrasing, # -- abbreviation, like "LLM (Large Language Model)"
"bdi": Phrasing, # -- Bidirectional Isolate - important for RTL languages
"bdo": Phrasing, # -- Bidirectional Override - maybe reverse
"big": Phrasing, # -- deprecated --
"br": LineBreak, # -- line break --
"cite": Phrasing, # -- title of book or article etc. --
"code": Phrasing, # -- monospaced terminal font --
"data": Phrasing, # -- similar to `time`, provides machine readable value as attribute --
"dfn": Phrasing, # -- definition, like new term in italic when first introduced --
"kbd": Phrasing, # -- font that looks like keyboard keys --
"mark": Phrasing, # -- like yellow highlighter --
"meter": Phrasing, # -- bar thermometer progress-meter thing --
"q": Phrasing, # -- inline quotation, usually quoted and maybe italic --
"s": Phrasing, # -- strikethrough --
"samp": Phrasing, # -- sample terminal output; like markdown back-ticks for inline code --
"small": Phrasing, # -- fine-print; maybe likely boilerplate --
"span": Phrasing,
"strike": Phrasing, # -- deprecated - obsolete version of `del` or `s` --
"sub": Phrasing, # -- subscript --
"sup": Phrasing, # -- superscript --
"time": Phrasing, # -- wrap human-readable time to provide machine-readable time as attr --
"tt": Phrasing, # -- deprecated - "teletype", obsolete version of `code` or `samp` --
"u": Phrasing, # -- red squiggly underline for e.g. spelling mistake; was underscore --
"var": Phrasing, # -- variable like "x" in a mathematical expression --
"wbr": Phrasing, # -- word-break opportunity; empty --
# -- removed phrasing --
"button": RemovedPhrasing,
"label": RemovedPhrasing,
# -- removed block --
"details": RemovedBlock, # -- likely boilerplate --
"dl": RemovedBlock,
"dd": RemovedBlock,
"dt": RemovedBlock,
"figure": RemovedBlock,
"hr": RemovedBlock,
"nav": RemovedBlock,
"template": RemovedBlock,
# -- removed form-related --
"form": RemovedBlock,
"input": RemovedBlock,
"summary": RemovedBlock, # -- child of `details`
}
)
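With the registrations in place, `html_parser` instantiates these custom classes during parsing; a minimal end-to-end sketch (classifications shown are the likely outcome, not guaranteed):

```python
html_text = "<html><body><h1>Summary</h1><p>Nothing happened here.</p></body></html>"
root = etree.fromstring(html_text, html_parser)
body = root.find(".//body")  # a Flow instance, via the class lookup
print([(type(e).__name__, e.text) for e in body.iter_elements()])
# roughly: [('Title', 'Summary'), ('NarrativeText', 'Nothing happened here.')]
```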

View File

@ -0,0 +1,268 @@
# pyright: reportPrivateUsage=false
"""Provides `partition_html()."""
from __future__ import annotations
from typing import IO, Any, Iterator, Optional, cast
import requests
from lxml import etree
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.html.parser import Flow, html_parser
from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@add_chunking_strategy
def partition_html(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
text: Optional[str] = None,
encoding: Optional[str] = None,
url: Optional[str] = None,
headers: dict[str, str] = {},
ssl_verify: bool = True,
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
languages: Optional[list[str]] = ["auto"],
metadata_last_modified: Optional[str] = None,
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
HTML source parameters
----------------------
The HTML to be partitioned can be specified four different ways:
filename
A string defining the target filename path.
file
A file-like object opened in binary mode, e.g. open(filename, "rb").
text
The string representation of the HTML document.
url
The URL of a webpage to parse. Only for URLs that return an HTML document.
headers
The HTTP headers to be used in the HTTP request when `url` is specified.
ssl_verify
If the URL parameter is set, determines whether or not SSL verification is performed
on the HTTP request.
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
to infer last_modified metadata from bytes, otherwise set it to None.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
Other parameters
----------------
include_metadata
Optionally allows for excluding metadata from the output. Primarily intended
for when partition_html is called by other partitioners (like partition_email).
languages
User-defined value for `metadata.languages` if provided. Otherwise language is detected
using a naive Bayesian filter via `langdetect`. Multiple languages indicate the text could
be in any of those languages.
Additional parameters
---------------------
detect_language_per_element
Detect language per element instead of at the document level.
metadata_last_modified
The last modified date for the document.
skip_headers_and_footers
If True, ignores any content that is within <header> or <footer> tags
"""
# -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url:
return []
opts = HtmlPartitionerOptions(
file_path=filename,
file=file,
text=text,
encoding=encoding,
url=url,
headers=headers,
ssl_verify=ssl_verify,
date_from_file_object=date_from_file_object,
metadata_last_modified=metadata_last_modified,
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
)
document = HTMLDocument.load(opts)
elements = list(
apply_lang_metadata(
document.elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
)
return elements
class HtmlPartitionerOptions:
"""Encapsulates partitioning option validation, computation, and application of defaults."""
def __init__(
self,
*,
file_path: str | None,
file: IO[bytes] | None,
text: str | None,
encoding: str | None,
url: str | None,
headers: dict[str, str],
ssl_verify: bool,
date_from_file_object: bool,
metadata_last_modified: str | None,
skip_headers_and_footers: bool,
detection_origin: str | None,
):
self._file_path = file_path
self._file = file
self._text = text
self._encoding = encoding
self._url = url
self._headers = headers
self._ssl_verify = ssl_verify
self._date_from_file_object = date_from_file_object
self._metadata_last_modified = metadata_last_modified
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
@lazyproperty
def detection_origin(self) -> str | None:
"""Trace of initial partitioner to be included in metadata for debugging purposes."""
return self._detection_origin
@lazyproperty
def encoding(self) -> str | None:
"""Caller-provided encoding used to store HTML character stream as bytes.
`None` when no encoding was provided and encoding should be auto-detected.
"""
return self._encoding
@lazyproperty
def html_text(self) -> str:
"""The HTML document as a string, loaded from wherever the caller specified."""
if self._file_path:
return read_txt_file(filename=self._file_path, encoding=self._encoding)[1]
if self._file:
return read_txt_file(file=self._file, encoding=self._encoding)[1]
if self._text:
return str(self._text)
if self._url:
response = requests.get(self._url, headers=self._headers, verify=self._ssl_verify)
if not response.ok:
raise ValueError(
f"Error status code on GET of provided URL: {response.status_code}"
)
content_type = response.headers.get("Content-Type", "")
if not content_type.startswith("text/html"):
raise ValueError(f"Expected content type text/html. Got {content_type}.")
return response.text
raise ValueError("Exactly one of filename, file, text, or url must be specified.")
@lazyproperty
def last_modified(self) -> str | None:
"""The best last-modified date available, None if no sources are available."""
# -- Value explicitly specified by caller takes precedence. This is used for example when
# -- this file was converted from another format.
if self._metadata_last_modified:
return self._metadata_last_modified
if self._file_path:
return (
None
if is_temp_file_path(self._file_path)
else get_last_modified_date(self._file_path)
)
if self._file:
return (
get_last_modified_date_from_file(self._file)
if self._date_from_file_object
else None
)
return None
@lazyproperty
def skip_headers_and_footers(self) -> bool:
"""When True, elements located within a header or footer are pruned."""
return self._skip_headers_and_footers
class _HtmlPartitioner: # pyright: ignore[reportUnusedClass]
"""Partition HTML document into document-elements."""
def __init__(self, opts: HtmlPartitionerOptions):
self._opts = opts
@classmethod
def iter_elements(cls, opts: HtmlPartitionerOptions) -> Iterator[Element]:
"""Partition HTML document provided by `opts` into document-elements."""
yield from cls(opts)._iter_elements()
def _iter_elements(self) -> Iterator[Element]:
"""Generated document-elements (e.g. Title, NarrativeText, etc.) parsed from document.
Elements appear in document order.
"""
for e in self._main.iter_elements():
e.metadata.last_modified = self._opts.last_modified
e.metadata.detection_origin = self._opts.detection_origin
yield e
@lazyproperty
def _main(self) -> Flow:
"""The root HTML element."""
# NOTE(scanny) - get `html_text` first so any encoding error raised is not confused with a
# recoverable parsing error.
html_text = self._opts.html_text
# NOTE(scanny) - `lxml` will not parse a `str` that includes an XML encoding declaration
# and will raise the following error:
# ValueError: Unicode strings with encoding declaration are not supported. ...
# This is not valid HTML (would be in XHTML), but Chrome accepts it so we work around it
# by encoding the str to UTF-8 bytes and parsing those.
try:
root = etree.fromstring(html_text, html_parser)
except ValueError:
root = etree.fromstring(html_text.encode("utf-8"), html_parser)
# -- remove a variety of HTML element types like <script> and <style> that we prefer not
# -- to encounter while parsing.
etree.strip_elements(
root, ["del", "img", "link", "meta", "noscript", "script", "style"], with_tail=False
)
# -- remove <header> and <footer> tags if the caller doesn't want their contents --
if self._opts.skip_headers_and_footers:
etree.strip_elements(root, ["header", "footer"], with_tail=False)
# -- jump to the core content if the document indicates where it is --
if (main := root.find(".//main")) is not None:
return cast(Flow, main)
if (body := root.find(".//body")) is not None:
return cast(Flow, body)
return cast(Flow, root)
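The encoding-declaration workaround in `_main` can be sketched standalone; `lxml` raises ValueError when a `str` containing an XML encoding declaration is parsed, and the byte-encoded retry succeeds:

```python
from lxml import etree

parser = etree.HTMLParser(remove_comments=True)  # stand-in for this module's parser
text = '<?xml version="1.0" encoding="utf-8"?><html><body><p>Hi</p></body></html>'
try:
    root = etree.fromstring(text, parser)
except ValueError:
    # "Unicode strings with encoding declaration are not supported..."
    root = etree.fromstring(text.encode("utf-8"), parser)
assert root.findtext(".//p") == "Hi"
```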