Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-25 06:04:53 +00:00
rfctr(html): prepare for new html parser (#3257)
**Summary** Extract as much of the mechanical refactoring from the HTML parser change-over into this PR as possible. This leaves the next PR focused on installing the new parser and on the ingest-test impact.

**Reviewers:** Commits are well groomed and reviewing commit-by-commit is probably easier.

**Additional Context** This PR introduces the rewritten HTML parser. Its general design is recursive, consistent with the recursive structure of HTML (a tree of elements). It also adds the unit tests for that parser, but it does not _install_ the parser, so the behavior of `partition_html()` is unchanged by this PR. The next PR in this series will install it and handle the ingest and other unit-test changes required to reflect the dozen or so bug-fixes the new parser provides.
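To make the recursive design concrete, here is a minimal standalone sketch of the idea (an illustration only, not the code in this PR): each element yields text for itself, then recurses into its children, mirroring the element tree.

```python
# Hypothetical sketch of recursive descent over an HTML tree using lxml;
# `iter_texts` is an invented name, not part of the new parser's API.
from lxml import etree


def iter_texts(e: etree._Element):
    """Recursively yield the stripped text of each element and tail."""
    if e.text and e.text.strip():
        yield e.text.strip()
    for child in e:
        # -- recursion mirrors the recursive structure of HTML --
        yield from iter_texts(child)
        if child.tail and child.tail.strip():
            yield child.tail.strip()


root = etree.fromstring(
    "<div>intro<p>first</p>tail<p>second</p></div>", etree.HTMLParser()
)
print(list(iter_texts(root)))  # -> ['intro', 'first', 'tail', 'second']
```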
This commit is contained in: parent e1b75539f7, commit 6fe1c9980e
@@ -1,4 +1,4 @@
-## 0.14.8-dev0
+## 0.14.8-dev1

### Enhancements
Makefile | 2

@@ -317,7 +317,7 @@ test-no-extras:
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest \
		test_${PACKAGE_NAME}/partition/test_text.py \
		test_${PACKAGE_NAME}/partition/test_email.py \
-		test_${PACKAGE_NAME}/partition/test_html.py \
+		test_${PACKAGE_NAME}/partition/html/test_partition.py \
		test_${PACKAGE_NAME}/partition/test_xml_partition.py

.PHONY: test-extra-csv
@@ -14,20 +14,19 @@ from lxml import etree
from test_unstructured.unit_utils import (
    FixtureRequest,
    Mock,
    example_doc_path,
    function_mock,
    property_mock,
)
from unstructured.documents import html
from unstructured.documents.elements import (
    Address,
    ListItem,
    NarrativeText,
    Table,
    Text,
    Title,
)
-from unstructured.documents.html import HTMLDocument, HtmlPartitionerOptions
+from unstructured.documents.html import HTMLDocument
+from unstructured.partition.html.partition import HtmlPartitionerOptions

TAGS = (
    (
@@ -59,212 +58,6 @@ EXCLUDED_TAGS = [
]


# -- table-extraction behaviors ------------------------------------------------------------------


def test_it_can_parse_a_bare_bones_table_to_a_Table_element(opts_args: dict[str, Any]):
    """Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements."""
    opts_args["text"] = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td>Lorem</td><td>Ipsum</td></tr>\n"
        " <tr><td>Ut enim non</td><td>ad minim\nveniam quis</td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)

    # -- there is exactly one element and it's a Table instance --
    (element,) = html_document.elements
    assert isinstance(element, Table)
    # -- table text is joined into a single string; no row or cell boundaries are represented --
    assert element.text == "Lorem Ipsum Ut enim non ad minim\nveniam quis"
    # -- An HTML representation is also available that is longer but represents table structure.
    assert element.metadata.text_as_html == (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "</table>"
    )


def test_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elements(
    opts_args: dict[str, Any]
):
    """Cells within a `table/thead` element are included in the text and html.

    The presence of a `<thead>` element in the original also determines whether a `<thead>` element
    appears in `.text_as_html` or whether the first row of cells is simply in the body.
    """
    opts_args["text"] = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)

    (element,) = html_document.elements
    assert isinstance(element, Table)
    assert element.metadata.text_as_html == (
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
    )


def test_it_does_not_emit_a_Table_element_for_a_table_with_no_text(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)

    assert html_document.elements == []


def test_it_grabs_bulleted_text_in_tables_as_ListItem_elements(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        " <body>\n"
        " <table>\n"
        " <tbody>\n"
        " <tr>\n"
        " <td>•</td>\n"
        " <td><p>Happy Groundhog's day!</p></td>\n"
        " </tr>\n"
        " <tr>\n"
        " <td>•</td>\n"
        " <td><p>Looks like six more weeks of winter ...</p></td>\n"
        " </tr>\n"
        " </tbody>\n"
        " </table>\n"
        " </body>\n"
        "</html>\n"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)

    assert html_document.elements == [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]


def test_it_does_not_consider_an_empty_table_a_bulleted_text_table(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " <tr><td> </td><td> </td></tr>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)
    html_elem = html_document._document_tree
    assert html_elem is not None
    table = html_elem.find(".//table")
    assert table is not None

    assert html_document._is_bulleted_table(table) is False


def test_it_provides_parseable_HTML_in_text_as_html(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        "<body>\n"
        " <table>\n"
        " <thead>\n"
        " <tr><th>Lorem</th><th>Ipsum</th></tr>\n"
        " </thead>\n"
        " <tbody>\n"
        " <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>\n"
        " <tr><th>Ut enim non</th><td>ad minim\nveniam quis</td></tr>\n"
        " </tbody>\n"
        " <tfoot>\n"
        " <tr><th>Dolor</th><td>Equis</td></tr>\n"
        " </tfoot>\n"
        " </table>\n"
        "</body>\n"
        "</html>"
    )
    html_document = HTMLDocument.load(HtmlPartitionerOptions(**opts_args))
    (element,) = html_document.elements
    assert isinstance(element, Table)
    text_as_html = element.metadata.text_as_html
    assert text_as_html is not None

    html = etree.fromstring(text_as_html, etree.HTMLParser())

    assert html is not None
    # -- lxml adds the <html><body> container, that's not present in `.text_as_html` --
    assert etree.tostring(html, encoding=str) == (
        "<html><body>"
        "<table>"
        "<tr><td>Lorem</td><td>Ipsum</td></tr>"
        "<tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr>"
        "<tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr>"
        "<tr><td>Dolor</td><td>Equis</td></tr>"
        "</table>"
        "</body></html>"
    )


# -- element-suppression behaviors ---------------------------------------------------------------


def test_it_does_not_extract_text_in_script_tags(opts_args: dict[str, Any]):
    opts_args["file_path"] = example_doc_path("example-with-scripts.html")
    opts = HtmlPartitionerOptions(**opts_args)
    doc = HTMLDocument.load(opts)
    assert all("function (" not in element.text for element in doc.elements)


def test_it_does_not_extract_text_in_style_tags(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        "<body>\n"
        " <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>\n"
        "</body>\n"
        "</html>"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)

    (element,) = html_document.elements
    assert isinstance(element, Text)
    assert element.text == "Lorem ipsum dolor"


# -- HTMLDocument.from_file() --------------------------------------------------------------------
@@ -311,132 +104,6 @@ def test_read_html_doc(tmp_path: pathlib.Path, opts_args: dict[str, Any]):
# -- HTMLDocument.elements -----------------------------------------------------------------------


def test_nested_text_tags(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<body>\n"
        " <p>\n"
        " <a>\n"
        " There is some text here.\n"
        " </a>\n"
        " </p>\n"
        "</body>\n"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    elements = HTMLDocument.load(opts).elements

    assert len(elements) == 1


def test_containers_with_text_are_processed(opts_args: dict[str, Any]):
    opts_args["text"] = (
        '<div dir=3D"ltr">Hi All,<div><br></div>\n'
        " <div>Get excited for our first annual family day!</div>\n"
        ' <div>Best.<br clear=3D"all">\n'
        " <div><br></div>\n"
        " -- <br>\n"
        ' <div dir=3D"ltr">\n'
        ' <div dir=3D"ltr">Dino the Datasaur<div>\n'
        " Unstructured Technologies<br>\n"
        " <div>Data Scientist</div>\n"
        " <div>Doylestown, PA 18901</div>\n"
        " <div><br></div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        " </div>\n"
        "</div>\n"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    html_document = HTMLDocument.load(opts)

    assert html_document.elements == [
        Text(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Text(text="\n -- "),
        Title(text="Dino the Datasaur"),
        Title(text="\n Unstructured Technologies"),
        Title(text="Data Scientist"),
        Address(text="Doylestown, PA 18901"),
    ]


def test_html_grabs_bulleted_text_in_tags(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        " <body>\n"
        " <ol>\n"
        " <li>Happy Groundhog's day!</li>\n"
        " <li>Looks like six more weeks of winter ...</li>\n"
        " </ol>\n"
        " </body>\n"
        "</html>\n"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    assert HTMLDocument.load(opts).elements == [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]


def test_html_grabs_bulleted_text_in_paras(opts_args: dict[str, Any]):
    opts_args["text"] = (
        "<html>\n"
        " <body>\n"
        " <p>\n"
        " <span>• Happy Groundhog's day!</span>\n"
        " </p>\n"
        " <p>\n"
        " <span>• Looks like six more weeks of winter ...</span>\n"
        " </p>\n"
        " </body>\n"
        "</html>\n"
    )
    opts = HtmlPartitionerOptions(**opts_args)
    assert HTMLDocument.load(opts).elements == [
        ListItem(text="Happy Groundhog's day!"),
        ListItem(text="Looks like six more weeks of winter ..."),
    ]


def test_joins_tag_text_correctly(opts_args: dict[str, Any]):
    opts_args["text"] = "<p>Hello again peet mag<i>ic</i>al</p>"
    opts = HtmlPartitionerOptions(**opts_args)
    doc = HTMLDocument.load(opts)
    assert doc.elements[0].text == "Hello again peet magical"


def test_sample_doc_with_emoji(opts_args: dict[str, Any]):
    opts_args["text"] = '<html charset="unicode">\n<p>Hello again 😀</p>\n</html>'
    opts = HtmlPartitionerOptions(**opts_args)
    doc = HTMLDocument.load(opts)
    # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
    # and the byte string representation when running locally on mac
    assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"]


def test_only_plain_text_in_body(opts_args: dict[str, Any]):
    opts_args["text"] = "<body>Hello</body>"
    opts = HtmlPartitionerOptions(**opts_args)
    assert HTMLDocument.load(opts).elements[0].text == "Hello"


def test_plain_text_before_anything_in_body(opts_args: dict[str, Any]):
    opts_args["text"] = "<body>Hello<p>World</p></body>"
    opts = HtmlPartitionerOptions(**opts_args)
    doc = HTMLDocument.load(opts)
    assert doc.elements[0].text == "Hello"
    assert doc.elements[1].text == "World"


def test_line_break_in_container(opts_args: dict[str, Any]):
    opts_args["text"] = "<div>Hello<br/>World</div>"
    opts = HtmlPartitionerOptions(**opts_args)
    doc = HTMLDocument.load(opts)
    assert doc.elements[0].text == "Hello"
    assert doc.elements[1].text == "World"


@pytest.mark.parametrize("tag", html.TEXT_TAGS)
def test_line_break_in_text_tag(tag: str, opts_args: dict[str, Any]):
    opts_args["text"] = f"<{tag}>Hello<br/>World</{tag}>"
@@ -456,16 +123,6 @@ def test_tag_types(tag: str, opts_args: dict[str, Any]):
    assert len(elements) == 1


@pytest.mark.parametrize("tag", EXCLUDED_TAGS)
def test_exclude_tag_types(tag: str, opts_args: dict[str, Any]):
    opts_args["text"] = f"<body>\n <{tag}>\n There is some text here.\n </{tag}>\n</body>\n"
    opts = HtmlPartitionerOptions(**opts_args)

    elements = HTMLDocument.load(opts).elements

    assert len(elements) == 0


# -- _construct_text() ---------------------------------------------------------------------------
@@ -683,89 +340,6 @@ class DescribeHTMLDocument:
            "<table><tr><td>foo</td><td>bar</td></tr></table>"
        )

    def it_accommodates_tds_with_child_elements(self, opts_args: dict[str, Any]):
        """Like this example from an SEC 10k filing."""
        opts = HtmlPartitionerOptions(**opts_args)
        html_str = (
            "<table>\n"
            " <tr>\n"
            " <td></td>\n"
            " <td></td>\n"
            " </tr>\n"
            " <tr>\n"
            " <td>\n"
            " <p>\n"
            " <span>\n"
            ' <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a"'
            ' contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport"'
            ' format="ixt-sec:boolballotbox">\n'
            " <span>☒</span>\n"
            " </ix:nonNumeric>\n"
            " </span>\n"
            " </p>\n"
            " </td>\n"
            " <td>\n"
            " <p>\n"
            " <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE"
            " ACT OF 1934</span>\n"
            " </p>\n"
            " </td>\n"
            " </tr>\n"
            "</table>\n"
        )
        html_document = HTMLDocument(html_str, opts)
        table_elem = html_document._main.find(".//table")
        assert table_elem is not None

        html_table = html_document._parse_Table_from_table_elem(table_elem)

        assert isinstance(html_table, Table)
        assert html_table.text == (
            "☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934"
        )
        assert html_table.metadata.text_as_html == (
            "<table>"
            "<tr><td></td><td></td></tr>"
            "<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
            " EXCHANGE ACT OF 1934</td></tr>"
            "</table>"
        )

    def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nested_table(
        self, opts_args: dict[str, Any]
    ):
        """Recursively ..."""
        opts = HtmlPartitionerOptions(**opts_args)
        # -- note <table> elements nested in <td> elements --
        html_str = (
            "<table>\n"
            " <tr>\n"
            " <td>\n"
            " <table>\n"
            " <tr><td>foo</td><td>bar</td></tr>\n"
            " <tr><td>baz</td><td>bng</td></tr>\n"
            " </table>\n"
            " </td>\n"
            " <td>\n"
            " <table>\n"
            " <tr><td>fizz</td><td>bang</td></tr>\n"
            " </table>\n"
            " </td>\n"
            " </tr>\n"
            "</table>"
        )
        html_document = HTMLDocument(html_str, opts)
        table_elem = html_document._main.find(".//table")
        assert table_elem is not None

        html_table = html_document._parse_Table_from_table_elem(table_elem)

        assert isinstance(html_table, Table)
        assert html_table.text == "foo bar baz bng fizz bang"
        assert html_table.metadata.text_as_html == (
            "<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>"
        )

    # -- ._parse_tag() ---------------------------

    def it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_title(
test_unstructured/partition/html/__init__.py | 0 (new file)

test_unstructured/partition/html/test_parser.py | 785 (new file)
@@ -0,0 +1,785 @@
# pyright: reportPrivateUsage=false
# pyright: reportUnknownArgumentType=false

"""Test suite for `unstructured.partition.html.parser` module."""

from __future__ import annotations

from collections import deque

import pytest
from lxml import etree

from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title
from unstructured.partition.html.parser import (
    Anchor,
    Annotation,
    DefaultElement,
    Flow,
    Phrasing,
    RemovedPhrasing,
    TextSegment,
    _consolidate_annotations,
    _normalize_text,
    html_parser,
)

# -- MODULE-LEVEL FUNCTIONS ----------------------------------------------------------------------

# -- _consolidate_annotations() ------------------


def it_gathers_annotations_from_text_segments():
    text_segments = [
        TextSegment(
            " Ford Prefect ",
            {
                "link_texts": "Ford Prefect",
                "link_url": "https://wikipedia/Ford_Prefect",
                "emphasized_text_contents": "Ford Prefect",
                "emphasized_text_tags": "b",
            },
        ),
        TextSegment(
            " alien encounter",
            {
                "emphasized_text_contents": "alien encounter",
                "emphasized_text_tags": "bi",
            },
        ),
    ]

    annotations = _consolidate_annotations(text_segments)

    assert annotations == {
        # -- each distinct key gets a list of values --
        "emphasized_text_contents": ["Ford Prefect", "alien encounter"],
        "emphasized_text_tags": ["b", "bi"],
        # -- even when there is only one value --
        "link_texts": ["Ford Prefect"],
        "link_url": ["https://wikipedia/Ford_Prefect"],
    }
    # -- and the annotations mapping is immutable --
    with pytest.raises(TypeError, match="object does not support item assignment"):
        annotations["new_key"] = "foobar"  # pyright: ignore[reportIndexIssue]
    # -- (but not its list values unfortunately) --
    annotations["emphasized_text_tags"].append("xyz")
    assert annotations["emphasized_text_tags"] == ["b", "bi", "xyz"]


# -- _normalize_text() ---------------------------


@pytest.mark.parametrize(
    ("text", "expected_value"),
    [
        # -- already normalized text is left unchanged --
        ("iterators allow", "iterators allow"),
        # -- newlines are treated as whitespace --
        ("algorithm\nto be", "algorithm to be"),
        (" separated\n from ", "separated from"),
        ("\n container\n details\n ", "container details"),
        (
            "\n iterators allow \n algorithm to be \nexpressed without container \nnoise",
            "iterators allow algorithm to be expressed without container noise",
        ),
    ],
)
def test_normalize_text_produces_normalized_text(text: str, expected_value: str):
    assert _normalize_text(text) == expected_value
# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------


class DescribeFlow:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Flow`.

    The `Flow` class provides most behaviors for flow (block-level) elements.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_knows_it_is_NOT_a_phrasing_element(self):
        p = etree.fromstring("<p>Hello</p>", html_parser).xpath(".//p")[0]

        assert isinstance(p, Flow)
        assert p.is_phrasing is False

    # -- .iter_elements() -------------------------------------------------

    def it_generates_the_document_elements_from_the_Flow_element(self):
        """Phrasing siblings of child block elements are processed with text or tail.

        In the general case, a Flow element can contain text, phrasing content, and child flow
        elements.

        Each of these five lines in this example is a "paragraph" and gives rise to a distinct
        document-element.
        """
        html_text = """
          <div>
            Text of div <b>with <i>hierarchical</i>\nphrasing</b> content before first block item
            <p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
            tail of block item <b>with <i>hierarchical</i> phrasing </b> content
            <p>second block item</p>
            tail of block item <b>with <i> hierarchical </i></b> phrasing content
          </div>
        """
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div.iter_elements()

        e = next(elements)
        assert e == Title("Text of div with hierarchical phrasing content before first block item")
        assert e.metadata.to_dict() == {
            "category_depth": 0,
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        e = next(elements)
        assert e == NarrativeText("Click here to see the blurb for this block item.")
        assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
        e = next(elements)
        assert e == Title("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "category_depth": 0,
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        e = next(elements)
        assert e == Title("second block item")
        assert e.metadata.to_dict() == {"category_depth": 0}
        e = next(elements)
        assert e == Title("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "category_depth": 0,
            "emphasized_text_contents": ["with", "hierarchical"],
            "emphasized_text_tags": ["b", "bi"],
        }
        with pytest.raises(StopIteration):
            e = next(elements)

    # -- ._category_depth() -----------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "tag", "ElementCls", "expected_value"),
        [
            ("<p>Ford... you're turning into a penguin. Stop it.<p>", "p", Text, None),
            ("<p>* thanks for all the fish.</p>", "p", ListItem, 0),
            ("<li>thanks for all the fish.</li>", "li", ListItem, 0),
            ("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
            ("<dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
            ("<p>Examples</p>", "p", Title, 0),
            ("<h1>Examples</h1>", "h1", Title, 0),
            ("<h2>Examples</h2>", "h2", Title, 1),
            ("<h3>Examples</h3>", "h3", Title, 2),
            ("<h4>Examples</h4>", "h4", Title, 3),
            ("<h5>Examples</h5>", "h5", Title, 4),
            ("<h6>Examples</h6>", "h6", Title, 5),
        ],
    )
    def it_computes_the_category_depth_to_help(
        self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None
    ):
        e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0]
        assert e._category_depth(ElementCls) == expected_value

    # -- ._element_from_text_or_tail() ------------------------------------

    def it_assembles_text_and_tail_document_elements_to_help(self):
        """Text and tails and their phrasing content are both processed the same way."""
        html_text = "<div>The \n Roman <b>poet <i> Virgil</i> gave</b> his <q>pet</q> fly</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        e = next(elements)
        # -- element text is normalized --
        assert e == Text("The Roman poet Virgil gave his pet fly")
        # -- individual annotations are consolidated --
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["poet", "Virgil", "gave"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }

    def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self):
        html_text = "<div> <b> \n <i> \n </i> </b> <q> \n </q> \n </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        with pytest.raises(StopIteration):
            next(elements)

    def it_uses_the_specified_element_class_to_form_the_document_element(self):
        html_text = "<div>\n The line-storm clouds fly tattered and swift\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Address)

        e = next(elements)
        assert e == Address("The line-storm clouds fly tattered and swift")
        assert e.metadata.to_dict() == {}
        with pytest.raises(StopIteration):
            next(elements)

    def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self):
        html_text = "<div>\n The line-storm clouds fly tattered and swift,\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,")

    def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self):
        html_text = "<div> * </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        with pytest.raises(StopIteration):
            next(elements)

    # -- ._iter_text_segments() -------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            (  # -- text with no phrasing --
                "<p>Ford... you're turning into a penguin.<p>",
                [("Ford... you're turning into a penguin.", {})],
            ),
            (  # -- text with phrasing --
                "<p>Ford... <b>you're turning</b> into\na <i>penguin</i>.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're turning",
                        {"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"},
                    ),
                    (" into\na ", {}),
                    (
                        "penguin",
                        {"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"},
                    ),
                    (".", {}),
                ],
            ),
            (  # -- text with nested phrasing --
                "<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're ",
                        {"emphasized_text_contents": "you're", "emphasized_text_tags": "b"},
                    ),
                    (
                        "turning",
                        {"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"},
                    ),
                    (" into a penguin.", {}),
                ],
            ),
        ],
    )
    def it_recursively_generates_text_segments_from_text_and_phrasing_to_help(
        self, html_text: str, expected_value: list[Annotation]
    ):
        p = etree.fromstring(html_text, html_parser).xpath(".//p")[0]
        text_segments = list(p._iter_text_segments(p.text, deque(p)))

        assert text_segments == expected_value
class DescribePre:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    """

    def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
        """A `<pre>` element can contain only phrasing content."""
        html_text = (
            "<pre>\n"
            " The Answer to the Great Question... Of Life, the Universe and Everything...\n"
            " Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
            "</pre>\n"
        )
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        elements = pre.iter_elements()

        e = next(elements)
        assert e == Text(
            " The Answer to the Great Question... Of Life, the Universe and Everything...\n"
            " Is... Forty-two, said Deep Thought, with infinite majesty and calm."
        )
        with pytest.raises(StopIteration):
            next(elements)

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a newline in the 0th position of pre.text is dropped --
            ("<pre>\n foo </pre>", " foo "),
            # -- but not when preceded by any other whitespace --
            ("<pre> \n foo </pre>", " \n foo "),
            # -- and only one is dropped --
            ("<pre>\n\n foo </pre>", "\n foo "),
            # -- a newline in the -1th position is dropped --
            ("<pre> foo \n</pre>", " foo "),
            # -- but not when followed by any other whitespace --
            ("<pre> foo \n </pre>", " foo \n "),
            # -- and only one is dropped --
            ("<pre> foo \n\n</pre>", " foo \n"),
            # -- newlines in both positions are both dropped --
            ("<pre>\n foo \n</pre>", " foo "),
            # -- or not when not at the absolute edge --
            ("<pre> \n foo \n </pre>", " \n foo \n "),
        ],
    )
    def but_it_strips_a_single_leading_or_trailing_newline(
        self, html_text: str, expected_value: str
    ):
        """Content starts on next line when opening `<pre>` tag is immediately followed by `\n`"""
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]
        e = next(pre.iter_elements())

        assert e.text == expected_value

    def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
        html_text = '<pre>You\'re <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>'
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        e = next(pre.iter_elements())

        assert e.text == "You're turning into a penguin."
        assert e.metadata.emphasized_text_contents == ["turning"]
        assert e.metadata.emphasized_text_tags == ["b"]
        assert e.metadata.link_texts == ["penguin"]
        assert e.metadata.link_urls == ["http://eie.io"]


class DescribeRemovedBlock:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block-level items we want to skip, like `<hr/>` and `<figure>`.
    """

    def it_is_skipped_during_parsing(self):
        html_text = """
          <div>
            <hr/>
            <figure>
              <img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
        """
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        assert list(div.iter_elements()) == [NarrativeText("Content we want.")]
# -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------


class DescribePhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.

    The `Phrasing` class provides most behaviors for phrasing (inline) elements.
    """

    def it_knows_it_is_a_phrasing_element(self):
        b = etree.fromstring("<b>Hello</b>", html_parser).xpath(".//b")[0]

        assert isinstance(b, Phrasing)
        assert b.is_phrasing is True

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- an empty element produces no text segments --
            ("<code></code>", []),
            # -- element text produces one segment --
            ("<data> foo </data>", [(" foo ", {})]),
            # -- element tail produces one segment --
            ("<dfn/> bar ", [(" bar ", {})]),
            # -- element descendants each produce one segment --
            ("<kbd><mark>foo <meter>bar</meter></mark></kbd>", [("foo ", {}), ("bar", {})]),
            # -- and any combination produces a segment for each text, child, and tail --
            (
                "<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd>",
                [
                    (" ", {}),
                    ("foo ", {}),
                    ("bar", {}),
                    (" baz", {}),
                    (" ", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_for_its_text_and_children_and_tail(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0]
        assert list(e.iter_text_segments()) == expected_value

    def it_forms_its_annotations_from_emphasis(self):
        cite = etree.fromstring("<cite> rhombus </cite>", html_parser).xpath(".//cite")[0]
        assert cite._annotation(cite.text, "bi") == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "bi",
        }

    def but_not_when_text_is_empty_or_whitespace(self):
        cite = etree.fromstring("<cite> </cite>", html_parser).xpath(".//cite")[0]
        assert cite._annotation(cite.text, "bi") == {}

    def and_not_when_there_is_no_emphasis(self):
        cite = etree.fromstring("<cite>rhombus</cite>", html_parser).xpath(".//cite")[0]
        assert cite._annotation(cite.text, "") == {}

    def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(self):
        abbr = etree.fromstring("<abbr>LLM</abbr>", html_parser).xpath(".//abbr")[0]
        assert abbr._inside_emphasis("xyz") == "xyz"


class DescribeBold:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Bold`.

    The `Bold` class is used for `<b>` and `<strong>` tags and adds emphasis metadata.
    """

    def it_annotates_its_text_segment_with_bold_emphasis(self):
        b = etree.fromstring("<b>rhombus</b>", html_parser).xpath(".//b")[0]

        text_segments = b.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }

    def and_its_children_are_also_annotated_with_bold_emphasis(self):
        b = etree.fromstring("<b>rhombus <i>pentagon</i></b>", html_parser).xpath(".//b")[0]

        text_segments = b.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus "
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }
        text, annotation = next(text_segments)
        assert text == "pentagon"
        assert annotation == {
            "emphasized_text_contents": "pentagon",
            "emphasized_text_tags": "bi",
        }

    def but_not_its_tail(self):
        b = etree.fromstring("<b>rhombus</b> pentagon", html_parser).xpath(".//b")[0]

        text_segments = b.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }
        text, annotation = next(text_segments)
        assert text == " pentagon"
        assert annotation == {}


class DescribeItalic:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Italic`.

    The `Italic` class is used for `<i>` and `<em>` tags and adds emphasis metadata.
    """

    def it_annotates_its_text_segment_with_italic_emphasis(self):
        i = etree.fromstring("<i>rhombus</i>", html_parser).xpath(".//i")[0]

        text_segments = i.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }

    def and_its_children_are_also_annotated_with_italic_emphasis(self):
        em = etree.fromstring("<em>rhombus <b>pentagon</b></em>", html_parser).xpath(".//em")[0]

        text_segments = em.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus "
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }
        text, annotation = next(text_segments)
        assert text == "pentagon"
        assert annotation == {
            "emphasized_text_contents": "pentagon",
            "emphasized_text_tags": "bi",
        }

    def but_not_its_tail(self):
        i = etree.fromstring("<i>rhombus</i> pentagon", html_parser).xpath(".//i")[0]

        text_segments = i.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }
        text, annotation = next(text_segments)
        assert text == " pentagon"
        assert annotation == {}


class DescribeLineBreak:
    """Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`.

    Used for `<br/>` elements, its only special behavior is to add whitespace so that phrasing
    butted up tight on both sides of the `<br/>` element is not joined; `abc<br/>def` should
    become "abc def", not "abcdef".
    """

    def it_adds_a_newline_in_its_place(self):
        cite = etree.fromstring(
            "<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>", html_parser
        ).xpath(".//cite")[0]

        text_segments = cite.iter_text_segments()

        texts = [ts.text for ts in text_segments]
        assert texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"]
        assert _normalize_text("".join(texts)) == "spaceships of the Vogon Constructor Fleet"


class DescribeRemovedPhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`.

    Used for phrasing elements like `<label>` that we want to skip, including any content they
    enclose. The tail of such an element is not skipped though.
    """

    def it_behaves_like_an_empty_element(self):
        label = etree.fromstring(
            "<div>\n"
            " <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>\n"
            " Like vastly, hugely big.\n"
            "</div>",
            html_parser,
        ).xpath(".//label")[0]

        (text_segment,) = list(label.iter_text_segments())

        assert isinstance(label, RemovedPhrasing)
        assert label.is_phrasing is True
        assert text_segment.text == "\n Like vastly, hugely big.\n"
# -- DUAL-ROLE ELEMENTS --------------------------------------------------------------------------


class DescribeAnchor:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`.

    The `Anchor` class is used for `<a>` tags and provides link metadata.
    """

    # -- .is_phrasing -----------------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- an empty <a> identifies as phrasing --
            ('<a href="http://eie.io"></a>', True),
            # -- an <a> with text but no children identifies as phrasing --
            ('<a href="http://eie.io">“O Deep Thought computer," he said,</a>', True),
            # -- an <a> with no text and only phrasing children identifies as phrasing --
            ('<a href="http://eie.io"><i>“O Deep Thought computer,"</i></a>', True),
            # -- an <a> with both text and phrasing children identifies as phrasing --
            ('<a href="http://eie.io">“O <b>Deep Thought</b> computer,"</a>', True),
            # -- but an <a> with a block-item child does not --
            ('<a href="http://eie.io"><p>“O Deep Thought computer,"</p></a>', False),
            # -- and an <a> with both text and a block-item child does not --
            ('<a href="http://eie.io">“O Deep Thought computer,"<div>he said,</div></a>', False),
            # -- and an <a> with text and both block and phrasing children does not --
            ('<a href="http://eie.io">“O <b>Deep</b> Thought <div>computer," he</div></a>', False),
        ],
    )
    def it_determines_whether_it_is_phrasing_dynamically(
        self, html_text: str, expected_value: bool
    ):
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert isinstance(a, Anchor)
        assert a.is_phrasing is expected_value

    # -- .iter_elements() -------------------------------------------------

    def it_can_also_act_as_a_block_item(self):
        html_text = """
          <div>
            <a href="http://eie.io">
              O Deep Thought computer, he said,
              <div>The task we have designed you to perform is this.</div>
              <p>We want you to tell us.... he paused,</p>
            </a>
          </div>
        """
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        elements = a.iter_elements()

        assert [e.text for e in elements] == [
            "O Deep Thought computer, he said,",
            "The task we have designed you to perform is this.",
            "We want you to tell us.... he paused,",
        ]

    # -- .iter_text_segments() --------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- produces no text-segment or annotation for anchor.text when there is none --
            ('<a href="http://abc.com"></a>', []),
            # -- but it produces a text-segment for the tail if there is one --
            ('<a href="http://abc.com"></a> long tail ', [TextSegment(" long tail ", {})]),
            # -- produces text-segment but no annotation for anchor.text when it is whitespace --
            ('<a href="http://abc.com"> </a>', [TextSegment(" ", {})]),
            # -- produces text-segment and annotation for anchor text
            # -- Note link-texts annotation is whitespace-normalized but text-segment text is not.
            (
                '<a href="http://abc.com"> click here </a>',
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    )
                ],
            ),
            # -- produces text-segment for both text and tail when present --
            (
                '<a href="http://abc.com"> click here </a> long tail',
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    ),
                    TextSegment(" long tail", {}),
                ],
            ),
            # -- nested phrasing inside <a> element is handled as expected --
            (
                '<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>',
                [
                    TextSegment(
                        "one with the Force",
                        {
                            "emphasized_text_contents": ["the"],
                            "emphasized_text_tags": ["i"],
                            "link_texts": ["one with the Force"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(".", {}),
                ],
            ),
        ],
    )
    def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        assert list(a.iter_text_segments()) == expected_value
# -- DEFAULT ELEMENT -----------------------------------------------------------------------------


class DescribeDefaultElement:
    """Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.

    Used for any element we haven't assigned a custom element-class to. This prominently includes
    any non-HTML elements that can be embedded in the HTML.

    It identifies as a block item but it can behave as either a block-item or phrasing. Its behavior
    is a combination of RemovedBlock and RemovedPhrasing. Namely, it iterates zero elements and only
    iterates a text-segment for its tail.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_identifies_as_a_phrasing_element(self):
        foobar = etree.fromstring("<foobar>Vogon</foobar>", html_parser).xpath(".//foobar")[0]

        assert isinstance(foobar, DefaultElement)
        assert foobar.is_phrasing is True

    # -- .iter_elements() -------------------------------------------------

    def it_generates_zero_elements_as_a_block_item(self):
        """Should never be called but belts and suspenders."""
        foobar = etree.fromstring(
            "<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>",
            html_parser,
        ).xpath(".//foobar")[0]

        elements = foobar.iter_elements()

        with pytest.raises(StopIteration):
            next(elements)

    # -- .iter_text_segments() --------------------------------------------

    def it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing(self):
        foobar = etree.fromstring(
            "<div>\n"
            " O Deep Thought computer, he said,\n"
            " <foobar>Vogon Constructor Fleet</foobar>\n"
            " The task we have designed you to perform is this.\n"
            " <p>We want you to tell us.... he paused,</p>\n"
            "</div>",
            html_parser,
        ).xpath(".//foobar")[0]

        texts = [ts.text for ts in foobar.iter_text_segments()]

        assert texts == ["\n The task we have designed you to perform is this.\n "]

    def and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element(self):
        div = etree.fromstring(
            "<div>\n"
            " O Deep Thought computer, he said,\n"
            " <foobar>Vogon Constructor Fleet</foobar>\n"
            " The task we have designed you to perform is this.\n"
            " <p>We want you to tell us.... he paused,</p>\n"
            "</div>",
            html_parser,
        ).xpath(".//div")[0]

        texts = [e.text for e in div.iter_elements()]

        assert texts == [
            "O Deep Thought computer, he said, The task we have designed you to perform is this.",
            "We want you to tell us.... he paused,",
        ]
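All of the tests above drive the parser the same way; here is that pattern condensed into one hedged sketch. The sample `html_text` is invented for illustration, while `html_parser`, `etree.fromstring()`, `.xpath()`, and `.iter_elements()` are used exactly as in the tests above.

```python
from lxml import etree

from unstructured.partition.html.parser import html_parser

# -- invented sample document; any HTML fragment works the same way --
html_text = "<div>Hello <b>brave</b> new world<p>of HTML parsing</p></div>"

# The custom element-class lookup behind `html_parser` makes each parsed node
# a Flow/Phrasing subclass, so document-elements come from recursive iteration.
div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]
for element in div.iter_elements():
    print(type(element).__name__, repr(element.text))
```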
(File diff suppressed because it is too large.)
typings/lxml/_types.pyi | 29 (new file)

@@ -0,0 +1,29 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

from typing import Any, Callable, Collection, Protocol, TypeVar

from typing_extensions import TypeAlias

from .etree import QName, _Element, _ElementTree

_ET = TypeVar("_ET", bound=_Element, default=_Element)
_ET_co = TypeVar("_ET_co", bound=_Element, default=_Element, covariant=True)
_KT_co = TypeVar("_KT_co", covariant=True)
_VT_co = TypeVar("_VT_co", covariant=True)

_AttrName: TypeAlias = str

_ElemPathArg: TypeAlias = str | QName

_ElementOrTree: TypeAlias = _ET | _ElementTree[_ET]

_TagName: TypeAlias = str

_TagSelector: TypeAlias = _TagName | Callable[..., _Element]

_XPathObject = Any

class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
    def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...
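For orientation, `SupportsLaxedItems` is a structural protocol: any object with a compatible `.items()` satisfies it, with no inheritance required, which is what lets lxml's laxer registry types be passed where a mapping-like argument is expected. A minimal self-contained illustration (the `dump` function below is invented, not from the stubs):

```python
from typing import Collection, Protocol, TypeVar

_KT_co = TypeVar("_KT_co", covariant=True)
_VT_co = TypeVar("_VT_co", covariant=True)


class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
    """Structural stand-in for mapping-ish objects that only need .items()."""

    def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...


def dump(pairs: SupportsLaxedItems[str, int]) -> None:
    for key, value in pairs.items():
        print(key, value)


dump({"a": 1, "b": 2})  # a plain dict satisfies the protocol structurally
```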
typings/lxml/etree/__init__.pyi | 14 (new file)

@@ -0,0 +1,14 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

from ._classlookup import ElementBase as ElementBase
from ._classlookup import ElementDefaultClassLookup as ElementDefaultClassLookup
from ._element import _Element as _Element
from ._element import _ElementTree as _ElementTree
from ._module_func import fromstring as fromstring
from ._module_func import tostring as tostring
from ._module_misc import QName as QName
from ._nsclasses import ElementNamespaceClassLookup as ElementNamespaceClassLookup
from ._parser import HTMLParser as HTMLParser
from ._parser import XMLParser as XMLParser
typings/lxml/etree/_classlookup.pyi | 75 (new file)

@@ -0,0 +1,75 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

from ._element import _Element

class ElementBase(_Element):
    """The public Element class

    Original Docstring
    ------------------
    All custom Element classes must inherit from this one.
    To create an Element, use the `Element()` factory.

    BIG FAT WARNING: Subclasses *must not* override `__init__` or
    `__new__` as it is absolutely undefined when these objects will be
    created or destroyed. All persistent state of Elements must be
    stored in the underlying XML. If you really need to initialize
    the object after creation, you can implement an ``_init(self)``
    method that will be called directly after object creation.

    Subclasses of this class can be instantiated to create a new
    Element. By default, the tag name will be the class name and the
    namespace will be empty. You can modify this with the following
    class attributes:

    * TAG - the tag name, possibly containing a namespace in Clark
      notation

    * NAMESPACE - the default namespace URI, unless provided as part
      of the TAG attribute.

    * HTML - flag if the class is an HTML tag, as opposed to an XML
      tag. This only applies to un-namespaced tags and defaults to
      false (i.e. XML).

    * PARSER - the parser that provides the configuration for the
      newly created document. Providing an HTML parser here will
      default to creating an HTML element.

    In user code, the latter three are commonly inherited in class
    hierarchies that implement a common namespace.
    """

    def __init__(
        self,
        *children: object,
        attrib: dict[str, str] | None = None,
        **_extra: str,
    ) -> None: ...
    def _init(self) -> None: ...

class ElementClassLookup:
    """Superclass of Element class lookups"""

class ElementDefaultClassLookup(ElementClassLookup):
    """Element class lookup scheme that always returns the default Element
    class.

    The keyword arguments ``element``, ``comment``, ``pi`` and ``entity``
    accept the respective Element classes."""

    def __init__(
        self,
        element: type[ElementBase] | None = None,
    ) -> None: ...

class FallbackElementClassLookup(ElementClassLookup):
    """Superclass of Element class lookups with additional fallback"""

    @property
    def fallback(self) -> ElementClassLookup | None: ...
    def __init__(self, fallback: ElementClassLookup | None = None) -> None: ...
    def set_fallback(self, lookup: ElementClassLookup) -> None:
        """Sets the fallback scheme for this lookup method"""
typings/lxml/etree/_element.pyi | 50 (new file)

@@ -0,0 +1,50 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

from typing import Collection, Generic, Iterator, TypeVar, overload

from typing_extensions import Self

from .. import _types as _t

_T = TypeVar("_T")

class _Element:
    @overload
    def __getitem__(self, __x: int) -> Self: ...
    @overload
    def __getitem__(self, __x: slice) -> list[Self]: ...
    def __contains__(self, __o: object) -> bool: ...
    def __len__(self) -> int: ...
    def __iter__(self) -> Iterator[Self]: ...
    def find(self, path: _t._ElemPathArg) -> Self | None: ...
    @overload
    def get(self, key: _t._AttrName) -> str | None: ...
    @overload
    def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
    def iterancestors(
        self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
    ) -> Iterator[Self]: ...
    @overload
    def itertext(self, *tags: _t._TagSelector, with_tail: bool = True) -> Iterator[str]: ...
    @overload
    def itertext(
        self,
        *,
        tag: _t._TagSelector | Collection[_t._TagSelector] | None = None,
        with_tail: bool = True,
    ) -> Iterator[str]: ...
    @property
    def tag(self) -> str: ...
    @property
    def tail(self) -> str | None: ...
    @property
    def text(self) -> str | None: ...
    def xpath(
        self,
        _path: str,
        /,
    ) -> _t._XPathObject: ...

class _ElementTree(Generic[_t._ET_co]): ...
typings/lxml/etree/_module_func.pyi | 19 (new file)

@@ -0,0 +1,19 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

from .._types import _ElementOrTree
from ..etree import HTMLParser, XMLParser
from ._element import _Element

def fromstring(text: str | bytes, parser: XMLParser | HTMLParser) -> _Element: ...

# Under XML Canonicalization (C14N) mode, most arguments are ignored,
# some arguments would even raise exception outright if specified.
def tostring(
    element_or_tree: _ElementOrTree,
    *,
    encoding: str | type[str] | None = None,
    pretty_print: bool = False,
    with_tail: bool = True,
) -> str: ...
typings/lxml/etree/_module_misc.pyi | 5 (new file)

@@ -0,0 +1,5 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

class QName: ...
31
typings/lxml/etree/_nsclasses.pyi
Normal file
@ -0,0 +1,31 @@
# pyright: reportPrivateUsage=false

from __future__ import annotations

from typing import Iterable, Iterator, MutableMapping, TypeVar

from .._types import SupportsLaxedItems
from ._classlookup import ElementBase, ElementClassLookup, FallbackElementClassLookup

_KT = TypeVar("_KT")
_VT = TypeVar("_VT")


class _NamespaceRegistry(MutableMapping[_KT, _VT]):
    def __delitem__(self, __key: _KT) -> None: ...
    def __getitem__(self, __key: _KT) -> _VT: ...
    def __setitem__(self, __key: _KT, __value: _VT) -> None: ...
    def __iter__(self) -> Iterator[_KT]: ...
    def __len__(self) -> int: ...
    def update(  # type: ignore[override]
        self,
        class_dict_iterable: SupportsLaxedItems[_KT, _VT] | Iterable[tuple[_KT, _VT]],
    ) -> None: ...
    def items(self) -> list[tuple[_KT, _VT]]: ...  # type: ignore[override]
    def iteritems(self) -> Iterator[tuple[_KT, _VT]]: ...
    def clear(self) -> None: ...


class _ClassNamespaceRegistry(_NamespaceRegistry[str | None, type[ElementBase]]): ...


class ElementNamespaceClassLookup(FallbackElementClassLookup):
    def __init__(self, fallback: ElementClassLookup | None = None) -> None: ...
    def get_namespace(self, ns_uri: str | None) -> _ClassNamespaceRegistry: ...
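These stubs cover the class-registration machinery the new parser relies on. A minimal sketch of how it behaves at runtime, using only the standard `lxml` API (the `Para` class here is hypothetical, purely for illustration):

```python
from lxml import etree


class Para(etree.ElementBase):
    @property
    def is_phrasing(self) -> bool:
        return False


parser = etree.HTMLParser()
lookup = etree.ElementNamespaceClassLookup()
parser.set_element_class_lookup(lookup)
# -- None selects the default (no-namespace) registry, which HTML elements use --
lookup.get_namespace(None)["p"] = Para

p = etree.fromstring("<p>Hi</p>", parser).find(".//p")
assert isinstance(p, Para) and not p.is_phrasing
```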
41
typings/lxml/etree/_parser.pyi
Normal file
@ -0,0 +1,41 @@
from __future__ import annotations

from ._classlookup import ElementClassLookup


class HTMLParser:
    def __init__(
        self,
        *,
        encoding: str | None = None,
        remove_blank_text: bool = False,
        remove_comments: bool = False,
        remove_pis: bool = False,
        strip_cdata: bool = True,
        no_network: bool = True,
        recover: bool = True,
        compact: bool = True,
        default_doctype: bool = True,
        collect_ids: bool = True,
        huge_tree: bool = False,
    ) -> None: ...
    def set_element_class_lookup(self, lookup: ElementClassLookup | None = None) -> None: ...


class XMLParser:
    def __init__(
        self,
        *,
        encoding: str | None = None,
        attribute_defaults: bool = False,
        dtd_validation: bool = False,
        load_dtd: bool = False,
        no_network: bool = True,
        ns_clean: bool = False,
        recover: bool = False,
        huge_tree: bool = False,
        remove_blank_text: bool = False,
        remove_comments: bool = False,
        remove_pis: bool = False,
        strip_cdata: bool = True,
        collect_ids: bool = True,
        compact: bool = True,
    ) -> None: ...
@ -1 +1 @@
__version__ = "0.14.8-dev0"  # pragma: no cover
__version__ = "0.14.8-dev1"  # pragma: no cover
@ -2,9 +2,8 @@

from __future__ import annotations

from typing import IO, Final, Iterator, cast
from typing import TYPE_CHECKING, Final, Iterator, cast

import requests
from lxml import etree

from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
@ -19,8 +18,6 @@ from unstructured.documents.elements import (
    Text,
    Title,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_email_address,
@ -29,7 +26,10 @@ from unstructured.partition.text_type import (
    is_us_city_state_zip,
)
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
from unstructured.utils import htmlify_matrix_of_cell_texts, is_temp_file_path, lazyproperty
from unstructured.utils import htmlify_matrix_of_cell_texts, lazyproperty

if TYPE_CHECKING:
    from unstructured.partition.html.partition import HtmlPartitionerOptions

TEXT_TAGS: Final[list[str]] = ["p", "a", "td", "span", "b", "font"]
LIST_ITEM_TAGS: Final[list[str]] = ["li", "dd"]
@ -57,7 +57,7 @@ class HTMLDocument:
    @classmethod
    def load(cls, opts: HtmlPartitionerOptions) -> HTMLDocument:
        """Construct instance from whatever source is specified in `opts`."""
        return cls(opts.html_str, opts)
        return cls(opts.html_text, opts)

    @lazyproperty
    def elements(self) -> list[Element]:
@ -452,108 +452,6 @@ class HTMLDocument:
            yield element


class HtmlPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""

    # TODO: this eventually moves to `unstructured.partition.html` but not until `HTMLDocument`
    # becomes `_HtmlPartitioner` and moves there with it.

    def __init__(
        self,
        *,
        file_path: str | None,
        file: IO[bytes] | None,
        text: str | None,
        encoding: str | None,
        url: str | None,
        headers: dict[str, str],
        ssl_verify: bool,
        date_from_file_object: bool,
        metadata_last_modified: str | None,
        skip_headers_and_footers: bool,
        detection_origin: str | None,
    ):
        self._file_path = file_path
        self._file = file
        self._text = text
        self._encoding = encoding
        self._url = url
        self._headers = headers
        self._ssl_verify = ssl_verify
        self._date_from_file_object = date_from_file_object
        self._metadata_last_modified = metadata_last_modified
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin

    @lazyproperty
    def detection_origin(self) -> str | None:
        """Trace of initial partitioner to be included in metadata for debugging purposes."""
        return self._detection_origin

    @lazyproperty
    def encoding(self) -> str | None:
        """Caller-provided encoding used to store HTML character stream as bytes.

        `None` when no encoding was provided and encoding should be auto-detected.
        """
        return self._encoding

    @lazyproperty
    def html_str(self) -> str:
        """The HTML document as a string, loaded from wherever the caller specified."""
        if self._file_path:
            return read_txt_file(filename=self._file_path, encoding=self._encoding)[1]

        if self._file:
            return read_txt_file(file=self._file, encoding=self._encoding)[1]

        if self._text:
            return str(self._text)

        if self._url:
            response = requests.get(self._url, headers=self._headers, verify=self._ssl_verify)
            if not response.ok:
                raise ValueError(
                    f"Error status code on GET of provided URL: {response.status_code}"
                )
            content_type = response.headers.get("Content-Type", "")
            if not content_type.startswith("text/html"):
                raise ValueError(f"Expected content type text/html. Got {content_type}.")

            return response.text

        raise ValueError("Exactly one of filename, file, text, or url must be specified.")

    @lazyproperty
    def last_modified(self) -> str | None:
        """The best last-modified date available, None if no sources are available."""
        # -- Value explicitly specified by caller takes precedence. This is used for example when
        # -- this file was converted from another format.
        if self._metadata_last_modified:
            return self._metadata_last_modified

        if self._file_path:
            return (
                None
                if is_temp_file_path(self._file_path)
                else get_last_modified_date(self._file_path)
            )

        if self._file:
            return (
                get_last_modified_date_from_file(self._file)
                if self._date_from_file_object
                else None
            )

        return None

    @lazyproperty
    def skip_headers_and_footers(self) -> bool:
        """When True, elements located within a header or footer are pruned."""
        return self._skip_headers_and_footers


# -- tag processors ------------------------------------------------------------------------------
@ -1,105 +0,0 @@
from __future__ import annotations

from typing import IO, Any, Optional

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument, HtmlPartitionerOptions
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.lang import apply_lang_metadata


@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@add_chunking_strategy
def partition_html(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    encoding: Optional[str] = None,
    url: Optional[str] = None,
    headers: dict[str, str] = {},
    ssl_verify: bool = True,
    date_from_file_object: bool = False,
    detect_language_per_element: bool = False,
    languages: Optional[list[str]] = ["auto"],
    metadata_last_modified: Optional[str] = None,
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    **kwargs: Any,
) -> list[Element]:
    """Partitions an HTML document into its constituent elements.

    HTML source parameters
    ----------------------
    The HTML to be partitioned can be specified four different ways:

    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    headers
        The HTTP headers to be used in the HTTP request when `url` is specified.
    ssl_verify
        If the URL parameter is set, determines whether or not SSL verification is performed
        on the HTTP request.
    date_from_file_object
        Applies only when providing the file via the `file` parameter. If this option is True,
        attempt to infer last_modified metadata from the bytes, otherwise set it to None.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.

    Other parameters
    ----------------
    include_metadata
        Optionally allows for excluding metadata from the output. Primarily intended
        for when partition_html is called by other partitioners (like partition_email).
    languages
        User-defined value for `metadata.languages` if provided. Otherwise language is detected
        using a naive Bayesian filter via `langdetect`. Multiple languages indicate the text
        could be in any of those languages.
    detect_language_per_element
        Detect language per element instead of at the document level.
    metadata_last_modified
        The last modified date for the document.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags.
    source_format
        The source of the original HTML. If None we will return HTMLElements; partition_rst,
        for example, will pass a value of "rst" so that we return Title vs HTMLTitle.
    """
    # -- parser rejects an empty str, nip that edge-case in the bud here --
    if text is not None and text.strip() == "" and not file and not filename and not url:
        return []

    opts = HtmlPartitionerOptions(
        file_path=filename,
        file=file,
        text=text,
        encoding=encoding,
        url=url,
        headers=headers,
        ssl_verify=ssl_verify,
        date_from_file_object=date_from_file_object,
        metadata_last_modified=metadata_last_modified,
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
    )

    document = HTMLDocument.load(opts)

    elements = list(
        apply_lang_metadata(
            document.elements,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )

    return elements
3
unstructured/partition/html/__init__.py
Normal file
@ -0,0 +1,3 @@
from unstructured.partition.html.partition import partition_html

__all__ = ["partition_html"]
774
unstructured/partition/html/parser.py
Normal file
@ -0,0 +1,774 @@
# pyright: reportPrivateUsage=false

"""Provides the HTML parser used by `partition_html()`.

The names "flow" and "phrasing" derive from the language of the HTML Standard.

PRINCIPLES

- _Elements are paragraphs._ Each paragraph in the HTML document should become a distinct
  element. In particular, a paragraph should not be split into two elements and an element should
  not contain more than one paragraph.

- _An empty paragraph is not an Element._ A paragraph which contains no text or contains only
  whitespace does not give rise to an Element (is skipped).

- _The browser rendering is the document._ The HTML "source-code" is not the document. The
  document is the way that HTML is rendered by a browser (Chrome for a first authority). This
  foundational principle gives rise to a few that are more specific.

- _Whitespace is normalized._ Whitespace used for formatting the HTML source is _normalized_ to a
  single space between text segments. More specifically:
  - Any leading or trailing space on a paragraph is removed.
  - All other runs of whitespace in the paragraph are reduced to a single space (" ").
  - Whitespace is never added where none existed in the HTML source.
  - Whitespace within a `<pre>` element is the exception and is not normalized. Its
    whitespace is preserved excepting a leading and/or trailing newline ("\n").

- _Block-items are paragraphs._ Visible content in HTML can be divided into _block-items_ and
  _phrasing content_ (aka. _inline content_).
  - As an example, a `<p>` element is a block item and a `<b>` element is phrasing.
  - A block item starts a new paragraph and so represents an Element boundary.
  - A phrasing item affects the appearance of a run of text within a paragraph, like
    making it bold or making it into a link.
  - Some elements can take either role, depending upon their ancestors and descendants.
  - The final authority for whether a particular element is displayed as a block or as
    inline "formatting" is the CSS. We do not attempt to interpret the CSS and assume
    the default role for each element.

Other background

- The parser's design is _recursive_, consistent with the recursive (tree) structure of HTML. The
  nodes of the tree are _HTML elements_. Unfortunately this naming sometimes conflicts with
  Unstructured _document-elements_. In the parser code the term "document-element" is used when
  there may be ambiguity.

- The parser is primarily composed of `lxml` Custom Element Classes. The gist is you write a
  class like `Anchor` and then tell the `lxml` parser that all `<a>` elements should be
  instantiated using the `Anchor` class. We also provide a default class for any elements that we
  haven't called out explicitly.

- _Anatomy of an HTML element._ Some basic terms are important to know to understand the domain
  language of the parser code. Consider this example:
  ```html
  <div>
    <p>Text <b>bold child</b> tail of child</p>
    tail of p
  </div>
  ```
  - An element can have _text_.
    - All visible content within an HTML document is the text (or tail) of some element.
    - The text of the `<p>` element (`p.text`) is "Text ".
    - Note the formatting whitespace is included.
  - An element can have _child elements_.
    - The `<p>` element (`p`) is a child of `div`.
    - `b` is a child of `p`.
  - An element can have a _tail_.
    - Whatever text follows an element, before the next element starts, is the tail of
      that element.
    - `b.tail` is `" tail of child"`. Note the included whitespace.
    - `p.tail` is `"\n tail of p\n"`.
    - Tail text is _accessed_ via the element that precedes it but that element does not
      _influence_ its tail text. For example, "tail of child" does not appear in a bold
      typeface even though it is the tail of `b`.
"""
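The text/tail anatomy described above can be verified directly with stock `lxml`, no project code required:

```python
from lxml import etree

div = etree.fromstring("<div>\n<p>Text <b>bold child</b> tail of child</p>\ntail of p\n</div>")
p = div[0]  # -- first (and only) child of the div --
b = p[0]  # -- first (and only) child of the p --
assert p.text == "Text "
assert b.text == "bold child"
assert b.tail == " tail of child"
assert p.tail == "\ntail of p\n"
```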
from __future__ import annotations

import itertools
from collections import defaultdict, deque
from types import MappingProxyType
from typing import Any, Iterable, Iterator, Mapping, NamedTuple, cast

from lxml import etree
from typing_extensions import TypeAlias

from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
    Address,
    Element,
    ElementMetadata,
    EmailAddress,
    ListItem,
    NarrativeText,
    Table,
    Text,
    Title,
)
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_email_address,
    is_possible_narrative_text,
    is_possible_title,
    is_us_city_state_zip,
)
from unstructured.utils import htmlify_matrix_of_cell_texts

# ------------------------------------------------------------------------------------------------
# DOMAIN MODEL
# ------------------------------------------------------------------------------------------------


Annotation: TypeAlias = Mapping[str, Any]
"""A mapping with zero or more keywords, each representing a noted characteristic.

An annotation can be associated with a text segment or element. In general the keys and
value-types differ between the individual (text-segment) and consolidated (Element) forms.
"""


def _consolidate_annotations(text_segments: Iterable[TextSegment]) -> Annotation:
    """Combine individual text-segment annotations into an element-level annotation.

    Sequence is significant.
    """
    combined_annotations = cast(defaultdict[str, list[str]], defaultdict(list))
    for ts in text_segments:
        for k, v in ts.annotation.items():
            if isinstance(v, list):
                combined_annotations[k].extend(cast(list[Any], v))
            else:
                combined_annotations[k].append(v)

    return MappingProxyType(dict(combined_annotations))


def _normalize_text(text: str) -> str:
    """`text` with normalized whitespace.

    - leading and trailing whitespace are removed
    - all whitespace segments within text (spacing between words) are reduced to a single space
      each.

    Produces the empty string when `text` contains only whitespace.
    """
    return " ".join(text.strip().split())
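A couple of assertions pin down the normalization contract (a sketch, importing the private helper from this module):

```python
from unstructured.partition.html.parser import _normalize_text

assert _normalize_text("  For a\n  moment,\tnothing  happened. ") == (
    "For a moment, nothing happened."
)
assert _normalize_text(" \n\t ") == ""  # -- whitespace-only input produces the empty string --
```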
class TextSegment(NamedTuple):
    """An annotated string from a Phrasing element.

    Annotations are for emphasis and for links. The text includes any leading, trailing, and
    inter-word whitespace, just as it occurred in the HTML. The text-segments for a paragraph are
    consolidated once the paragraph is fully parsed, and whitespace is normalized at that time.
    It cannot be normalized prior to that without distorting or losing inter-word spacing.

    However, text within annotations, like the text of a link, is normalized since its full
    extents are known.
    """

    text: str
    annotation: Annotation
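To make the individual-to-consolidated distinction concrete, here is a sketch with illustrative annotation values (the field names are the ones this module produces):

```python
from unstructured.partition.html.parser import TextSegment, _consolidate_annotations

segments = [
    TextSegment("bold ", {"emphasized_text_contents": "bold", "emphasized_text_tags": "b"}),
    TextSegment("link", {"link_texts": "link", "link_urls": "https://example.com"}),
]
# -- scalar values from individual segments are gathered into element-level lists --
assert dict(_consolidate_annotations(segments)) == {
    "emphasized_text_contents": ["bold"],
    "emphasized_text_tags": ["b"],
    "link_texts": ["link"],
    "link_urls": ["https://example.com"],
}
```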
# ------------------------------------------------------------------------------------------------
# CUSTOM ELEMENT-CLASSES
# ------------------------------------------------------------------------------------------------


# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------


class Flow(etree.ElementBase):
    """Base and default class for elements that act like a div.

    These can contain other flow elements or phrasing elements.
    """

    # -- by default, choose the element class based on the form of the text --
    _ElementCls = None

    @property
    def is_phrasing(self) -> bool:
        return False

    def iter_elements(self) -> Iterator[Element]:
        """Generate a document-element for each paragraph (block item) within."""
        # -- place child elements in a queue --
        q: deque[Flow | Phrasing] = deque(self)

        yield from self._element_from_text_or_tail(self.text or "", q, self._ElementCls)

        while q:
            assert not q[0].is_phrasing
            block_item = cast(Flow, q.popleft())
            yield from block_item.iter_elements()
            yield from self._element_from_text_or_tail(block_item.tail or "", q)

    def _category_depth(self, ElementCls: type[Element]) -> int | None:
        """Hierarchy depth for the generated element, None when depth does not apply."""
        if ElementCls is ListItem:
            return (
                len([e for e in self.iterancestors() if e.tag in ("dl", "ol", "ul")])
                if self.tag in ("li", "dd")
                else 0
            )

        if ElementCls is Title:
            return int(self.tag[1]) - 1 if self.tag in ("h1", "h2", "h3", "h4", "h5", "h6") else 0

        return None

    def _element_from_text_or_tail(
        self, text: str, q: deque[Flow | Phrasing], ElementCls: type[Element] | None = None
    ) -> Iterator[Element]:
        """Generate zero-or-one paragraph formed from text and leading phrasing elements.

        Note this mutates `q` by popping phrasing elements off as they are processed.
        """
        text_segments = tuple(self._iter_text_segments(text, q))
        normalized_text = " ".join("".join(ts.text for ts in text_segments).split())

        if not normalized_text:
            return

        # -- if we don't have a more specific element-class, choose one based on the text --
        if ElementCls is None:
            ElementCls = derive_element_type_from_text(normalized_text)
            # -- normalized text that contains only a bullet character is skipped --
            if ElementCls is None:
                return
            # -- derived ListItem means text starts with a bullet character that needs removing --
            if ElementCls is ListItem:
                normalized_text = clean_bullets(normalized_text)
                if not normalized_text:
                    return

        category_depth = self._category_depth(ElementCls)

        yield ElementCls(
            normalized_text,
            metadata=ElementMetadata(
                **_consolidate_annotations(text_segments), category_depth=category_depth
            ),
        )

    def _iter_text_segments(self, text: str, q: deque[Flow | Phrasing]) -> Iterator[TextSegment]:
        """Generate zero-or-more `TextSegment`s from text and leading phrasing elements.

        This is used to process the text or tail of a flow element. For example, this <div>:

            <div>
              For a <b>moment, <i>nothing</i> happened.</b>
              <p>Then, after a second or so, nothing continued to happen.</p>
              The dolphins had always believed that <em>they</em> were far more intelligent.
            </div>

        Should generate three distinct elements, one for each contained line. This method is
        invoked to process the first line beginning "For a" and the third line beginning
        "The dolphins".

        Note this method mutates `q` by popping phrasing elements off as they are processed.
        """
        yield TextSegment(text, {})
        while q and q[0].is_phrasing:
            e = cast(Phrasing, q.popleft())
            yield from e.iter_text_segments()
class BlockItem(Flow):
    """Custom element-class for `<p>` element, `<h1>`, and others like it.

    These can appear in a flow container like a div but can only contain phrasing content.
    """

    # -- Turns out there are no implementation differences so far between Flow and BlockItem, but
    # -- maintaining the distinction for now. We may use it to add hierarchy information or
    # -- customize how we deal with invalid HTML that places flow items inside one of these.


class Heading(Flow):
    """An `<h1>..<h6>` element.

    These are distinguished because they generate a `Title` element.
    """

    _ElementCls = Title


class ListBlock(Flow):
    """Either a `<ul>` or `<ol>` element, maybe a `<dl>` element at some point.

    The primary reason for distinguishing these is that they increment the hierarchy depth for
    lists that are nested inside them.

    Can only contain `<li>` elements (ignoring `<script>` and `<template>`). A list nested inside
    must actually be a child of one of these `<li>` elements.
    """

    # TODO: might want alternate `.iter_elements()` since these can only contain `<li>` elements
    # and not text nodes (I believe).


class ListItemBlock(Flow):
    """A `<li>` element.

    These are distinguished because they generate a `ListItem` element.
    """

    _ElementCls = ListItem
class Pre(BlockItem):
    """Custom element-class for `<pre>` element.

    Can only contain phrasing content.
    """

    def iter_elements(self) -> Iterator[Element]:
        """Generate zero or one document element for the entire `<pre>` element.

        Whitespace is preserved just as it appears in the source HTML.
        """
        pre_text = self.text or ""
        # -- this is pretty subtle, but in a browser, if the opening `<pre>` is immediately
        # -- followed by a newline, that newline is removed from the rendered text.
        if pre_text.startswith("\n"):
            pre_text = pre_text[1:]

        text_segments = tuple(self._iter_text_segments(pre_text, deque(self)))
        text = "".join(ts.text for ts in text_segments)

        # -- also subtle, but in a browser, if the closing `</pre>` tag is immediately preceded
        # -- by a newline (starts in column 1), that preceding newline is removed too.
        if text.endswith("\n"):
            text = text[:-1]

        if not text:
            return

        ElementCls = derive_element_type_from_text(text)
        if not ElementCls:
            return

        yield ElementCls(text, metadata=ElementMetadata(**_consolidate_annotations(text_segments)))
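A sketch of the newline trimming described in the comments above, assuming `html_parser` from the bottom of this module:

```python
from lxml import etree

from unstructured.partition.html.parser import html_parser

body = etree.fromstring(
    "<body><pre>\ndef f():\n    pass\n</pre></body>", html_parser
).find(".//body")
(element,) = body.iter_elements()
# -- leading and trailing newlines are trimmed; interior whitespace is preserved --
assert element.text == "def f():\n    pass"
```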
class TableBlock(Flow):
    """Custom element-class for `<table>` element."""

    def iter_elements(self) -> Iterator[Table]:
        """Generate zero-or-one `Table` element for the entire `<table>` element."""

        # -- NOTE this algorithm handles a nested-table by parsing all of its text into the text
        # -- for the _cell_ containing the table (and this is recursive, so it also covers a
        # -- table nested within a cell of a table that is itself nested within a cell).

        trs = cast(list[etree._Element], self.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr"))

        if not trs:
            return

        def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
            """Generate the text of each cell in `tr`."""
            # -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
            tds = cast(list[etree._Element], tr.xpath("./td | ./th"))
            for td in tds:
                # -- a cell can contain other elements like spans etc. so we can't count on the
                # -- text being directly below the `<td>` element. `.itertext()` gets all of it
                # -- recursively. Filter out whitespace text nodes resulting from HTML formatting.
                stripped_text_nodes = (t.strip() for t in td.itertext())
                yield " ".join(t for t in stripped_text_nodes if t)

        table_data = [list(iter_cell_texts(tr)) for tr in trs]
        html_table = htmlify_matrix_of_cell_texts(table_data)
        table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()

        if table_text == "":
            return

        yield Table(table_text, metadata=ElementMetadata(text_as_html=html_table))
class RemovedBlock(Flow):
    """Elements that are to be ignored.

    An element may be ignored because it commonly contains boilerplate that would dilute the
    meaning extracted rather than contribute to it.

    All contents of a removed block item are ignored but its tail is emitted by its container.
    """

    def iter_elements(self) -> Iterator[Element]:
        """Don't generate any document-elements."""
        return
        yield
# -- PHRASING ELEMENTS ---------------------------------------------------------------------------


class Phrasing(etree.ElementBase):
    """Base-class for phrasing (inline/run) elements like bold and italic."""

    @property
    def is_phrasing(self) -> bool:
        return True

    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
        """Generate text segments for text, children, and tail of this element."""
        inside_emphasis = self._inside_emphasis(enclosing_emphasis)

        yield from self._iter_text_segment(inside_emphasis)

        # -- Recurse into any nested tags. All child tags are assumed to also be phrasing tags. --
        yield from self._iter_child_text_segments(inside_emphasis)

        # -- It is the phrasing element's job to emit its tail when it has one (there is no one
        # -- else who can do it). Note that the tail gets the _enclosing-emphasis_, not the
        # -- _inside-emphasis_ since the tail occurs after this phrasing element's closing tag.
        yield from self._iter_tail_segment(enclosing_emphasis)

    def _annotation(self, text: str, emphasis: str) -> Annotation:
        """Emphasis annotations that apply to text inside this element.

        No annotations are added when the text contains only whitespace. Otherwise, emphasis
        annotations are returned for the text contents, normalized as it will appear in the
        document-element.

        Emphasis annotations apply to the contents of all elements enclosed by the emphasis
        element. Sub-classes that add non-emphasis annotations, like the one for anchor elements,
        will need to override this method.
        """
        # -- emphasis annotation is only added when there is both emphasis and non-whitespace
        # -- text to apply it to
        return MappingProxyType(
            {"emphasized_text_contents": normalized_text, "emphasized_text_tags": emphasis}
            if (normalized_text := _normalize_text(text)) and emphasis
            else {}
        )

    def _inside_emphasis(self, enclosing_emphasis: str) -> str:
        """By default, the inside emphasis is the same as the outside emphasis.

        This method is overridden by sub-classes that annotate particular emphasis types but many
        phrasing elements do not contribute to annotations.
        """
        return enclosing_emphasis

    def _iter_child_text_segments(self, emphasis: str) -> Iterator[TextSegment]:
        """Generate zero-or-more text-segments for phrasing children of this element.

        All generated text segments will be annotated with `emphasis` when it is other than the
        empty string.
        """
        for child in self:
            yield from child.iter_text_segments(emphasis)

    def _iter_tail_segment(self, emphasis: str) -> Iterator[TextSegment]:
        """Generate zero-or-one text-segment for tail of this element.

        No text-segment is generated when this element has no tail node. However a segment _is_
        generated for a whitespace-only tail node.
        """
        if tail := self.tail:
            yield TextSegment(tail, self._annotation(tail, emphasis))

    def _iter_text_segment(self, emphasis: str) -> Iterator[TextSegment]:
        """Generate zero-or-one text-segment for text of this element.

        No text-segment is generated when this element has no text node. However a segment _is_
        generated for a whitespace-only text node.
        """
        if text := self.text:
            yield TextSegment(text, self._annotation(text, emphasis))
class Bold(Phrasing):
    """Provides annotations for bold/strong text."""

    def _inside_emphasis(self, enclosing_emphasis: str) -> str:
        """Emphasis tags that apply to text inside this element.

        Formed by adding "b" (for "bold") to the enclosing emphasis, unless it's already there.
        The returned emphasis string is sorted to make its form canonical, which eases testing.
        For example, "bi" and "ib" are semantically the same but don't directly compare equal in
        a test. Sorting basically gives the string some set-like properties.
        """
        chars = set(enclosing_emphasis + "b")
        return "".join(sorted(chars))


class Italic(Phrasing):
    """Provides annotations for italic/emphasized text."""

    def _inside_emphasis(self, enclosing_emphasis: str) -> str:
        """Emphasis tags that apply to text inside this element.

        Formed by adding "i" (for "italic") to the enclosing emphasis, unless it's already there.
        """
        chars = set(enclosing_emphasis + "i")
        return "".join(sorted(chars))
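How the emphasis runs come out in practice, sketched the same way as the `<pre>` example above (assumes `html_parser` from this module; the per-segment annotations are consolidated into list-valued metadata fields):

```python
from lxml import etree

from unstructured.partition.html.parser import html_parser

body = etree.fromstring(
    "<body><p><b>For a <i>moment</i></b>, nothing happened.</p></body>", html_parser
).find(".//body")
(element,) = body.iter_elements()
assert element.text == "For a moment, nothing happened."
# -- "For a" is bold only; "moment" is inside both <b> and <i>, canonically "bi" --
assert element.metadata.emphasized_text_contents == ["For a", "moment"]
assert element.metadata.emphasized_text_tags == ["b", "bi"]
```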
class LineBreak(Phrasing):
    """A `<br/>` line-break element.

    Its only special behavior is to add whitespace so that phrasing tight on both sides is not
    joined: `abc<br/>def` should become "abc def", not "abcdef".
    """

    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
        """Generate a newline text-segment followed by the tail segment of this element."""
        yield TextSegment("\n", {})
        yield from self._iter_tail_segment(enclosing_emphasis)
class RemovedPhrasing(Phrasing):
    """Phrasing where we want to skip the content.

    - `.is_phrasing` is True so it doesn't break the paragraph like a block.
    - `element.text` is discarded
    - `element.tail` is preserved
    """

    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
        """Generate a text segment for the tail of this element only."""
        yield from self._iter_tail_segment(enclosing_emphasis)
# -- DUAL-ROLE ELEMENTS --------------------------------------------------------------------------


class Anchor(Phrasing, Flow):
    """Custom element-class for `<a>` element.

    Provides link annotations.
    """

    @property
    def is_phrasing(self) -> bool:
        """False when the `<a>` element contains any block items, True otherwise."""
        return all(e.is_phrasing for e in self)

    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
        """Generate text segments for text and tail of this element, when they exist.

        The behavior for an anchor element is slightly different because link annotations are
        only added to the text, not the tail. Also, an anchor may have no children.
        """
        # -- the text of the link is everything inside the `<a>` element, text and child text --
        text_segments = tuple(
            itertools.chain(
                self._iter_text_segment(enclosing_emphasis),
                self._iter_child_text_segments(enclosing_emphasis),
            )
        )

        link_text = "".join(ts.text for ts in text_segments)

        # -- the link_text and link_url annotation refers to the entire text inside the `<a>` --
        link_text_segment = TextSegment(
            link_text, self._link_annotations(link_text, enclosing_emphasis)
        )

        # -- but the emphasis annotations must come from the individual text segments within --
        consolidated_annotations = _consolidate_annotations((link_text_segment, *text_segments))

        # -- generate at most one text-segment for the `<a>` element, the full enclosed text with
        # -- consolidated emphasis and link annotations.
        if link_text:
            yield TextSegment(link_text, consolidated_annotations)

        # -- a tail is emitted when present, whether the anchor itself was emitted or not --
        yield from self._iter_tail_segment(enclosing_emphasis)

    def _link_annotations(self, text: str, emphasis: str) -> Annotation:
        """Link and emphasis annotations that apply to the text of this anchor.

        An anchor element does not add any emphasis but uses any introduced by enclosing
        elements.
        """
        normalized_text = _normalize_text(text)

        if not normalized_text:
            return {}

        def iter_annotation_pairs() -> Iterator[tuple[str, Any]]:
            # -- emphasis annotation is only added when there is enclosing emphasis --
            if emphasis:
                yield "emphasized_text_contents", normalized_text
                yield "emphasized_text_tags", emphasis

            if href := self.get("href"):
                yield "link_texts", normalized_text
                yield "link_urls", href

        return MappingProxyType(dict(iter_annotation_pairs()))
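And the link annotations, sketched the same way:

```python
from lxml import etree

from unstructured.partition.html.parser import html_parser

body = etree.fromstring(
    '<body><p>See <a href="https://example.com">the docs</a> for more.</p></body>', html_parser
).find(".//body")
(element,) = body.iter_elements()
assert element.text == "See the docs for more."
# -- the annotation covers the full normalized text inside the <a>, not its tail --
assert element.metadata.link_texts == ["the docs"]
assert element.metadata.link_urls == ["https://example.com"]
```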
# -- DEFAULT ELEMENT -----------------------------------------------------------------------------


class DefaultElement(Flow, Phrasing):
    """Custom element-class used for any element without an assigned custom element class.

    An unrecognized element is given both Flow (block) and Phrasing (inline) behaviors. It
    behaves like a Flow element when nested in a Flow element and like a Phrasing element when
    nested in a Phrasing element.

    The contents of the element are skipped in either case, but its tail is not when it behaves
    as a Phrasing element. The tail is processed by its parent when that is a Flow element.
    """

    @property
    def is_phrasing(self) -> bool:
        """If asked (by a parent Flow element), identify as a phrasing element.

        It's not possible to determine the display intent (block|inline) of an unknown element
        (like `<foobar>`) and phrasing is less disruptive, adding the tail of this element to any
        text or phrasing content before and after it without starting a new paragraph.
        """
        return True

    def iter_elements(self) -> Iterator[Element]:
        """Don't generate any document-elements when behaving like a Flow element.

        Because the element identifies as phrasing and will always be enclosed by at least a
        `<body>` element, this method should never be called. However, it's easier to prove it
        does the appropriate thing if it is called than prove that it can never happen.
        """
        return
        yield

    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
        """Generate a text segment for the tail of this element only.

        This method is only called on Phrasing elements and their children. In that case, act
        like a Phrasing element but don't generate a text segment for this element or any
        children. Do however generate a tail text-segment.
        """
        # -- It is the phrasing element's job to emit its tail when it has one (there is no one
        # -- else who can do it). Note that the tail gets the _enclosing-emphasis_, not the
        # -- _inside-emphasis_ since the tail occurs after this phrasing element's closing tag.
        yield from self._iter_tail_segment(enclosing_emphasis)
# ------------------------------------------------------------------------------------------------
# TEXT-ELEMENT CLASSIFIER
# ------------------------------------------------------------------------------------------------


def derive_element_type_from_text(text: str) -> type[Text] | None:
    """The document-element class of the appropriate sub-type for `text`, None to skip it."""
    if is_bulleted_text(text):
        return ListItem

    if is_us_city_state_zip(text):
        return Address

    if is_email_address(text):
        return EmailAddress

    if len(text) < 2:
        return None

    if is_possible_narrative_text(text):
        return NarrativeText

    # NOTE (scanny): Classifying short paragraphs as titles produces noise much more frequently
    # than it does value. A `Title` element is very consequential in its effect on chunking and
    # document hierarchy. Classifying any small paragraph as a heading is frequently wrong and
    # throws off these important downstream processes much more than missing the occasional
    # heading does. If we want to infer headings, I think we have to be much more intelligent
    # about it and consider what elements came before and after to see if the text _behaves_ like
    # a heading, maybe whether it is bold and how many text elements follow it before the next
    # title and how long since the prior title, whether `h1..h6` are used elsewhere in the
    # document, etc.
    if is_possible_title(text):
        return Title

    return Text
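Illustrative classifications; the first three follow directly from the checks above, while the exact narrative/title split depends on the `text_type` predicates:

```python
from unstructured.documents.elements import Address, EmailAddress, ListItem
from unstructured.partition.html.parser import derive_element_type_from_text

assert derive_element_type_from_text("\u2022 Lorem ipsum") is ListItem
assert derive_element_type_from_text("Doylestown, PA 18901") is Address
assert derive_element_type_from_text("hello@example.com") is EmailAddress
assert derive_element_type_from_text("x") is None  # -- under two characters --
# -- longer prose falls through to NarrativeText, Title, or the Text fallback --
```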
# ------------------------------------------------------------------------------------------------
# HTML PARSER
# ------------------------------------------------------------------------------------------------


html_parser = etree.HTMLParser(remove_comments=True)
# -- elements that don't have a registered class get DefaultElement --
fallback = etree.ElementDefaultClassLookup(element=DefaultElement)
# -- elements that do have a registered class are assigned that class via lookup --
element_class_lookup = etree.ElementNamespaceClassLookup(fallback)
html_parser.set_element_class_lookup(element_class_lookup)

# -- register classes --
element_class_lookup.get_namespace(None).update(
    {
        # -- flow/containers --
        "address": Flow,
        "article": Flow,
        "aside": Flow,
        "blockquote": Flow,
        "body": Flow,
        "center": Flow,
        "div": Flow,
        "footer": Flow,
        "header": Flow,
        "hgroup": Flow,
        "main": Flow,
        "section": Flow,
        # -- block items --
        "h1": Heading,
        "h2": Heading,
        "h3": Heading,
        "h4": Heading,
        "h5": Heading,
        "h6": Heading,
        "p": BlockItem,
        "pre": Pre,
        # -- list blocks --
        "ol": ListBlock,
        "ul": ListBlock,
        "li": ListItemBlock,
        # -- table --
        "table": TableBlock,
        # -- annotated phrasing --
        "a": Anchor,
        "b": Bold,
        "em": Italic,
        "i": Italic,
        "strong": Bold,
        # -- transparent phrasing --
        "abbr": Phrasing,  # -- abbreviation, like "LLM (Large Language Model)" --
        "bdi": Phrasing,  # -- Bidirectional Isolate - important for RTL languages --
        "bdo": Phrasing,  # -- Bidirectional Override - may reverse text direction --
        "big": Phrasing,  # -- deprecated --
        "br": LineBreak,  # -- line break --
        "cite": Phrasing,  # -- title of book or article etc. --
        "code": Phrasing,  # -- monospaced terminal font --
        "data": Phrasing,  # -- similar to `time`, provides machine-readable value as attribute --
        "dfn": Phrasing,  # -- definition, like new term in italic when first introduced --
        "kbd": Phrasing,  # -- font that looks like keyboard keys --
        "mark": Phrasing,  # -- like yellow highlighter --
        "meter": Phrasing,  # -- bar thermometer progress-meter thing --
        "q": Phrasing,  # -- inline quotation, usually quoted and maybe italic --
        "s": Phrasing,  # -- strikethrough --
        "samp": Phrasing,  # -- sample terminal output; like markdown back-ticks for inline code --
        "small": Phrasing,  # -- fine-print; maybe likely boilerplate --
        "span": Phrasing,
        "strike": Phrasing,  # -- deprecated - obsolete version of `del` or `s` --
        "sub": Phrasing,  # -- subscript --
        "sup": Phrasing,  # -- superscript --
        "time": Phrasing,  # -- wraps human-readable time, machine-readable time as attribute --
        "tt": Phrasing,  # -- deprecated - "teletype", obsolete version of `code` or `samp` --
        "u": Phrasing,  # -- red squiggly underline for e.g. spelling mistake; was underscore --
        "var": Phrasing,  # -- variable like "x" in a mathematical expression --
        "wbr": Phrasing,  # -- word-break opportunity; empty --
        # -- removed phrasing --
        "button": RemovedPhrasing,
        "label": RemovedPhrasing,
        # -- removed block --
        "details": RemovedBlock,  # -- likely boilerplate --
        "dl": RemovedBlock,
        "dd": RemovedBlock,
        "dt": RemovedBlock,
        "figure": RemovedBlock,
        "hr": RemovedBlock,
        "nav": RemovedBlock,
        "template": RemovedBlock,
        # -- removed form-related --
        "form": RemovedBlock,
        "input": RemovedBlock,
        "summary": RemovedBlock,  # -- child of `details` --
    }
)
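Anything not in this registry gets `DefaultElement` from the fallback lookup, which identifies as phrasing and contributes only its tail. A sketch of that behavior:

```python
from lxml import etree

from unstructured.partition.html.parser import html_parser

body = etree.fromstring(
    "<body><p>abc <foobar>skipped</foobar> def</p></body>", html_parser
).find(".//body")
(element,) = body.iter_elements()
assert element.text == "abc def"  # -- <foobar> content dropped, its tail kept --
```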
268
unstructured/partition/html/partition.py
Normal file
@ -0,0 +1,268 @@
# pyright: reportPrivateUsage=false

"""Provides `partition_html()`."""

from __future__ import annotations

from typing import IO, Any, Iterator, Optional, cast

import requests
from lxml import etree

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.html.parser import Flow, html_parser
from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty


@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@add_chunking_strategy
def partition_html(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    encoding: Optional[str] = None,
    url: Optional[str] = None,
    headers: dict[str, str] = {},
    ssl_verify: bool = True,
    date_from_file_object: bool = False,
    detect_language_per_element: bool = False,
    languages: Optional[list[str]] = ["auto"],
    metadata_last_modified: Optional[str] = None,
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    **kwargs: Any,
) -> list[Element]:
    """Partitions an HTML document into its constituent elements.

    HTML source parameters
    ----------------------
    The HTML to be partitioned can be specified four different ways:

    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    headers
        The HTTP headers to be used in the HTTP request when `url` is specified.
    ssl_verify
        If the URL parameter is set, determines whether or not SSL verification is performed
        on the HTTP request.
    date_from_file_object
        Applies only when providing the file via the `file` parameter. If this option is True,
        attempt to infer last_modified metadata from the bytes, otherwise set it to None.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.

    Other parameters
    ----------------
    include_metadata
        Optionally allows for excluding metadata from the output. Primarily intended
        for when partition_html is called by other partitioners (like partition_email).
    languages
        User-defined value for `metadata.languages` if provided. Otherwise language is detected
        using a naive Bayesian filter via `langdetect`. Multiple languages indicate the text
        could be in any of those languages.
    detect_language_per_element
        Detect language per element instead of at the document level.
    metadata_last_modified
        The last modified date for the document.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags.
    """
    # -- parser rejects an empty str, nip that edge-case in the bud here --
    if text is not None and text.strip() == "" and not file and not filename and not url:
        return []

    opts = HtmlPartitionerOptions(
        file_path=filename,
        file=file,
        text=text,
        encoding=encoding,
        url=url,
        headers=headers,
        ssl_verify=ssl_verify,
        date_from_file_object=date_from_file_object,
        metadata_last_modified=metadata_last_modified,
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
    )

    document = HTMLDocument.load(opts)

    elements = list(
        apply_lang_metadata(
            document.elements,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )

    return elements
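Typical invocations of the public entry point; the path and URL here are illustrative:

```python
from unstructured.partition.html import partition_html

elements = partition_html(text="<h1>A Great Day</h1><p>It was raining outside.</p>")
elements = partition_html(filename="example-docs/example-10k.html")
elements = partition_html(url="https://example.com", headers={"User-Agent": "curl/8.0"})
```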
class HtmlPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""

    def __init__(
        self,
        *,
        file_path: str | None,
        file: IO[bytes] | None,
        text: str | None,
        encoding: str | None,
        url: str | None,
        headers: dict[str, str],
        ssl_verify: bool,
        date_from_file_object: bool,
        metadata_last_modified: str | None,
        skip_headers_and_footers: bool,
        detection_origin: str | None,
    ):
        self._file_path = file_path
        self._file = file
        self._text = text
        self._encoding = encoding
        self._url = url
        self._headers = headers
        self._ssl_verify = ssl_verify
        self._date_from_file_object = date_from_file_object
        self._metadata_last_modified = metadata_last_modified
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin

    @lazyproperty
    def detection_origin(self) -> str | None:
        """Trace of initial partitioner to be included in metadata for debugging purposes."""
        return self._detection_origin

    @lazyproperty
    def encoding(self) -> str | None:
        """Caller-provided encoding used to store HTML character stream as bytes.

        `None` when no encoding was provided and encoding should be auto-detected.
        """
        return self._encoding

    @lazyproperty
    def html_text(self) -> str:
        """The HTML document as a string, loaded from wherever the caller specified."""
        if self._file_path:
            return read_txt_file(filename=self._file_path, encoding=self._encoding)[1]

        if self._file:
            return read_txt_file(file=self._file, encoding=self._encoding)[1]

        if self._text:
            return str(self._text)

        if self._url:
            response = requests.get(self._url, headers=self._headers, verify=self._ssl_verify)
            if not response.ok:
                raise ValueError(
                    f"Error status code on GET of provided URL: {response.status_code}"
                )
            content_type = response.headers.get("Content-Type", "")
            if not content_type.startswith("text/html"):
                raise ValueError(f"Expected content type text/html. Got {content_type}.")

            return response.text

        raise ValueError("Exactly one of filename, file, text, or url must be specified.")

    @lazyproperty
    def last_modified(self) -> str | None:
        """The best last-modified date available, None if no sources are available."""
        # -- Value explicitly specified by caller takes precedence. This is used for example when
        # -- this file was converted from another format.
        if self._metadata_last_modified:
            return self._metadata_last_modified

        if self._file_path:
            return (
                None
                if is_temp_file_path(self._file_path)
                else get_last_modified_date(self._file_path)
            )

        if self._file:
            return (
                get_last_modified_date_from_file(self._file)
                if self._date_from_file_object
                else None
            )

        return None

    @lazyproperty
    def skip_headers_and_footers(self) -> bool:
        """When True, elements located within a header or footer are pruned."""
        return self._skip_headers_and_footers
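The source precedence in `html_text` is file_path, then file, then text, then url; a minimal sketch using the text source:

```python
from unstructured.partition.html.partition import HtmlPartitionerOptions

opts = HtmlPartitionerOptions(
    file_path=None,
    file=None,
    text="<p>Hello</p>",
    encoding=None,
    url=None,
    headers={},
    ssl_verify=True,
    date_from_file_object=False,
    metadata_last_modified=None,
    skip_headers_and_footers=False,
    detection_origin="html",
)
assert opts.html_text == "<p>Hello</p>"
```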
class _HtmlPartitioner:  # pyright: ignore[reportUnusedClass]
    """Partition HTML document into document-elements."""

    def __init__(self, opts: HtmlPartitionerOptions):
        self._opts = opts

    @classmethod
    def iter_elements(cls, opts: HtmlPartitionerOptions) -> Iterator[Element]:
        """Partition HTML document provided by `opts` into document-elements."""
        yield from cls(opts)._iter_elements()

    def _iter_elements(self) -> Iterator[Element]:
        """Generate document-elements (e.g. Title, NarrativeText) parsed from the document.

        Elements appear in document order.
        """
        for e in self._main.iter_elements():
            e.metadata.last_modified = self._opts.last_modified
            e.metadata.detection_origin = self._opts.detection_origin
            yield e

    @lazyproperty
    def _main(self) -> Flow:
        """The root of the core content: `<main>` if present, else `<body>`, else the root."""
        # NOTE(scanny) - get `html_text` first so any encoding error raised is not confused with
        # a recoverable parsing error.
        html_text = self._opts.html_text

        # NOTE(scanny) - `lxml` will not parse a `str` that includes an XML encoding declaration
        # and will raise the following error:
        #     ValueError: Unicode strings with encoding declaration are not supported. ...
        # This is not valid HTML (it would be in XHTML), but Chrome accepts it so we work around
        # it by UTF-8 encoding the str bytes and parsing those.
        try:
            root = etree.fromstring(html_text, html_parser)
        except ValueError:
            root = etree.fromstring(html_text.encode("utf-8"), html_parser)

        # -- remove a variety of HTML element types like <script> and <style> that we prefer not
        # -- to encounter while parsing.
        etree.strip_elements(
            root, ["del", "img", "link", "meta", "noscript", "script", "style"], with_tail=False
        )

        # -- remove <header> and <footer> tags if the caller doesn't want their contents --
        if self._opts.skip_headers_and_footers:
            etree.strip_elements(root, ["header", "footer"], with_tail=False)

        # -- jump to the core content if the document indicates where it is --
        if (main := root.find(".//main")) is not None:
            return cast(Flow, main)
        if (body := root.find(".//body")) is not None:
            return cast(Flow, body)
        return cast(Flow, root)
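The partitioner is exercised through its classmethod; a one-line sketch using the `opts` built in the earlier example (the class is private, so the import is for illustration only):

```python
from unstructured.partition.html.partition import _HtmlPartitioner

elements = list(_HtmlPartitioner.iter_elements(opts))
```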