rfctr(html): break coupling to DocumentLayout (#3180)

**Summary**
Remove use of `partition.common.document_to_element_list()` by
`HTMLDocument`. The transitive coupling with layout-inference through
this shared function has been a source of frustration and a drain on
engineering time, and there's no compelling reason for the two to share
this code.

**Additional Context**
`partition_html()` uses `partition.common.document_to_element_list()` to
get finalized elements from `HTMLDocument` (pages). This gives rise to a
very nasty coupling between `DocumentLayout`, used by
`unstructured_inference`, and `HTMLDocument`.
`document_to_element_list()` has evolved to work for both callers, but
they share very few common characteristics with each other.

This coupling is bad news for us and also, importantly, for the
inference and page layout folks working on PDF and images.

Break that coupling so those inference-related functions can evolve
whatever way they need to without being dragged down by legacy
`HTMLDocument` connections.

The initial step is to extract a `document_to_element_list()` function
of our own, getting rid of the coordinates and other
`DocumentLayout`-related bits we don't need. As you'll see in the next
few PRs, all of this `document_to_element_list()` code will end up
either going away or being relocated closer to where it's used in
`HTMLDocument`.
This commit is contained in:
Steve Canny 2024-06-11 13:54:11 -07:00 committed by GitHub
parent e39ee16161
commit 2f0400f279
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 279 additions and 86 deletions

View File

@ -1,4 +1,4 @@
## 0.14.6-dev1 ## 0.14.6-dev2
### Enhancements ### Enhancements

View File

@ -28,7 +28,13 @@ from unstructured.documents.elements import (
TableChunk, TableChunk,
Title, Title,
) )
from unstructured.documents.html_elements import HTMLTable, TagsMixin from unstructured.documents.html_elements import (
HTMLListItem,
HTMLNarrativeText,
HTMLTable,
HTMLTitle,
TagsMixin,
)
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
# -- document-source (filename, file, text, url) ------------------------------------------------- # -- document-source (filename, file, text, url) -------------------------------------------------
@ -171,26 +177,6 @@ def test_pre_tag_parsing_respects_order():
] ]
@pytest.mark.parametrize(
("tag", "expected_text_as_html"),
[
("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
],
)
def test_partition_html_with_table_without_tbody(tag: str, expected_text_as_html: str):
elements = partition_html(
text=(
f"<table>\n"
f" <{tag}>\n"
f" <tr><th>Header 1</th><th>Header 2</th></tr>\n"
f" </{tag}>\n"
f"</table>"
)
)
assert elements[0].metadata.text_as_html == expected_text_as_html
def test_partition_html_b_tag_parsing(): def test_partition_html_b_tag_parsing():
elements = partition_html( elements = partition_html(
text=( text=(
@ -467,6 +453,58 @@ def test_element_ids_are_deterministic():
assert ids == ids_2 assert ids == ids_2
# -- .metadata.category_depth + parent_id --------------------------------------------------------
def test_partition_html_records_hierarchy_metadata():
    """`category_depth` and `parent_id` metadata reflect each element's place in the document.

    The document has a preamble paragraph, a heading, a body paragraph, a two-item list and a
    trailing paragraph. Each resulting element is checked for its class, its text, its
    `.metadata.category_depth` and its `.metadata.parent_id`.
    """
    html_text = (
        "<html>\n"
        " <p>Preamble gets no category_depth or parent_id</p>\n"
        " <h1>Heading gets category_depth but no parent_id</h1>\n"
        " <p>Body paragraph gets parent_id but no category_depth</p>\n"
        " <ul>\n"
        " <li>List item gets category_depth and parent_id</li>\n"
        " <li>Second list item gets category_depth and parent_id</li>\n"
        " </ul>\n"
        " <p>Body paragraph after list gets parent_id but no category_depth</p>\n"
        "</html>\n"
    )

    elements = partition_html(text=html_text)

    assert len(elements) == 6

    # -- every element that has a parent is expected to point at the heading --
    heading_id = elements[1].id
    # -- (element-class, text, category_depth, parent_id) expected of each element, in order --
    expectations = [
        (HTMLNarrativeText, "Preamble gets no category_depth or parent_id", None, None),
        (HTMLTitle, "Heading gets category_depth but no parent_id", 0, None),
        (
            HTMLNarrativeText,
            "Body paragraph gets parent_id but no category_depth",
            None,
            heading_id,
        ),
        (HTMLListItem, "List item gets category_depth and parent_id", 1, heading_id),
        (HTMLListItem, "Second list item gets category_depth and parent_id", 1, heading_id),
        (
            HTMLNarrativeText,
            "Body paragraph after list gets parent_id but no category_depth",
            None,
            heading_id,
        ),
    ]

    for element, (element_cls, text, category_depth, parent_id) in zip(elements, expectations):
        assert isinstance(element, element_cls)
        assert element.text == text

        if category_depth is None:
            assert element.metadata.category_depth is None
        else:
            assert element.metadata.category_depth == category_depth

        if parent_id is None:
            assert element.metadata.parent_id is None
        else:
            assert element.metadata.parent_id == parent_id
# -- .metadata.emphasis -------------------------------------------------------------------------- # -- .metadata.emphasis --------------------------------------------------------------------------
@ -509,6 +547,14 @@ def test_partition_html_grabs_emphasized_texts():
# -- .metadata.filename -------------------------------------------------------------------------- # -- .metadata.filename --------------------------------------------------------------------------
def test_partition_html_from_filename_uses_source_filename_for_metadata_by_default():
    """Without `metadata_filename`, filename metadata comes from the path that was partitioned."""
    source_path = example_doc_path("example-10k-1p.html")
    expected_directory = example_doc_path("")

    elements = partition_html(source_path)

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "example-10k-1p.html"
    for element in elements:
        assert element.metadata.file_directory == expected_directory
def test_partition_html_from_filename_prefers_metadata_filename(): def test_partition_html_from_filename_prefers_metadata_filename():
elements = partition_html(example_doc_path("example-10k-1p.html"), metadata_filename="test") elements = partition_html(example_doc_path("example-10k-1p.html"), metadata_filename="test")
@ -711,6 +757,75 @@ def test_partition_html_links():
assert e.metadata.link_start_indexes == [0, 12] assert e.metadata.link_start_indexes == [0, 12]
# -- .metadata.text_as_html ----------------------------------------------------------------------
@pytest.mark.parametrize(
    ("html_str", "expected_value"),
    [
        (
            "<table><tr><th>Header 1</th><th>Header 2</th></tr></table>",
            "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>",
        ),
        (
            "<table>"
            "<tr><td>Dimensions</td><td>Weight</td></tr>"
            "<tr><td>4'-6\" x 1'</td><td>18 kg</td></tr>"
            "</table>",
            # ----------
            "<table>"
            "<tr><td>Dimensions</td><td>Weight</td></tr>"
            "<tr><td>4&#x27;-6&quot; x 1&#x27;</td><td>18 kg</td></tr>"
            "</table>",
        ),
    ],
)
def test_partition_html_applies_text_as_html_metadata_for_tables(
    html_str: str, expected_value: str
):
    """A lone `<table>` yields exactly one element carrying the expected `text_as_html`.

    The first case has `<th>` cells in the input and `<td>` cells in the expected value; the
    second has quote characters in the input and HTML entities in the expected value.
    """
    elements = partition_html(text=html_str)

    assert len(elements) == 1
    table = elements[0]
    assert table.metadata.text_as_html == expected_value
@pytest.mark.parametrize(
    ("tag", "expected_text_as_html"),
    [
        ("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
        ("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
    ],
)
def test_partition_html_parses_table_without_tbody(tag: str, expected_text_as_html: str):
    """A table whose only row sits inside `<thead>` or `<tfoot>` still gets `text_as_html`."""
    html_text = (
        f"<table>\n"
        f" <{tag}>\n"
        f" <tr><th>Header 1</th><th>Header 2</th></tr>\n"
        f" </{tag}>\n"
        f"</table>"
    )

    elements = partition_html(text=html_text)

    first_element = elements[0]
    assert first_element.metadata.text_as_html == expected_text_as_html
# -- .metadata.url -------------------------------------------------------------------------------
def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock):
    """Partitioning from a URL records that URL in `.metadata.url` of every element.

    The `requests_get_` mock stands in for the HTTP request and returns the example document.
    """
    source_url = "https://trusttheforceluke.com"
    requests_get_.return_value = FakeResponse(
        text=example_doc_text("example-10k-1p.html"),
        status_code=200,
        headers={"Content-Type": "text/html"},
    )

    elements = partition_html(url=source_url)

    # -- the document is fetched exactly once, with default headers and SSL verification --
    requests_get_.assert_called_once_with(source_url, headers={}, verify=True)
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.url == source_url
# -- miscellaneous ------------------------------------------------------------------------------- # -- miscellaneous -------------------------------------------------------------------------------

View File

@ -1 +1 @@
__version__ = "0.14.6-dev1" # pragma: no cover __version__ = "0.14.6-dev2" # pragma: no cover

View File

@ -560,10 +560,12 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
"""Post-process element-metadata for this document. """Post-process element-metadata for this document.
This decorator adds a post-processing step to a document partitioner. It adds documentation for This decorator adds a post-processing step to a document partitioner.
`metadata_filename` and `include_metadata` parameters if not present. Also adds regex-metadata
when `regex_metadata` keyword-argument is provided and changes the element-id to a UUID when - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
`unique_element_ids` argument is provided and True. - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
- Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
""" """
def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:

View File

@ -7,7 +7,7 @@ from typing import Any, Final, Iterator, Optional, cast
from lxml import etree from lxml import etree
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.documents.elements import Element, ElementMetadata, Link from unstructured.documents.elements import Element, ElementMetadata, Link, PageBreak
from unstructured.documents.html_elements import ( from unstructured.documents.html_elements import (
HTMLAddress, HTMLAddress,
HTMLEmailAddress, HTMLEmailAddress,
@ -16,6 +16,7 @@ from unstructured.documents.html_elements import (
HTMLTable, HTMLTable,
HTMLText, HTMLText,
HTMLTitle, HTMLTitle,
TagsMixin,
) )
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger from unstructured.logger import logger
@ -60,7 +61,7 @@ class HTMLDocument:
@classmethod @classmethod
def from_string(cls, text: str, **kwargs: Any) -> HTMLDocument: def from_string(cls, text: str, **kwargs: Any) -> HTMLDocument:
"""Supports reading in an XML file as a raw string rather than as a file.""" """Supports reading in an HTML file as a string rather than as a file."""
logger.info("Reading document from string ...") logger.info("Reading document from string ...")
return cls(text, **kwargs) return cls(text, **kwargs)
@ -230,6 +231,78 @@ class Page:
return "\n\n".join([str(element) for element in self.elements]) return "\n\n".join([str(element) for element in self.elements])
# -- TEMPORARY EXTRACTION OF document_to_element_list() ------------------------------------------
def document_to_element_list(
    document: HTMLDocument,
    *,
    include_page_breaks: bool = False,
    last_modified: str | None,
    starting_page_number: int = 1,
    detection_origin: str | None = None,
    **kwargs: Any,
) -> Iterator[Element]:
    """Generate each element of `document`, page by page, after applying its metadata.

    Each element in each page of `document.pages` is passed to `add_element_metadata()` along
    with `detection_origin`, `last_modified`, the page number and any extra `kwargs`, and is then
    yielded. Pages are numbered consecutively starting at `starting_page_number`.

    When `include_page_breaks` is True, a `PageBreak` element with empty text is generated after
    the elements of a page.
    """
    # -- one more than the page-number assigned to the last page --
    page_number_limit = len(document.pages) + starting_page_number

    for page_number, page in enumerate(document.pages, start=starting_page_number):
        for element in page.elements:
            add_element_metadata(
                element,
                detection_origin=detection_origin,
                last_modified=last_modified,
                page_number=page_number,
                **kwargs,
            )
            yield element

        # NOTE(review): `page_number` never reaches `page_number_limit` here, so this looks like
        # it also emits a break after the final page -- confirm that is the intended behavior.
        if include_page_breaks and page_number < page_number_limit:
            yield PageBreak(text="")
def add_element_metadata(
    element: Element,
    *,
    detection_origin: str | None,
    last_modified: str | None,
    page_number: int | None,
    **kwargs: Any,
) -> Element:
    """Apply emphasis, link, and document-level metadata to `element` and return it.

    The emphasized-text and link records carried by the element are split into parallel lists
    (contents/tags and urls/texts/start-indexes respectively); an empty list is recorded as
    `None`. Those values, together with `last_modified`, `page_number` and the element's
    `text_as_html`, are passed to `element.metadata.update()`. `.metadata.detection_origin` is
    assigned only when `detection_origin` is not None.

    Extra keyword arguments are accepted but not used.
    """
    # -- the emphasis, link and text_as_html attributes read below are expected on TagsMixin --
    assert isinstance(element, TagsMixin)

    emphasized_texts = element.emphasized_texts
    links = element.links

    element.metadata.update(
        ElementMetadata(
            emphasized_text_contents=[et.get("text") or "" for et in emphasized_texts] or None,
            emphasized_text_tags=[et.get("tag") or "" for et in emphasized_texts] or None,
            last_modified=last_modified,
            link_start_indexes=[link.get("start_index") for link in links] or None,
            link_texts=[link.get("text") or "" for link in links] or None,
            link_urls=[link.get("url") for link in links] or None,
            page_number=page_number,
            text_as_html=element.text_as_html,
        )
    )

    if detection_origin is not None:
        element.metadata.detection_origin = detection_origin

    return element
# -- tag classifiers ----------------------------------------------------------------------------- # -- tag classifiers -----------------------------------------------------------------------------

View File

@ -610,7 +610,15 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element
def add_filetype( def add_filetype(
filetype: FileType, filetype: FileType,
) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: ) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]:
"""...""" """Post-process element-metadata for list[Element] from partitioning.
This decorator adds a post-processing step to a document partitioner.
- Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
- Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
- Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
"""
def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
@functools.wraps(func) @functools.wraps(func)

View File

@ -39,8 +39,6 @@ if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.html import HTMLDocument
HIERARCHY_RULE_SET = { HIERARCHY_RULE_SET = {
"Title": [ "Title": [
"Text", "Text",
@ -542,8 +540,10 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in # FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
# unstructured.documents.html, which imports this module so we can't import the class for type # unstructured.documents.html, which imports this module so we can't import the class for type
# hints. Moreover, those two types of documents have different lists of attributes # hints. Moreover, those two types of documents have different lists of attributes
# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by
# DocumentLayout only.
def document_to_element_list( def document_to_element_list(
document: "DocumentLayout | HTMLDocument", document: DocumentLayout,
sortable: bool = False, sortable: bool = False,
include_page_breaks: bool = False, include_page_breaks: bool = False,
last_modification_date: Optional[str] = None, last_modification_date: Optional[str] = None,
@ -555,7 +555,7 @@ def document_to_element_list(
starting_page_number: int = 1, starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements.""" """Converts a DocumentLayout object to a list of unstructured elements."""
elements: list[Element] = [] elements: list[Element] = []
num_pages = len(document.pages) num_pages = len(document.pages)

View File

@ -6,13 +6,12 @@ import requests
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument from unstructured.documents.html import HTMLDocument, document_to_element_list
from unstructured.documents.html_elements import TagsMixin from unstructured.documents.html_elements import TagsMixin
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.file_conversion import convert_file_to_html_text from unstructured.file_utils.file_conversion import convert_file_to_html_text
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import ( from unstructured.partition.common import (
document_to_element_list,
exactly_one, exactly_one,
get_last_modified_date, get_last_modified_date,
get_last_modified_date_from_file, get_last_modified_date_from_file,
@ -25,30 +24,27 @@ from unstructured.partition.lang import apply_lang_metadata
@add_chunking_strategy @add_chunking_strategy
def partition_html( def partition_html(
filename: Optional[str] = None, filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None, file: Optional[IO[bytes]] = None,
text: Optional[str] = None, text: Optional[str] = None,
url: Optional[str] = None,
encoding: Optional[str] = None, encoding: Optional[str] = None,
include_page_breaks: bool = False, url: Optional[str] = None,
include_metadata: bool = True,
headers: dict[str, str] = {}, headers: dict[str, str] = {},
ssl_verify: bool = True, ssl_verify: bool = True,
source_format: Optional[str] = None, date_from_file_object: bool = False,
detect_language_per_element: bool = False,
html_assemble_articles: bool = False, html_assemble_articles: bool = False,
metadata_filename: Optional[str] = None, languages: Optional[list[str]] = ["auto"],
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
skip_headers_and_footers: bool = False, skip_headers_and_footers: bool = False,
chunking_strategy: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
detection_origin: Optional[str] = None,
date_from_file_object: bool = False,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partitions an HTML document into its constituent elements. """Partitions an HTML document into its constituent elements.
Parameters HTML source parameters
---------- ----------------------
The HTML to be partitioned can be specified four different ways:
filename filename
A string defining the target filename path. A string defining the target filename path.
file file
@ -57,25 +53,23 @@ def partition_html(
The string representation of the HTML document. The string representation of the HTML document.
url url
The URL of a webpage to parse. Only for URLs that return an HTML document. The URL of a webpage to parse. Only for URLs that return an HTML document.
headers
The HTTP headers to be used in the HTTP request when `url` is specified.
ssl_verify
If the URL parameter is set, determines whether or not SSL verification is performed
on the HTTP request.
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None.
encoding encoding
The encoding method used to decode the text input. If None, utf-8 will be used. The encoding method used to decode the text input. If None, utf-8 will be used.
include_page_breaks
If True, includes page breaks at the end of each page in the document. Other parameters
----------------
include_metadata include_metadata
Optionally allows for excluding metadata from the output. Primarily intended Optionally allows for excluding metadata from the output. Primarily intended
for when partition_html is called in other partition bricks (like partition_email) for when partition_html is called by other partitioners (like partition_email).
headers
The headers to be used in conjunction with the HTTP request if URL is set.
ssl_verify
If the URL parameter is set, determines whether or not partition uses SSL verification
in the HTTP request.
source_format
The source of the original html. If None we will return HTMLElements but for example
partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle
metadata_last_modified
The last modified date for the document.
skip_headers_and_footers
If True, ignores any content that is within <header> or <footer> tags
languages languages
User defined value for `metadata.languages` if provided. Otherwise language is detected User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -83,26 +77,35 @@ def partition_html(
Additional Parameters: Additional Parameters:
detect_language_per_element detect_language_per_element
Detect language per element instead of at the document level. Detect language per element instead of at the document level.
date_from_file_object metadata_last_modified
Applies only when providing file via `file` parameter. If this option is True, attempt The last modified date for the document.
infer last_modified metadata from bytes, otherwise set it to None. skip_headers_and_footers
If True, ignores any content that is within <header> or <footer> tags
source_format
The source of the original html. If None we will return HTMLElements but for example
partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle
""" """
if text is not None and text.strip() == "" and not file and not filename and not url: if text is not None and text.strip() == "" and not file and not filename and not url:
return [] return []
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text, url=url) exactly_one(filename=filename, file=file, text=text, url=url)
last_modification_date = None
def last_modified():
if metadata_last_modified:
return metadata_last_modified
if filename:
return get_last_modified_date(filename)
if file and date_from_file_object:
return get_last_modified_date_from_file(file)
return None
if filename is not None: if filename is not None:
last_modification_date = get_last_modified_date(filename)
document = HTMLDocument.from_file( document = HTMLDocument.from_file(
filename, encoding=encoding, assemble_articles=html_assemble_articles filename, encoding=encoding, assemble_articles=html_assemble_articles
) )
elif file is not None: elif file is not None:
last_modification_date = (
get_last_modified_date_from_file(file) if date_from_file_object else None
)
_, file_text = read_txt_file(file=file, encoding=encoding) _, file_text = read_txt_file(file=file, encoding=encoding)
document = HTMLDocument.from_string(file_text, assemble_articles=html_assemble_articles) document = HTMLDocument.from_string(file_text, assemble_articles=html_assemble_articles)
@ -127,25 +130,17 @@ def partition_html(
elements = list( elements = list(
apply_lang_metadata( apply_lang_metadata(
document_to_element_list( document_to_element_list(document, last_modified=last_modified(), **kwargs),
document,
sortable=False,
include_page_breaks=include_page_breaks,
last_modification_date=metadata_last_modified or last_modification_date,
source_format=source_format if source_format else None,
detection_origin=detection_origin,
**kwargs,
),
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
), )
) )
# Note(yuming): Rip off page_number metadata fields here # Note(yuming): Rip off page_number metadata fields here until we have a better way to handle
# until we have a better way to handle page counting for html files # page counting for html files.
for element in elements: for e in elements:
if hasattr(element.metadata, "page_number"): e.metadata.page_number = None
element.metadata.page_number = None
return elements return elements