From 2f0400f2790b0e13a719b5bd4c441acab725230d Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 11 Jun 2024 13:54:11 -0700 Subject: [PATCH] rfctr(html): break coupling to DocumentLayout (#3180) **Summary** Remove use of `partition.common.document_to_element_list()` by `HTMLDocument`. The transitive coupling with layout-inference through this shared function have been the source of frustration and a drain on engineering time and there's no compelling reason for the two to share this code. **Additional Context** `partition_html()` uses `partition.common.document_to_element_list()` to get finalized elements from `HTMLDocument` (pages). This gives rise to a very nasty coupling between `DocumentLayout`, used by `unstructured_inference`, and `HTMLDocument`. `document_to_element_list()` has evolved to work for both callers, but they share very few common characteristics with each other. This coupling is bad news for us and also, importantly, for the inference and page layout folks working on PDF and images. Break that coupling so those inference-related functions can evolve whatever way they need to without being dragged down by legacy `HTMLDocument` connections. The initial step is to extract a `document_to_element_list()` function of our own, getting rid of the coordinates and other `DocumentLayout`-related bits we don't need. As you'll see in the next few PRs, all of this `document_to_element_list()` code will end up either going away or being relocated closer to where it's used in `HTMLDocument`. --- CHANGELOG.md | 2 +- test_unstructured/partition/test_html.py | 157 ++++++++++++++++++++--- unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 10 +- unstructured/documents/html.py | 77 ++++++++++- unstructured/file_utils/filetype.py | 10 +- unstructured/partition/common.py | 8 +- unstructured/partition/html.py | 99 +++++++------- 8 files changed, 279 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1091737..23aae59dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.6-dev1 +## 0.14.6-dev2 ### Enhancements diff --git a/test_unstructured/partition/test_html.py b/test_unstructured/partition/test_html.py index 2e353d6f6..0ae4086fc 100644 --- a/test_unstructured/partition/test_html.py +++ b/test_unstructured/partition/test_html.py @@ -28,7 +28,13 @@ from unstructured.documents.elements import ( TableChunk, Title, ) -from unstructured.documents.html_elements import HTMLTable, TagsMixin +from unstructured.documents.html_elements import ( + HTMLListItem, + HTMLNarrativeText, + HTMLTable, + HTMLTitle, + TagsMixin, +) from unstructured.partition.html import partition_html # -- document-source (filename, file, text, url) ------------------------------------------------- @@ -171,26 +177,6 @@ def test_pre_tag_parsing_respects_order(): ] -@pytest.mark.parametrize( - ("tag", "expected_text_as_html"), - [ - ("thead", "
Header 1Header 2
"), - ("tfoot", "
Header 1Header 2
"), - ], -) -def test_partition_html_with_table_without_tbody(tag: str, expected_text_as_html: str): - elements = partition_html( - text=( - f"\n" - f" <{tag}>\n" - f" \n" - f" \n" - f"
Header 1Header 2
" - ) - ) - assert elements[0].metadata.text_as_html == expected_text_as_html - - def test_partition_html_b_tag_parsing(): elements = partition_html( text=( @@ -467,6 +453,58 @@ def test_element_ids_are_deterministic(): assert ids == ids_2 +# -- .metadata.category_depth + parent_id -------------------------------------------------------- + + +def test_partition_html_records_hierarchy_metadata(): + elements = partition_html( + text=( + "\n" + "

Preamble gets no category_depth or parent_id

\n" + "

Heading gets category_depth but no parent_id

\n" + "

Body paragraph gets parent_id but no category_depth

\n" + " \n" + "

Body paragraph after list gets parent_id but no category_depth

\n" + "\n" + ) + ) + + assert len(elements) == 6 + e = elements[0] + assert isinstance(e, HTMLNarrativeText) + assert e.text == "Preamble gets no category_depth or parent_id" + assert e.metadata.category_depth is None + assert e.metadata.parent_id is None + e = elements[1] + assert isinstance(e, HTMLTitle) + assert e.text == "Heading gets category_depth but no parent_id" + assert e.metadata.category_depth == 0 + assert e.metadata.parent_id is None + e = elements[2] + assert isinstance(e, HTMLNarrativeText) + assert e.text == "Body paragraph gets parent_id but no category_depth" + assert e.metadata.category_depth is None + assert e.metadata.parent_id == elements[1].id + e = elements[3] + assert isinstance(e, HTMLListItem) + assert e.text == "List item gets category_depth and parent_id" + assert e.metadata.category_depth == 1 + assert e.metadata.parent_id == elements[1].id + e = elements[4] + assert isinstance(e, HTMLListItem) + assert e.text == "Second list item gets category_depth and parent_id" + assert e.metadata.category_depth == 1 + assert e.metadata.parent_id == elements[1].id + e = elements[5] + assert isinstance(e, HTMLNarrativeText) + assert e.text == "Body paragraph after list gets parent_id but no category_depth" + assert e.metadata.category_depth is None + assert e.metadata.parent_id == elements[1].id + + # -- .metadata.emphasis -------------------------------------------------------------------------- @@ -509,6 +547,14 @@ def test_partition_html_grabs_emphasized_texts(): # -- .metadata.filename -------------------------------------------------------------------------- +def test_partition_html_from_filename_uses_source_filename_for_metadata_by_default(): + elements = partition_html(example_doc_path("example-10k-1p.html")) + + assert len(elements) > 0 + assert all(e.metadata.filename == "example-10k-1p.html" for e in elements) + assert all(e.metadata.file_directory == example_doc_path("") for e in elements) + + def test_partition_html_from_filename_prefers_metadata_filename(): elements = partition_html(example_doc_path("example-10k-1p.html"), metadata_filename="test") @@ -711,6 +757,75 @@ def test_partition_html_links(): assert e.metadata.link_start_indexes == [0, 12] +# -- .metadata.text_as_html ---------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("html_str", "expected_value"), + [ + ( + "
Header 1Header 2
", + "
Header 1Header 2
", + ), + ( + "" + "" + "" + "
DimensionsWeight
4'-6\" x 1'18 kg
", + # ---------- + "" + "" + "" + "
DimensionsWeight
4'-6" x 1'18 kg
", + ), + ], +) +def test_partition_html_applies_text_as_html_metadata_for_tables( + html_str: str, expected_value: str +): + elements = partition_html(text=html_str) + + assert len(elements) == 1 + assert elements[0].metadata.text_as_html == expected_value + + +@pytest.mark.parametrize( + ("tag", "expected_text_as_html"), + [ + ("thead", "
Header 1Header 2
"), + ("tfoot", "
Header 1Header 2
"), + ], +) +def test_partition_html_parses_table_without_tbody(tag: str, expected_text_as_html: str): + elements = partition_html( + text=( + f"\n" + f" <{tag}>\n" + f" \n" + f" \n" + f"
Header 1Header 2
" + ) + ) + assert elements[0].metadata.text_as_html == expected_text_as_html + + +# -- .metadata.url ------------------------------------------------------------------------------- + + +def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock): + requests_get_.return_value = FakeResponse( + text=example_doc_text("example-10k-1p.html"), + status_code=200, + headers={"Content-Type": "text/html"}, + ) + + elements = partition_html(url="https://trusttheforceluke.com") + + requests_get_.assert_called_once_with("https://trusttheforceluke.com", headers={}, verify=True) + assert len(elements) > 0 + assert all(e.metadata.url == "https://trusttheforceluke.com" for e in elements) + + # -- miscellaneous ------------------------------------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 71babff5a..0ca39af87 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.6-dev1" # pragma: no cover +__version__ = "0.14.6-dev2" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 3d1506c90..7d300b3f6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -560,10 +560,12 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]: def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: """Post-process element-metadata for this document. - This decorator adds a post-processing step to a document partitioner. It adds documentation for - `metadata_filename` and `include_metadata` parameters if not present. Also adds regex-metadata - when `regex_metadata` keyword-argument is provided and changes the element-id to a UUID when - `unique_element_ids` argument is provided and True. + This decorator adds a post-processing step to a document partitioner. + + - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present. + - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided. + - Updates element.id to a UUID when `unique_element_ids` argument is provided and True. + """ def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index de0c8b4ff..513054d39 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -7,7 +7,7 @@ from typing import Any, Final, Iterator, Optional, cast from lxml import etree from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes -from unstructured.documents.elements import Element, ElementMetadata, Link +from unstructured.documents.elements import Element, ElementMetadata, Link, PageBreak from unstructured.documents.html_elements import ( HTMLAddress, HTMLEmailAddress, @@ -16,6 +16,7 @@ from unstructured.documents.html_elements import ( HTMLTable, HTMLText, HTMLTitle, + TagsMixin, ) from unstructured.file_utils.encoding import read_txt_file from unstructured.logger import logger @@ -60,7 +61,7 @@ class HTMLDocument: @classmethod def from_string(cls, text: str, **kwargs: Any) -> HTMLDocument: - """Supports reading in an XML file as a raw string rather than as a file.""" + """Supports reading in an HTML file as a string rather than as a file.""" logger.info("Reading document from string ...") return cls(text, **kwargs) @@ -230,6 +231,78 @@ class Page: return "\n\n".join([str(element) for element in self.elements]) +# -- TEMPORARY EXTRACTION OF document_to_element_list() ------------------------------------------ + + +def document_to_element_list( + document: HTMLDocument, + *, + include_page_breaks: bool = False, + last_modified: str | None, + starting_page_number: int = 1, + detection_origin: str | None = None, + **kwargs: Any, +) -> Iterator[Element]: + """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements.""" + + def iter_page_elements(page: Page, page_number: int | None) -> Iterator[Element]: + """Generate each element in page after applying its metadata.""" + for element in page.elements: + add_element_metadata( + element, + detection_origin=detection_origin, + last_modified=last_modified, + page_number=page_number, + **kwargs, + ) + yield element + + num_pages = len(document.pages) + for page_number, page in enumerate(document.pages, start=starting_page_number): + yield from iter_page_elements(page, page_number) + if include_page_breaks and page_number < num_pages + starting_page_number: + yield PageBreak(text="") + + +def add_element_metadata( + element: Element, + *, + detection_origin: str | None, + last_modified: str | None, + page_number: int | None, + **kwargs: Any, +) -> Element: + """Adds document metadata to the document element. + + Document metadata includes information like the filename, source url, and page number. + """ + assert isinstance(element, TagsMixin) + + emphasized_text_contents = [et.get("text") or "" for et in element.emphasized_texts] + emphasized_text_tags = [et.get("tag") or "" for et in element.emphasized_texts] + + link_urls = [link.get("url") for link in element.links] + link_texts = [link.get("text") or "" for link in element.links] + link_start_indexes = [link.get("start_index") for link in element.links] + + metadata = ElementMetadata( + emphasized_text_contents=emphasized_text_contents or None, + emphasized_text_tags=emphasized_text_tags or None, + last_modified=last_modified, + link_start_indexes=link_start_indexes or None, + link_texts=link_texts or None, + link_urls=link_urls or None, + page_number=page_number, + text_as_html=element.text_as_html, + ) + element.metadata.update(metadata) + + if detection_origin is not None: + element.metadata.detection_origin = detection_origin + + return element + + # -- tag classifiers ----------------------------------------------------------------------------- diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index be6efd451..b25fee91f 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -610,7 +610,15 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element def add_filetype( filetype: FileType, ) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: - """...""" + """Post-process element-metadata for list[Element] from partitioning. + + This decorator adds a post-processing step to a document partitioner. + + - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present. + - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided. + - Updates element.id to a UUID when `unique_element_ids` argument is provided and True. + + """ def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: @functools.wraps(func) diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 440e08af1..6fdcdfdc1 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -39,8 +39,6 @@ if TYPE_CHECKING: from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement - from unstructured.documents.html import HTMLDocument - HIERARCHY_RULE_SET = { "Title": [ "Text", @@ -542,8 +540,10 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: # FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in # unstructured.documents.html, which imports this module so we can't import the class for type # hints. Moreover, those two types of documents have different lists of attributes +# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by +# DocumentLayout only. def document_to_element_list( - document: "DocumentLayout | HTMLDocument", + document: DocumentLayout, sortable: bool = False, include_page_breaks: bool = False, last_modification_date: Optional[str] = None, @@ -555,7 +555,7 @@ def document_to_element_list( starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: - """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements.""" + """Converts a DocumentLayout object to a list of unstructured elements.""" elements: list[Element] = [] num_pages = len(document.pages) diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 5a1237345..8e8bb1ac7 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -6,13 +6,12 @@ import requests from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata -from unstructured.documents.html import HTMLDocument +from unstructured.documents.html import HTMLDocument, document_to_element_list from unstructured.documents.html_elements import TagsMixin from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.file_conversion import convert_file_to_html_text from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( - document_to_element_list, exactly_one, get_last_modified_date, get_last_modified_date_from_file, @@ -25,30 +24,27 @@ from unstructured.partition.lang import apply_lang_metadata @add_chunking_strategy def partition_html( filename: Optional[str] = None, + *, file: Optional[IO[bytes]] = None, text: Optional[str] = None, - url: Optional[str] = None, encoding: Optional[str] = None, - include_page_breaks: bool = False, - include_metadata: bool = True, + url: Optional[str] = None, headers: dict[str, str] = {}, ssl_verify: bool = True, - source_format: Optional[str] = None, + date_from_file_object: bool = False, + detect_language_per_element: bool = False, html_assemble_articles: bool = False, - metadata_filename: Optional[str] = None, + languages: Optional[list[str]] = ["auto"], metadata_last_modified: Optional[str] = None, skip_headers_and_footers: bool = False, - chunking_strategy: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - detection_origin: Optional[str] = None, - date_from_file_object: bool = False, **kwargs: Any, ) -> list[Element]: """Partitions an HTML document into its constituent elements. - Parameters - ---------- + HTML source parameters + ---------------------- + The HTML to be partitioned can be specified four different ways: + filename A string defining the target filename path. file @@ -57,25 +53,23 @@ def partition_html( The string representation of the HTML document. url The URL of a webpage to parse. Only for URLs that return an HTML document. + headers + The HTTP headers to be used in the HTTP request when `url` is specified. + ssl_verify + If the URL parameter is set, determines whether or not SSL verification is performed + on the HTTP request. + + date_from_file_object + Applies only when providing file via `file` parameter. If this option is True, attempt + infer last_modified metadata from bytes, otherwise set it to None. encoding The encoding method used to decode the text input. If None, utf-8 will be used. - include_page_breaks - If True, includes page breaks at the end of each page in the document. + + Other parameters + ---------------- include_metadata Optionally allows for excluding metadata from the output. Primarily intended - for when partition_html is called in other partition bricks (like partition_email) - headers - The headers to be used in conjunction with the HTTP request if URL is set. - ssl_verify - If the URL parameter is set, determines whether or not partition uses SSL verification - in the HTTP request. - source_format - The source of the original html. If None we will return HTMLElements but for example - partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle - metadata_last_modified - The last modified date for the document. - skip_headers_and_footers - If True, ignores any content that is within
or