"
+ )
+ )
+ assert elements[0].metadata.text_as_html == expected_text_as_html
+
+
+# -- .metadata.url -------------------------------------------------------------------------------
+
+
+def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock):
+ requests_get_.return_value = FakeResponse(
+ text=example_doc_text("example-10k-1p.html"),
+ status_code=200,
+ headers={"Content-Type": "text/html"},
+ )
+
+ elements = partition_html(url="https://trusttheforceluke.com")
+
+ requests_get_.assert_called_once_with("https://trusttheforceluke.com", headers={}, verify=True)
+ assert len(elements) > 0
+ assert all(e.metadata.url == "https://trusttheforceluke.com" for e in elements)
+
+
# -- miscellaneous -------------------------------------------------------------------------------
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 71babff5a..0ca39af87 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.6-dev1" # pragma: no cover
+__version__ = "0.14.6-dev2" # pragma: no cover
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index 3d1506c90..7d300b3f6 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -560,10 +560,12 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
"""Post-process element-metadata for this document.
- This decorator adds a post-processing step to a document partitioner. It adds documentation for
- `metadata_filename` and `include_metadata` parameters if not present. Also adds regex-metadata
- when `regex_metadata` keyword-argument is provided and changes the element-id to a UUID when
- `unique_element_ids` argument is provided and True.
+ This decorator adds a post-processing step to a document partitioner.
+
+ - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
+ - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
+ - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
+
"""
def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index de0c8b4ff..513054d39 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -7,7 +7,7 @@ from typing import Any, Final, Iterator, Optional, cast
from lxml import etree
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
-from unstructured.documents.elements import Element, ElementMetadata, Link
+from unstructured.documents.elements import Element, ElementMetadata, Link, PageBreak
from unstructured.documents.html_elements import (
HTMLAddress,
HTMLEmailAddress,
@@ -16,6 +16,7 @@ from unstructured.documents.html_elements import (
HTMLTable,
HTMLText,
HTMLTitle,
+ TagsMixin,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
@@ -60,7 +61,7 @@ class HTMLDocument:
@classmethod
def from_string(cls, text: str, **kwargs: Any) -> HTMLDocument:
- """Supports reading in an XML file as a raw string rather than as a file."""
+ """Supports reading in an HTML file as a string rather than as a file."""
logger.info("Reading document from string ...")
return cls(text, **kwargs)
@@ -230,6 +231,78 @@ class Page:
return "\n\n".join([str(element) for element in self.elements])
+# -- TEMPORARY EXTRACTION OF document_to_element_list() ------------------------------------------
+
+
+def document_to_element_list(
+ document: HTMLDocument,
+ *,
+ include_page_breaks: bool = False,
+ last_modified: str | None,
+ starting_page_number: int = 1,
+ detection_origin: str | None = None,
+ **kwargs: Any,
+) -> Iterator[Element]:
+ """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements."""
+
+ def iter_page_elements(page: Page, page_number: int | None) -> Iterator[Element]:
+ """Generate each element in page after applying its metadata."""
+ for element in page.elements:
+ add_element_metadata(
+ element,
+ detection_origin=detection_origin,
+ last_modified=last_modified,
+ page_number=page_number,
+ **kwargs,
+ )
+ yield element
+
+ num_pages = len(document.pages)
+ for page_number, page in enumerate(document.pages, start=starting_page_number):
+ yield from iter_page_elements(page, page_number)
+ if include_page_breaks and page_number < num_pages + starting_page_number:
+ yield PageBreak(text="")
+
+
+def add_element_metadata(
+ element: Element,
+ *,
+ detection_origin: str | None,
+ last_modified: str | None,
+ page_number: int | None,
+ **kwargs: Any,
+) -> Element:
+ """Adds document metadata to the document element.
+
+ Document metadata includes information like the filename, source url, and page number.
+ """
+ assert isinstance(element, TagsMixin)
+
+ emphasized_text_contents = [et.get("text") or "" for et in element.emphasized_texts]
+ emphasized_text_tags = [et.get("tag") or "" for et in element.emphasized_texts]
+
+ link_urls = [link.get("url") for link in element.links]
+ link_texts = [link.get("text") or "" for link in element.links]
+ link_start_indexes = [link.get("start_index") for link in element.links]
+
+ metadata = ElementMetadata(
+ emphasized_text_contents=emphasized_text_contents or None,
+ emphasized_text_tags=emphasized_text_tags or None,
+ last_modified=last_modified,
+ link_start_indexes=link_start_indexes or None,
+ link_texts=link_texts or None,
+ link_urls=link_urls or None,
+ page_number=page_number,
+ text_as_html=element.text_as_html,
+ )
+ element.metadata.update(metadata)
+
+ if detection_origin is not None:
+ element.metadata.detection_origin = detection_origin
+
+ return element
+
+
# -- tag classifiers -----------------------------------------------------------------------------
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index be6efd451..b25fee91f 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -610,7 +610,15 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element
def add_filetype(
filetype: FileType,
) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]:
- """..."""
+ """Post-process element-metadata for list[Element] from partitioning.
+
+ This decorator adds a post-processing step to a document partitioner.
+
+ - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
+ - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
+ - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
+
+ """
def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
@functools.wraps(func)
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
index 440e08af1..6fdcdfdc1 100644
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -39,8 +39,6 @@ if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement
- from unstructured.documents.html import HTMLDocument
-
HIERARCHY_RULE_SET = {
"Title": [
"Text",
@@ -542,8 +540,10 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
# unstructured.documents.html, which imports this module so we can't import the class for type
# hints. Moreover, those two types of documents have different lists of attributes
+# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by
+# DocumentLayout only.
def document_to_element_list(
- document: "DocumentLayout | HTMLDocument",
+ document: DocumentLayout,
sortable: bool = False,
include_page_breaks: bool = False,
last_modification_date: Optional[str] = None,
@@ -555,7 +555,7 @@ def document_to_element_list(
starting_page_number: int = 1,
**kwargs: Any,
) -> list[Element]:
- """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements."""
+ """Converts a DocumentLayout object to a list of unstructured elements."""
elements: list[Element] = []
num_pages = len(document.pages)
diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py
index 5a1237345..8e8bb1ac7 100644
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@@ -6,13 +6,12 @@ import requests
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
-from unstructured.documents.html import HTMLDocument
+from unstructured.documents.html import HTMLDocument, document_to_element_list
from unstructured.documents.html_elements import TagsMixin
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.file_conversion import convert_file_to_html_text
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
- document_to_element_list,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
@@ -25,30 +24,27 @@ from unstructured.partition.lang import apply_lang_metadata
@add_chunking_strategy
def partition_html(
filename: Optional[str] = None,
+ *,
file: Optional[IO[bytes]] = None,
text: Optional[str] = None,
- url: Optional[str] = None,
encoding: Optional[str] = None,
- include_page_breaks: bool = False,
- include_metadata: bool = True,
+ url: Optional[str] = None,
headers: dict[str, str] = {},
ssl_verify: bool = True,
- source_format: Optional[str] = None,
+ date_from_file_object: bool = False,
+ detect_language_per_element: bool = False,
html_assemble_articles: bool = False,
- metadata_filename: Optional[str] = None,
+ languages: Optional[list[str]] = ["auto"],
metadata_last_modified: Optional[str] = None,
skip_headers_and_footers: bool = False,
- chunking_strategy: Optional[str] = None,
- languages: Optional[list[str]] = ["auto"],
- detect_language_per_element: bool = False,
- detection_origin: Optional[str] = None,
- date_from_file_object: bool = False,
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
- Parameters
- ----------
+ HTML source parameters
+ ----------------------
+ The HTML to be partitioned can be specified four different ways:
+
filename
A string defining the target filename path.
file
@@ -57,25 +53,23 @@ def partition_html(
The string representation of the HTML document.
url
The URL of a webpage to parse. Only for URLs that return an HTML document.
+ headers
+ The HTTP headers to be used in the HTTP request when `url` is specified.
+ ssl_verify
+ If the URL parameter is set, determines whether or not SSL verification is performed
+ on the HTTP request.
+
+ date_from_file_object
+ Applies only when providing file via `file` parameter. If this option is True, attempt
+ infer last_modified metadata from bytes, otherwise set it to None.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
- include_page_breaks
- If True, includes page breaks at the end of each page in the document.
+
+ Other parameters
+ ----------------
include_metadata
Optionally allows for excluding metadata from the output. Primarily intended
- for when partition_html is called in other partition bricks (like partition_email)
- headers
- The headers to be used in conjunction with the HTTP request if URL is set.
- ssl_verify
- If the URL parameter is set, determines whether or not partition uses SSL verification
- in the HTTP request.
- source_format
- The source of the original html. If None we will return HTMLElements but for example
- partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle
- metadata_last_modified
- The last modified date for the document.
- skip_headers_and_footers
- If True, ignores any content that is within or