diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1091737..23aae59dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.6-dev1 +## 0.14.6-dev2 ### Enhancements diff --git a/test_unstructured/partition/test_html.py b/test_unstructured/partition/test_html.py index 2e353d6f6..0ae4086fc 100644 --- a/test_unstructured/partition/test_html.py +++ b/test_unstructured/partition/test_html.py @@ -28,7 +28,13 @@ from unstructured.documents.elements import ( TableChunk, Title, ) -from unstructured.documents.html_elements import HTMLTable, TagsMixin +from unstructured.documents.html_elements import ( + HTMLListItem, + HTMLNarrativeText, + HTMLTable, + HTMLTitle, + TagsMixin, +) from unstructured.partition.html import partition_html # -- document-source (filename, file, text, url) ------------------------------------------------- @@ -171,26 +177,6 @@ def test_pre_tag_parsing_respects_order(): ] -@pytest.mark.parametrize( - ("tag", "expected_text_as_html"), - [ - ("thead", "
Header 1Header 2
"), - ("tfoot", "
Header 1Header 2
"), - ], -) -def test_partition_html_with_table_without_tbody(tag: str, expected_text_as_html: str): - elements = partition_html( - text=( - f"\n" - f" <{tag}>\n" - f" \n" - f" \n" - f"
Header 1Header 2
" - ) - ) - assert elements[0].metadata.text_as_html == expected_text_as_html - - def test_partition_html_b_tag_parsing(): elements = partition_html( text=( @@ -467,6 +453,58 @@ def test_element_ids_are_deterministic(): assert ids == ids_2 +# -- .metadata.category_depth + parent_id -------------------------------------------------------- + + +def test_partition_html_records_hierarchy_metadata(): + elements = partition_html( + text=( + "\n" + "

Preamble gets no category_depth or parent_id

\n" + "

Heading gets category_depth but no parent_id

\n" + "

Body paragraph gets parent_id but no category_depth

\n" + " \n" + "

Body paragraph after list gets parent_id but no category_depth

\n" + "\n" + ) + ) + + assert len(elements) == 6 + e = elements[0] + assert isinstance(e, HTMLNarrativeText) + assert e.text == "Preamble gets no category_depth or parent_id" + assert e.metadata.category_depth is None + assert e.metadata.parent_id is None + e = elements[1] + assert isinstance(e, HTMLTitle) + assert e.text == "Heading gets category_depth but no parent_id" + assert e.metadata.category_depth == 0 + assert e.metadata.parent_id is None + e = elements[2] + assert isinstance(e, HTMLNarrativeText) + assert e.text == "Body paragraph gets parent_id but no category_depth" + assert e.metadata.category_depth is None + assert e.metadata.parent_id == elements[1].id + e = elements[3] + assert isinstance(e, HTMLListItem) + assert e.text == "List item gets category_depth and parent_id" + assert e.metadata.category_depth == 1 + assert e.metadata.parent_id == elements[1].id + e = elements[4] + assert isinstance(e, HTMLListItem) + assert e.text == "Second list item gets category_depth and parent_id" + assert e.metadata.category_depth == 1 + assert e.metadata.parent_id == elements[1].id + e = elements[5] + assert isinstance(e, HTMLNarrativeText) + assert e.text == "Body paragraph after list gets parent_id but no category_depth" + assert e.metadata.category_depth is None + assert e.metadata.parent_id == elements[1].id + + # -- .metadata.emphasis -------------------------------------------------------------------------- @@ -509,6 +547,14 @@ def test_partition_html_grabs_emphasized_texts(): # -- .metadata.filename -------------------------------------------------------------------------- +def test_partition_html_from_filename_uses_source_filename_for_metadata_by_default(): + elements = partition_html(example_doc_path("example-10k-1p.html")) + + assert len(elements) > 0 + assert all(e.metadata.filename == "example-10k-1p.html" for e in elements) + assert all(e.metadata.file_directory == example_doc_path("") for e in elements) + + def test_partition_html_from_filename_prefers_metadata_filename(): elements = partition_html(example_doc_path("example-10k-1p.html"), metadata_filename="test") @@ -711,6 +757,75 @@ def test_partition_html_links(): assert e.metadata.link_start_indexes == [0, 12] +# -- .metadata.text_as_html ---------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("html_str", "expected_value"), + [ + ( + "
Header 1Header 2
", + "
Header 1Header 2
", + ), + ( + "" + "" + "" + "
DimensionsWeight
4'-6\" x 1'18 kg
", + # ---------- + "" + "" + "" + "
DimensionsWeight
4'-6" x 1'18 kg
", + ), + ], +) +def test_partition_html_applies_text_as_html_metadata_for_tables( + html_str: str, expected_value: str +): + elements = partition_html(text=html_str) + + assert len(elements) == 1 + assert elements[0].metadata.text_as_html == expected_value + + +@pytest.mark.parametrize( + ("tag", "expected_text_as_html"), + [ + ("thead", "
Header 1Header 2
"), + ("tfoot", "
Header 1Header 2
"), + ], +) +def test_partition_html_parses_table_without_tbody(tag: str, expected_text_as_html: str): + elements = partition_html( + text=( + f"\n" + f" <{tag}>\n" + f" \n" + f" \n" + f"
Header 1Header 2
" + ) + ) + assert elements[0].metadata.text_as_html == expected_text_as_html + + +# -- .metadata.url ------------------------------------------------------------------------------- + + +def test_partition_html_from_url_adds_url_to_metadata(requests_get_: Mock): + requests_get_.return_value = FakeResponse( + text=example_doc_text("example-10k-1p.html"), + status_code=200, + headers={"Content-Type": "text/html"}, + ) + + elements = partition_html(url="https://trusttheforceluke.com") + + requests_get_.assert_called_once_with("https://trusttheforceluke.com", headers={}, verify=True) + assert len(elements) > 0 + assert all(e.metadata.url == "https://trusttheforceluke.com" for e in elements) + + # -- miscellaneous ------------------------------------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 71babff5a..0ca39af87 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.6-dev1" # pragma: no cover +__version__ = "0.14.6-dev2" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 3d1506c90..7d300b3f6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -560,10 +560,12 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]: def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: """Post-process element-metadata for this document. - This decorator adds a post-processing step to a document partitioner. It adds documentation for - `metadata_filename` and `include_metadata` parameters if not present. Also adds regex-metadata - when `regex_metadata` keyword-argument is provided and changes the element-id to a UUID when - `unique_element_ids` argument is provided and True. + This decorator adds a post-processing step to a document partitioner. + + - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present. + - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided. + - Updates element.id to a UUID when `unique_element_ids` argument is provided and True. + """ def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index de0c8b4ff..513054d39 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -7,7 +7,7 @@ from typing import Any, Final, Iterator, Optional, cast from lxml import etree from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes -from unstructured.documents.elements import Element, ElementMetadata, Link +from unstructured.documents.elements import Element, ElementMetadata, Link, PageBreak from unstructured.documents.html_elements import ( HTMLAddress, HTMLEmailAddress, @@ -16,6 +16,7 @@ from unstructured.documents.html_elements import ( HTMLTable, HTMLText, HTMLTitle, + TagsMixin, ) from unstructured.file_utils.encoding import read_txt_file from unstructured.logger import logger @@ -60,7 +61,7 @@ class HTMLDocument: @classmethod def from_string(cls, text: str, **kwargs: Any) -> HTMLDocument: - """Supports reading in an XML file as a raw string rather than as a file.""" + """Supports reading in an HTML file as a string rather than as a file.""" logger.info("Reading document from string ...") return cls(text, **kwargs) @@ -230,6 +231,78 @@ class Page: return "\n\n".join([str(element) for element in self.elements]) +# -- TEMPORARY EXTRACTION OF document_to_element_list() ------------------------------------------ + + +def document_to_element_list( + document: HTMLDocument, + *, + include_page_breaks: bool = False, + last_modified: str | None, + starting_page_number: int = 1, + detection_origin: str | None = None, + **kwargs: Any, +) -> Iterator[Element]: + """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements.""" + + def iter_page_elements(page: Page, page_number: int | None) -> Iterator[Element]: + """Generate each element in page after applying its metadata.""" + for element in page.elements: + add_element_metadata( + element, + detection_origin=detection_origin, + last_modified=last_modified, + page_number=page_number, + **kwargs, + ) + yield element + + num_pages = len(document.pages) + for page_number, page in enumerate(document.pages, start=starting_page_number): + yield from iter_page_elements(page, page_number) + if include_page_breaks and page_number < num_pages + starting_page_number: + yield PageBreak(text="") + + +def add_element_metadata( + element: Element, + *, + detection_origin: str | None, + last_modified: str | None, + page_number: int | None, + **kwargs: Any, +) -> Element: + """Adds document metadata to the document element. + + Document metadata includes information like the filename, source url, and page number. + """ + assert isinstance(element, TagsMixin) + + emphasized_text_contents = [et.get("text") or "" for et in element.emphasized_texts] + emphasized_text_tags = [et.get("tag") or "" for et in element.emphasized_texts] + + link_urls = [link.get("url") for link in element.links] + link_texts = [link.get("text") or "" for link in element.links] + link_start_indexes = [link.get("start_index") for link in element.links] + + metadata = ElementMetadata( + emphasized_text_contents=emphasized_text_contents or None, + emphasized_text_tags=emphasized_text_tags or None, + last_modified=last_modified, + link_start_indexes=link_start_indexes or None, + link_texts=link_texts or None, + link_urls=link_urls or None, + page_number=page_number, + text_as_html=element.text_as_html, + ) + element.metadata.update(metadata) + + if detection_origin is not None: + element.metadata.detection_origin = detection_origin + + return element + + # -- tag classifiers ----------------------------------------------------------------------------- diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index be6efd451..b25fee91f 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -610,7 +610,15 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element def add_filetype( filetype: FileType, ) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: - """...""" + """Post-process element-metadata for list[Element] from partitioning. + + This decorator adds a post-processing step to a document partitioner. + + - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present. + - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided. + - Updates element.id to a UUID when `unique_element_ids` argument is provided and True. + + """ def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: @functools.wraps(func) diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 440e08af1..6fdcdfdc1 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -39,8 +39,6 @@ if TYPE_CHECKING: from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement - from unstructured.documents.html import HTMLDocument - HIERARCHY_RULE_SET = { "Title": [ "Text", @@ -542,8 +540,10 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: # FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in # unstructured.documents.html, which imports this module so we can't import the class for type # hints. Moreover, those two types of documents have different lists of attributes +# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by +# DocumentLayout only. def document_to_element_list( - document: "DocumentLayout | HTMLDocument", + document: DocumentLayout, sortable: bool = False, include_page_breaks: bool = False, last_modification_date: Optional[str] = None, @@ -555,7 +555,7 @@ def document_to_element_list( starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: - """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements.""" + """Converts a DocumentLayout object to a list of unstructured elements.""" elements: list[Element] = [] num_pages = len(document.pages) diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 5a1237345..8e8bb1ac7 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -6,13 +6,12 @@ import requests from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata -from unstructured.documents.html import HTMLDocument +from unstructured.documents.html import HTMLDocument, document_to_element_list from unstructured.documents.html_elements import TagsMixin from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.file_conversion import convert_file_to_html_text from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( - document_to_element_list, exactly_one, get_last_modified_date, get_last_modified_date_from_file, @@ -25,30 +24,27 @@ from unstructured.partition.lang import apply_lang_metadata @add_chunking_strategy def partition_html( filename: Optional[str] = None, + *, file: Optional[IO[bytes]] = None, text: Optional[str] = None, - url: Optional[str] = None, encoding: Optional[str] = None, - include_page_breaks: bool = False, - include_metadata: bool = True, + url: Optional[str] = None, headers: dict[str, str] = {}, ssl_verify: bool = True, - source_format: Optional[str] = None, + date_from_file_object: bool = False, + detect_language_per_element: bool = False, html_assemble_articles: bool = False, - metadata_filename: Optional[str] = None, + languages: Optional[list[str]] = ["auto"], metadata_last_modified: Optional[str] = None, skip_headers_and_footers: bool = False, - chunking_strategy: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - detection_origin: Optional[str] = None, - date_from_file_object: bool = False, **kwargs: Any, ) -> list[Element]: """Partitions an HTML document into its constituent elements. - Parameters - ---------- + HTML source parameters + ---------------------- + The HTML to be partitioned can be specified four different ways: + filename A string defining the target filename path. file @@ -57,25 +53,23 @@ def partition_html( The string representation of the HTML document. url The URL of a webpage to parse. Only for URLs that return an HTML document. + headers + The HTTP headers to be used in the HTTP request when `url` is specified. + ssl_verify + If the URL parameter is set, determines whether or not SSL verification is performed + on the HTTP request. + + date_from_file_object + Applies only when providing file via `file` parameter. If this option is True, attempt + infer last_modified metadata from bytes, otherwise set it to None. encoding The encoding method used to decode the text input. If None, utf-8 will be used. - include_page_breaks - If True, includes page breaks at the end of each page in the document. + + Other parameters + ---------------- include_metadata Optionally allows for excluding metadata from the output. Primarily intended - for when partition_html is called in other partition bricks (like partition_email) - headers - The headers to be used in conjunction with the HTTP request if URL is set. - ssl_verify - If the URL parameter is set, determines whether or not partition uses SSL verification - in the HTTP request. - source_format - The source of the original html. If None we will return HTMLElements but for example - partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle - metadata_last_modified - The last modified date for the document. - skip_headers_and_footers - If True, ignores any content that is within
or