feat: allow custom parsers in partition_html (#251)

This allows partition_html to use a custom XMLParser or HTMLParser, which is useful when one needs to pass additional arguments to these parsers (beyond the built-in remove_comments=True). --------- Co-authored-by: Viktor Zhemchuzhnikov <v.zhemchuzhnikov@xsolla.com>
2026-01-03 18:54:01 +00:00 · 2023-02-23 09:57:42 +08:00 · 2023-02-23 09:57:42 +08:00 · 60abac2c4b
commit 60abac2c4b
parent 1b8bf318b8
3 changed files with 11 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,6 +1,7 @@
-## 0.4.12-dev1
+## 0.4.12-dev2

 * Adds console_entrypoint for unstructured-ingest, other structure/doc updates related to ingest.
+* Add `parser` parameter to `partition_html`.

 ## 0.4.11

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.4.12"  # pragma: no cover
+__version__ = "0.4.12-dev2"  # pragma: no cover
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -4,6 +4,7 @@ import requests

 from unstructured.documents.elements import Element
 from unstructured.documents.html import HTMLDocument
+from unstructured.documents.xml import VALID_PARSERS
 from unstructured.partition.common import add_element_metadata, document_to_element_list


@ -14,6 +15,7 @@ def partition_html(
    url: Optional[str] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
+    parser: VALID_PARSERS = None,
 ) -> List[Element]:
    """Partitions an HTML document into its constituent elements.

@ -32,12 +34,14 @@ def partition_html(
    include_metadata
        Optionally allows for excluding metadata from the output. Primarily intended
        for when partition_html is called in other partition bricks (like partition_email)
+    parser
+        The parser to use for parsing the HTML document. If None, default parser will be used.
    """
    if not any([filename, file, text, url]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text and not url:
-        document = HTMLDocument.from_file(filename)
+        document = HTMLDocument.from_file(filename, parser=parser)

    elif file is not None and not filename and not text and not url:
        file_content = file.read()
@ -46,11 +50,11 @@ def partition_html(
        else:
            file_text = file_content

-        document = HTMLDocument.from_string(file_text)
+        document = HTMLDocument.from_string(file_text, parser=parser)

    elif text is not None and not filename and not file and not url:
        _text: str = str(text)
-        document = HTMLDocument.from_string(_text)
+        document = HTMLDocument.from_string(_text, parser=parser)

    elif url is not None and not filename and not file and not text:
        response = requests.get(url)
@ -61,7 +65,7 @@ def partition_html(
        if not content_type.startswith("text/html"):
            raise ValueError(f"Expected content type text/html. Got {content_type}.")

-        document = HTMLDocument.from_string(response.text)
+        document = HTMLDocument.from_string(response.text, parser=parser)

    else:
        raise ValueError("Only one of filename, file, or text can be specified.")