feat: allow custom parsers in partition_html (#251)

This allows partition_html to use a custom XMLParser or HTMLParser.
This is useful when one needs to pass additional arguments to these parsers, beyond the built-in remove_comments=True.
---------

Co-authored-by: Viktor Zhemchuzhnikov <v.zhemchuzhnikov@xsolla.com>
This commit is contained in:
Viktor Zhemchuzhnikov 2023-02-23 09:57:42 +08:00 committed by GitHub
parent 1b8bf318b8
commit 60abac2c4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 11 additions and 6 deletions

View File

@ -1,6 +1,7 @@
## 0.4.12-dev1
## 0.4.12-dev2
* Adds console_entrypoint for unstructured-ingest, other structure/doc updates related to ingest.
* Add `parser` parameter to `partition_html`.
## 0.4.11

View File

@ -1 +1 @@
__version__ = "0.4.12" # pragma: no cover
__version__ = "0.4.12-dev2" # pragma: no cover

View File

@ -4,6 +4,7 @@ import requests
from unstructured.documents.elements import Element
from unstructured.documents.html import HTMLDocument
from unstructured.documents.xml import VALID_PARSERS
from unstructured.partition.common import add_element_metadata, document_to_element_list
@ -14,6 +15,7 @@ def partition_html(
url: Optional[str] = None,
include_page_breaks: bool = False,
include_metadata: bool = True,
parser: VALID_PARSERS = None,
) -> List[Element]:
"""Partitions an HTML document into its constituent elements.
@ -32,12 +34,14 @@ def partition_html(
include_metadata
Optionally allows for excluding metadata from the output. Primarily intended
for when partition_html is called in other partition bricks (like partition_email)
parser
The parser to use for parsing the HTML document. If None, default parser will be used.
"""
if not any([filename, file, text, url]):
raise ValueError("One of filename, file, or text must be specified.")
if filename is not None and not file and not text and not url:
document = HTMLDocument.from_file(filename)
document = HTMLDocument.from_file(filename, parser=parser)
elif file is not None and not filename and not text and not url:
file_content = file.read()
@ -46,11 +50,11 @@ def partition_html(
else:
file_text = file_content
document = HTMLDocument.from_string(file_text)
document = HTMLDocument.from_string(file_text, parser=parser)
elif text is not None and not filename and not file and not url:
_text: str = str(text)
document = HTMLDocument.from_string(_text)
document = HTMLDocument.from_string(_text, parser=parser)
elif url is not None and not filename and not file and not text:
response = requests.get(url)
@ -61,7 +65,7 @@ def partition_html(
if not content_type.startswith("text/html"):
raise ValueError(f"Expected content type text/html. Got {content_type}.")
document = HTMLDocument.from_string(response.text)
document = HTMLDocument.from_string(response.text, parser=parser)
else:
raise ValueError("Only one of filename, file, or text can be specified.")