Matt Robinson c53ce117bc
fix: enable partition_html to grab content outside of <article> tags (#772)
* optionally don't assemble articles

* add test for content outside of articles

* pass kwargs in partition

* changelog and version

* update default to False

* bump version for release

* back to dev version to get another fix in the release
2023-06-20 17:07:30 +00:00

136 lines
4.6 KiB
Python

from typing import IO, Dict, List, Optional
import requests
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument
from unstructured.documents.xml import VALID_PARSERS
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.file_conversion import convert_file_to_html_text
from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
document_to_element_list,
)
from unstructured.partition.common import (
exactly_one,
)
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
def partition_html(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
    url: Optional[str] = None,
    encoding: Optional[str] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    headers: Dict[str, str] = {},
    ssl_verify: bool = True,
    parser: VALID_PARSERS = None,
    html_assemble_articles: bool = False,
    **kwargs,
) -> List[Element]:
    """Partitions an HTML document into its constituent elements.

    Exactly one of ``filename``, ``file``, ``text`` or ``url`` selects the HTML source.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the HTML document. A text value that is empty or
        whitespace-only (with no other source given) yields an empty list.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
        Only forwarded for the ``filename`` and ``file`` sources.
    include_page_breaks
        If True, includes page breaks at the end of each page in the document.
    include_metadata
        Optionally allows for excluding metadata from the output. Primarily intended
        for when partition_html is called in other partition bricks (like partition_email).
        Not read in this function body; presumably consumed by the decorators -- confirm.
    headers
        The headers to be used in conjunction with the HTTP request if URL is set.
        The default dict is only read here, never mutated.
    ssl_verify
        If the URL parameter is set, determines whether or not partition uses SSL verification
        in the HTTP request.
    parser
        The parser to use for parsing the HTML document. If None, default parser will be used.
    html_assemble_articles
        Forwarded to HTMLDocument as ``assemble_articles`` for every source type.
        Presumably, when False, content outside of <article> tags is kept as well --
        confirm against HTMLDocument.
    **kwargs
        Accepted for call-compatibility; not used directly in this function body.

    Returns
    -------
    The list of elements produced by document_to_element_list for the parsed document.

    Raises
    ------
    ValueError
        If the URL request does not succeed, or the response Content-Type does not
        start with "text/html".
    """
    # An explicitly supplied but blank text (and nothing else) is "no content", not an error.
    if text is not None and text.strip() == "" and not file and not filename and not url:
        return []

    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file, text=text, url=url)

    if filename is not None:
        document = HTMLDocument.from_file(
            filename,
            parser=parser,
            encoding=encoding,
            assemble_articles=html_assemble_articles,
        )

    elif file is not None:
        _, file_text = read_txt_file(file=file, encoding=encoding)
        document = HTMLDocument.from_string(
            file_text,
            parser=parser,
            assemble_articles=html_assemble_articles,
        )

    elif text is not None:
        _text: str = str(text)
        document = HTMLDocument.from_string(
            _text,
            parser=parser,
            assemble_articles=html_assemble_articles,
        )

    elif url is not None:
        response = requests.get(url, headers=headers, verify=ssl_verify)
        if not response.ok:
            raise ValueError(f"URL return an error: {response.status_code}")

        content_type = response.headers.get("Content-Type", "")
        if not content_type.startswith("text/html"):
            raise ValueError(f"Expected content type text/html. Got {content_type}.")

        # Forward assemble_articles here too, so URL input honours the flag exactly
        # like the filename/file/text branches above.
        document = HTMLDocument.from_string(
            response.text,
            parser=parser,
            assemble_articles=html_assemble_articles,
        )

    return document_to_element_list(document, include_page_breaks=include_page_breaks)
def convert_and_partition_html(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
) -> List[Element]:
    """Convert a document to HTML, then partition the HTML with partition_html.

    Intended for any file format supported by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, i.e. rst
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it

    Returns
    -------
    The elements returned by partition_html for the converted HTML text.
    """
    converted_html = convert_file_to_html_text(
        source_format=source_format,
        filename=filename,
        file=file,
    )

    # NOTE(robinson) - pypandoc returns a text string with unicode encoding
    # ref: https://github.com/JessicaTegner/pypandoc#usage
    partition_options = {
        "text": converted_html,
        "include_page_breaks": include_page_breaks,
        "encoding": "unicode",
    }
    return partition_html(**partition_options)