Steve Canny 77a9e1b54d
rfctr(html): drop convert_and_partition_html() (#3215)
**Summary**
Remove `unstructured.partition.html.convert_and_partition_html()`. Move
file-type conversion (to HTML) responsibility to each brokering
partitioner that uses that strategy and let them call `partition_html()`
for themselves with the result.

**Additional Context**

Rationale:
- `partition_html()` does not want or need to know which partitioners
might broker partitioning to it.
- Different brokering partitioners have their own methods to convert
their format to HTML and quirks that may be involved for their format.
Avoid coupling them so they can evolve independently.
- The core of the conversion work is already encapsulated in
`unstructured.partition.common.convert_file_to_html_text_using_pandoc()`.
- `convert_and_partition_html()` represents an additional brokering
layer with the entailed complexities of an additional site for default
parameter values to be (mis-)applied and/or dropped and is an additional
location for new parameters to be added.
2024-06-17 19:43:18 +00:00

68 lines
2.5 KiB
Python

from __future__ import annotations
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.html import partition_html
# Value `partition_epub()` passes to `partition_html()` as its `detection_origin` argument.
DETECTION_ORIGIN: str = "epub"
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy
def partition_epub(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],  # noqa: B006 -- copied before use, see below
    detect_language_per_element: bool = False,
    date_from_file_object: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partition an EPUB document.

    The document is first converted to HTML text using pandoc and that HTML is then partitioned
    by `partition_html()`.

    Parameters
    ----------
    filename
        A string defining the target filename path. `filename` and `file` are checked with
        `exactly_one()` before any conversion is attempted.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        Passed through unchanged to `partition_html()` as its `metadata_filename` argument.
    metadata_last_modified
        The last modified date for the document. When this is falsy, the value returned by
        `get_last_modified(filename, file, date_from_file_object)` is used instead.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
    detect_language_per_element
        Detect language per element instead of at the document level.
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True, attempt
        to infer last_modified metadata from bytes, otherwise set it to None.
    **kwargs
        Not referenced in the body of this function and not forwarded to `partition_html()`.
        Presumably consumed by the decorators (e.g. chunking options) -- TODO confirm.

    Returns
    -------
    list[Element]
        The elements produced by `partition_html()` for the converted document.
    """
    exactly_one(filename=filename, file=file)

    html_text = convert_file_to_html_text_using_pandoc(
        source_format="epub", filename=filename, file=file
    )

    # The default `["auto"]` is a single list object shared by every call. Hand
    # `partition_html()` a copy so an in-place change downstream cannot leak into later calls
    # (or into the caller's own list). Anything that is not a list -- including an explicit
    # `None` -- is forwarded untouched so downstream handling of those values is unchanged.
    languages_arg = list(languages) if isinstance(languages, list) else languages

    return partition_html(
        text=html_text,
        encoding="unicode",
        metadata_filename=metadata_filename,
        # `or` short-circuits: the file is only inspected when no explicit value was supplied.
        metadata_last_modified=(
            metadata_last_modified or get_last_modified(filename, file, date_from_file_object)
        ),
        languages=languages_arg,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
    )