mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-14 00:29:47 +00:00
**Summary**

Remove `unstructured.partition.html.convert_and_partition_html()`. Move file-type conversion (to HTML) responsibility to each brokering partitioner that uses that strategy and let them call `partition_html()` for themselves with the result.

**Additional Context**

Rationale:

- `partition_html()` does not want or need to know which partitioners might broker partitioning to it.
- Different brokering partitioners have their own methods to convert their format to HTML and quirks that may be involved for their format. Avoid coupling them so they can evolve independently.
- The core of the conversion work is already encapsulated in `unstructured.partition.common.convert_file_to_html_text_using_pandoc()`.
- `convert_and_partition_html()` represents an additional brokering layer, with the entailed complexities of an additional site for default parameter values to be (mis-)applied and/or dropped, and is an additional location for new parameters to be added.
68 lines
2.5 KiB
Python
68 lines
2.5 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import IO, Any, Optional
|
|
|
|
from unstructured.chunking import add_chunking_strategy
|
|
from unstructured.documents.elements import Element, process_metadata
|
|
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
|
from unstructured.partition.common import exactly_one, get_last_modified
|
|
from unstructured.partition.html import partition_html
|
|
|
|
# Passed as the `detection_origin` argument to `partition_html()` by `partition_epub()`.
DETECTION_ORIGIN: str = "epub"
|
|
|
|
|
|
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy
def partition_epub(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    date_from_file_object: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partition an EPUB document into elements by way of HTML.

    The EPUB source is converted to HTML text with
    `convert_file_to_html_text_using_pandoc()` and that text is then handed to
    `partition_html()`, whose result is returned.

    Parameters
    ----------
    filename
        Path of the EPUB file to partition. Checked together with `file` by `exactly_one()`.
    file
        A file-like object opened in "rb" mode --> open(filename, "rb").
    metadata_filename
        Forwarded unchanged to `partition_html()` as `metadata_filename`.
    metadata_last_modified
        The last modified date for the document. When falsy, the value returned by
        `get_last_modified()` is used instead.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.

    Additional Parameters
    ---------------------
    detect_language_per_element
        Detect language per element instead of at the document level.
    date_from_file_object
        Applies only when the document is provided via the `file` parameter. If True, attempt
        to infer last_modified metadata from bytes, otherwise set it to None.
    **kwargs
        Accepted but not forwarded to `partition_html()` by this function body.
        NOTE(review): presumably consumed by the decorators above -- confirm.
    """
    exactly_one(filename=filename, file=file)

    # -- step 1: render the EPUB source as HTML text --
    epub_as_html = convert_file_to_html_text_using_pandoc(
        source_format="epub", filename=filename, file=file
    )

    # -- step 2: a caller-supplied timestamp takes precedence; fall back to the source's own --
    if metadata_last_modified:
        last_modified = metadata_last_modified
    else:
        last_modified = get_last_modified(filename, file, date_from_file_object)

    # -- step 3: delegate the actual partitioning to the HTML partitioner --
    # NOTE(review): the default for `languages` is a single shared list object; it is only
    # forwarded here, never mutated in this function.
    elements = partition_html(
        text=epub_as_html,
        encoding="unicode",
        metadata_filename=metadata_filename,
        metadata_last_modified=last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
    )
    return elements
|