Steve Canny aa7794a566
rfctr(chunking): move add_chunking_strategy() decorator up (#2265)
The chunking subpackage `unstructured.chunking` currently contains only
the `title` module and the `@add_chunking_strategy()` decorator is
located in that module even though it has no special relationship to the
`by_title` chunking strategy.

Move it to the `__init__.py` module so that it is exported from
`unstructured.chunking`. Adjust all references — roughly one per
partitioner — to import it from there.

This prepares the way for further separation of the chunking package
into modules, including a new `character` module for the `by_character`
chunking strategy.
2023-12-14 19:16:16 +00:00

68 lines
2.3 KiB
Python

from typing import IO, List, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.html import convert_and_partition_html
from unstructured.partition.lang import apply_lang_metadata
# Tag recorded in element metadata to identify which partitioner produced the element.
DETECTION_ORIGIN: str = "epub"
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy()
def partition_epub(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs,
) -> List[Element]:
    """Partition an EPUB document into a list of document elements.

    The EPUB is first converted to HTML and then partitioned by the HTML
    partitioner; language metadata is applied to the resulting elements.

    Parameters
    ----------
    filename
        Path to the target EPUB file.
    file
        A file-like object opened in binary mode --> open(filename, "rb").
    include_page_breaks
        When True, include page-break elements if the filetype supports them.
    metadata_last_modified
        Last-modified date to record in element metadata.
    languages
        User-defined value for `metadata.languages` if provided. Otherwise the
        language is detected using a naive Bayesian filter via `langdetect`.
        Multiple languages indicate the text could be in any of them.

    Additional Parameters:
        detect_language_per_element
            Detect language per element instead of at the document level.
    """
    # NOTE(review): `["auto"]` is a shared mutable default. It is never mutated
    # here, so behavior is unaffected, but a `None` sentinel would be safer.
    html_elements = convert_and_partition_html(
        filename=filename,
        file=file,
        include_page_breaks=include_page_breaks,
        metadata_filename=metadata_filename,
        metadata_last_modified=metadata_last_modified,
        source_format="epub",
        detection_origin=DETECTION_ORIGIN,
    )
    return list(
        apply_lang_metadata(
            html_elements,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )