mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-21 20:25:15 +00:00
The chunking subpackage `unstructured.chunking` currently contains only the `title` module, and the `@add_chunking_strategy()` decorator is located in that module even though it has no special relationship to the `by_title` chunking strategy. Move the decorator to the `__init__.py` module so that it is exported from `unstructured.chunking`. Adjust all references (roughly one per partitioner) to import it from there. This prepares the way for further separation of the chunking package into modules, including a new `character` module for the `by_character` chunking strategy.
68 lines
2.3 KiB
Python
68 lines
2.3 KiB
Python
from typing import IO, List, Optional
|
|
|
|
from unstructured.chunking import add_chunking_strategy
|
|
from unstructured.documents.elements import Element, process_metadata
|
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
|
from unstructured.partition.html import convert_and_partition_html
|
|
from unstructured.partition.lang import apply_lang_metadata
|
|
|
|
# Value that `partition_epub` passes as the `detection_origin` argument of
# `convert_and_partition_html`.
DETECTION_ORIGIN: str = "epub"
|
|
|
|
|
|
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy()
def partition_epub(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs,
) -> List[Element]:
    """Partition an EPUB document into a list of elements.

    The work is delegated to `convert_and_partition_html` (called with
    `source_format="epub"`), after which language metadata is applied to the
    resulting elements via `apply_lang_metadata`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it.
    include_metadata
        Not read by this function body; presumably consumed by the decorators
        wrapping this function -- confirm against their implementation.
    metadata_filename
        Forwarded unchanged to `convert_and_partition_html`.
    metadata_last_modified
        The last modified date for the document.
    chunking_strategy
        Not read by this function body; presumably consumed by
        `@add_chunking_strategy()` -- confirm against its implementation.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is
        detected using naive Bayesian filter via `langdetect`. Multiple languages
        indicates text could be in either language.
        NOTE(review): the default is a single list object shared across calls; this
        function only forwards it and never mutates it.
    detect_language_per_element
        Detect language per element instead of at the document level.
    **kwargs
        Accepted but not used by this function body.

    Returns
    -------
    List[Element]
        The elements produced by the HTML partitioning step, after
        `apply_lang_metadata` has been applied to them.
    """
    # Step 1: hand the document to the HTML-based partitioner.
    html_elements = convert_and_partition_html(
        filename=filename,
        file=file,
        include_page_breaks=include_page_breaks,
        metadata_filename=metadata_filename,
        metadata_last_modified=metadata_last_modified,
        source_format="epub",
        detection_origin=DETECTION_ORIGIN,
    )

    # Step 2: apply language metadata. The result is materialized with
    # `list(...)` so callers receive a concrete list, as declared.
    language_tagged = apply_lang_metadata(
        html_elements,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
    )
    return list(language_tagged)
|