mirror of https://github.com/Unstructured-IO/unstructured.git — synced 2025-06-27 02:30:08 +00:00
rfctr: prep for pluggable partitioners (#3806)
**Summary** Prepare auto-partitioning for pluggable partitioners. Move toward a uniform partitioner call signature in `auto/partition()` so that a custom or override partitioner can be registered without requiring code changes.

**Additional Context** The central job of `auto/partition()` is to detect the file-type of the given file and use that to dispatch partitioning to the corresponding partitioner function, e.g. `partition_pdf()` or `partition_docx()`. In the existing code, each partitioner function is called with parameters "hand-picked" from the available parameters passed to the `partition()` function. This is unnecessary and couples those partitioners tightly to the dispatch function. The desired state is that all available arguments are passed as `kwargs` and each partitioner function "self-selects" the arguments it is sensitive to, applies its own appropriate default values when an argument is omitted, and simply ignores any arguments it doesn't use. Note that achieving this requires no changes to the partitioner functions because they already do precisely this. So the job is to pass all arguments (other than `filename` and `file`) to the partitioner as `kwargs`. This allows additional or alternate partitioners to be registered at runtime and dispatched to: as long as a partitioner has the signature `partition_x(filename, file, **kwargs) -> list[Element]`, it can be dispatched to without customization.
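To make the target call pattern concrete, here is a minimal, hypothetical sketch of the kind of dispatch this refactor enables. It is not the library's actual API: the `register_partitioner` helper, the `_REGISTRY` dict, and the `"FAKE"` file type are invented for illustration, and the stand-in `Element` alias replaces the real element type. Only the uniform `partition_x(filename=..., file=..., **kwargs)` calling convention reflects the change described above.

```python
from typing import IO, Any, Callable, Optional

# Stand-in for unstructured's Element type (illustrative only).
Element = Any
Partitioner = Callable[..., list[Element]]

# Hypothetical registry keyed by file type; the real code caches partitioners in
# _PartitionerLoader and does not yet expose a public registration hook.
_REGISTRY: dict[str, Partitioner] = {}


def register_partitioner(file_type: str, partitioner: Partitioner) -> None:
    """Register a custom or override partitioner for `file_type` (illustrative only)."""
    _REGISTRY[file_type] = partitioner


def dispatch(
    file_type: str,
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    **kwargs: Any,
) -> list[Element]:
    """Dispatch with the uniform signature: everything except filename/file rides in kwargs."""
    partitioner = _REGISTRY[file_type]
    return partitioner(filename=filename, file=file, **kwargs)


def partition_fake(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    *,
    encoding: Optional[str] = None,  # self-selected; default applied here, not by the dispatcher
    **kwargs: Any,  # arguments this partitioner doesn't care about are simply ignored
) -> list[Element]:
    return []


register_partitioner("FAKE", partition_fake)
elements = dispatch("FAKE", filename="example.fake", strategy="fast", languages=None)
```

Because every partitioner already ignores unrecognized keyword arguments, the dispatcher never needs to know which parameters a given partitioner cares about.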
This commit is contained in:
parent b981d7197f
commit 3b718ec89a
12 CHANGELOG.md
@@ -1,8 +1,14 @@
## 0.16.11
## 0.16.12-dev0

### Enhancements

- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.

### Features

### Fixes

- Fix ipv4 regex to correctly include up to three digit octets.
## 0.16.11

### Enhancements

@@ -14,6 +20,8 @@

### Fixes

- Fix ipv4 regex to correctly include up to three digit octets.

## 0.16.10

### Enhancements
@@ -29,6 +29,7 @@ from unstructured.staging.base import elements_to_json
("Title", 0): 4,
("Title", 1): 1,
("NarrativeText", 0): 3,
("PageBreak", None): 3,
("ListItem", 0): 6,
("ListItem", 1): 6,
("ListItem", 2): 3,
@@ -1232,17 +1232,6 @@ class DescribeHtmlPartitionerOptions:

assert opts.detection_origin == detection_origin

# -- .encoding -------------------------------

@pytest.mark.parametrize("encoding", ["utf-8", None])
def it_knows_the_caller_provided_encoding(
self, encoding: str | None, opts_args: dict[str, Any]
):
opts_args["encoding"] = encoding
opts = HtmlPartitionerOptions(**opts_args)

assert opts.encoding == encoding

# -- .html_text ------------------------------

def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]):
@@ -2,7 +2,6 @@

from __future__ import annotations

import io
import json
import os
import pathlib
@@ -561,7 +560,6 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
strategy=PartitionStrategy.FAST,
languages=None,
metadata_filename=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=False,
extract_image_block_types=None,
@@ -897,7 +895,7 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest):

with pytest.raises(
UnsupportedFileFormatError,
match="Invalid file made-up.fake. The FileType.UNK file type is not supported in partiti",
match="Partitioning is not supported for the FileType.UNK file type.",
):
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
@@ -1037,26 +1035,6 @@ def test_auto_partition_forwards_metadata_filename_via_kwargs():
assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements)


def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
file_path = example_doc_path("fake-text.txt")

with open(file_path, "rb") as f:
elements = partition(file=f, file_filename=file_path)

assert all(e.metadata.filename == "fake-text.txt" for e in elements)
assert caplog.records[0].levelname == "WARNING"
assert "The file_filename kwarg will be deprecated" in caplog.text


def test_auto_partition_raises_when_both_file_filename_and_metadata_filename_args_are_used():
file_path = example_doc_path("fake-text.txt")
with open(file_path, "rb") as f:
file = io.BytesIO(f.read())

with pytest.raises(ValueError, match="Only one of metadata_filename and file_filename is spe"):
partition(file=file, file_filename=file_path, metadata_filename=file_path)


# -- ocr_languages --------------------------------------------------------
@@ -1 +1 @@
__version__ = "0.16.11" # pragma: no cover
__version__ = "0.16.12-dev0" # pragma: no cover
@@ -2,9 +2,10 @@

from __future__ import annotations

import copy
import importlib
import io
from typing import IO, Any, Callable, Literal, Optional
from typing import IO, Any, Callable, Optional

import requests
from typing_extensions import TypeAlias
@@ -25,17 +26,15 @@ Partitioner: TypeAlias = Callable[..., list[Element]]
def partition(
filename: Optional[str] = None,
*,
content_type: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file_filename: Optional[str] = None,
url: Optional[str] = None,
include_page_breaks: bool = False,
strategy: str = PartitionStrategy.AUTO,
encoding: Optional[str] = None,
paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None,
content_type: Optional[str] = None,
url: Optional[str] = None,
headers: dict[str, str] = {},
skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
ssl_verify: bool = True,
request_timeout: Optional[int] = None,
strategy: str = PartitionStrategy.AUTO,
skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
ocr_languages: Optional[str] = None, # changing to optional for deprecation
languages: Optional[list[str]] = None,
detect_language_per_element: bool = False,
@@ -44,15 +43,13 @@ def partition(
extract_image_block_types: Optional[list[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
request_timeout: Optional[int] = None,
hi_res_model_name: Optional[str] = None,
model_name: Optional[str] = None, # to be deprecated
starting_page_number: int = 1,
**kwargs: Any,
):
) -> list[Element]:
"""Partitions a document into its constituent elements.

Uses libmagic to determine the file's type and route it to the appropriate partitioning
@@ -63,30 +60,32 @@ def partition(
----------
filename
A string defining the target filename path.
content_type
A string defining the file content in MIME type
file
A file-like object using "rb" mode --> open(filename, "rb").
metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
encoding
The character-encoding used to decode the input bytes when drawn from `filename` or `file`.
Defaults to "utf-8".
url
The url for a remote document. Pass in content_type if you want partition to treat
the document as a specific content_type.
include_page_breaks
If True, the output will include page breaks if the filetype supports it
headers
The headers to be used in conjunction with the HTTP request if URL is set.
ssl_verify
If the URL parameter is set, determines whether or not partition uses SSL verification
in the HTTP request.
request_timeout
The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
requests will block indefinitely.
content_type
A string defining the file content in MIME type
metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
strategy
The strategy to use for partitioning PDF/image. Uses a layout detection model if set
to 'hi_res', otherwise partition simply extracts the text from the document
and processes it.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
headers
The headers to be used in conjunction with the HTTP request if URL is set.
skip_infer_table_types
The document types that you want to skip table extraction with.
ssl_verify
If the URL parameter is set, determines whether or not partition uses SSL verification
in the HTTP request.
languages
The languages present in the document, for use in partitioning and/or OCR. For partitioning
image or pdf documents with Tesseract, you'll first need to install the appropriate
@@ -124,12 +123,6 @@ def partition(
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_image_block_types'.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
request_timeout
The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
requests will block indefinitely.
hi_res_model_name
The layout detection model used when partitioning strategy is set to `hi_res`.
model_name
@@ -142,18 +135,6 @@ def partition(
"""
exactly_one(file=file, filename=filename, url=url)

if metadata_filename and file_filename:
raise ValueError(
"Only one of metadata_filename and file_filename is specified. "
"metadata_filename is preferred. file_filename is marked for deprecation.",
)

if file_filename is not None:
metadata_filename = file_filename
logger.warning(
"The file_filename kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filename instead.",
)
kwargs.setdefault("metadata_filename", metadata_filename)

if pdf_infer_table_structure:
@@ -197,80 +178,28 @@ def partition(

partitioner_loader = _PartitionerLoader()

if file_type == FileType.CSV:
partition_csv = partitioner_loader.get(file_type)
elements = partition_csv(
filename=filename,
file=file,
encoding=encoding,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.DOC:
partition_doc = partitioner_loader.get(file_type)
elements = partition_doc(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.DOCX:
partition_docx = partitioner_loader.get(file_type)
elements = partition_docx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.EML:
partition_email = partitioner_loader.get(file_type)
elements = partition_email(
filename=filename,
file=file,
encoding=encoding,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.EPUB:
partition_epub = partitioner_loader.get(file_type)
elements = partition_epub(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.HTML:
partition_html = partitioner_loader.get(file_type)
elements = partition_html(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
encoding=encoding,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type.partitioner_shortname == "image":
partition_image = partitioner_loader.get(file_type)
elements = partition_image(
# -- extracting this post-processing to allow multiple exit-points from function --
def augment_metadata(elements: list[Element]) -> list[Element]:
"""Add some metadata fields to each element."""
for element in elements:
element.metadata.url = url
element.metadata.data_source = data_source_metadata
if content_type is not None:
out_filetype = FileType.from_mime_type(content_type)
element.metadata.filetype = out_filetype.mime_type if out_filetype else None
else:
element.metadata.filetype = file_type.mime_type

return elements

# -- handle PDF/Image partitioning separately because they have a lot of special-case
# -- parameters. We'll come back to this after sorting out the other file types.
if file_type == FileType.PDF:
partition_pdf = partitioner_loader.get(file_type)
elements = partition_pdf(
filename=filename,
file=file,
url=None,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
@@ -282,7 +211,30 @@ def partition(
starting_page_number=starting_page_number,
**kwargs,
)
elif file_type == FileType.JSON:
return augment_metadata(elements)

if file_type.partitioner_shortname == "image":
partition_image = partitioner_loader.get(file_type)
elements = partition_image(
filename=filename,
file=file,
url=None,
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
hi_res_model_name=hi_res_model_name or model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs,
)
return augment_metadata(elements)

# -- JSON is a special case because it's not a document format per se and is insensitive to
# -- most of the parameters that apply to other file types.
if file_type == FileType.JSON:
if not is_json_processable(filename=filename, file=file):
raise ValueError(
"Detected a JSON file that does not conform to the Unstructured schema. "
@@ -290,173 +242,28 @@ def partition(
)
partition_json = partitioner_loader.get(file_type)
elements = partition_json(filename=filename, file=file, **kwargs)
elif file_type == FileType.MD:
partition_md = partitioner_loader.get(file_type)
elements = partition_md(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.MSG:
partition_msg = partitioner_loader.get(file_type)
elements = partition_msg(
filename=filename,
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.ODT:
partition_odt = partitioner_loader.get(file_type)
elements = partition_odt(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.ORG:
partition_org = partitioner_loader.get(file_type)
elements = partition_org(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.PDF:
partition_pdf = partitioner_loader.get(file_type)
elements = partition_pdf(
filename=filename,
file=file,
url=None,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
hi_res_model_name=hi_res_model_name or model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs,
)
elif file_type == FileType.PPT:
partition_ppt = partitioner_loader.get(file_type)
elements = partition_ppt(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.PPTX:
partition_pptx = partitioner_loader.get(file_type)
elements = partition_pptx(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.RST:
partition_rst = partitioner_loader.get(file_type)
elements = partition_rst(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.RTF:
partition_rtf = partitioner_loader.get(file_type)
elements = partition_rtf(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.TSV:
partition_tsv = partitioner_loader.get(file_type)
elements = partition_tsv(
filename=filename,
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.TXT:
partition_text = partitioner_loader.get(file_type)
elements = partition_text(
filename=filename,
file=file,
encoding=encoding,
paragraph_grouper=paragraph_grouper,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type in (FileType.XLS, FileType.XLSX):
partition_xlsx = partitioner_loader.get(file_type)
elements = partition_xlsx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs,
)
elif file_type == FileType.XML:
partition_xml = partitioner_loader.get(file_type)
elements = partition_xml(
filename=filename,
file=file,
encoding=encoding,
xml_keep_tags=xml_keep_tags,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.EMPTY:
elements = []
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise UnsupportedFileFormatError(
f"{msg}. The {file_type} file type is not supported in partition."
)
return augment_metadata(elements)

for element in elements:
element.metadata.url = url
element.metadata.data_source = data_source_metadata
if content_type is not None:
out_filetype = FileType.from_mime_type(content_type)
element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
else:
element.metadata.filetype = file_type.mime_type
# -- EMPTY is also a special case because while we can't determine the file type, we can be
# -- sure it doesn't contain any elements.
if file_type == FileType.EMPTY:
return []

return elements
# ============================================================================================
# ALL OTHER FILE TYPES
# ============================================================================================

partitioning_kwargs = copy.deepcopy(kwargs)
partitioning_kwargs["detect_language_per_element"] = detect_language_per_element
partitioning_kwargs["encoding"] = encoding
partitioning_kwargs["infer_table_structure"] = infer_table_structure
partitioning_kwargs["languages"] = languages
partitioning_kwargs["starting_page_number"] = starting_page_number
partitioning_kwargs["strategy"] = strategy

partition = partitioner_loader.get(file_type)
elements = partition(filename=filename, file=file, **partitioning_kwargs)
return augment_metadata(elements)


def file_and_type_from_url(
@@ -499,17 +306,15 @@ def decide_table_extraction(
class _PartitionerLoader:
"""Provides uniform helpful error when a partitioner dependency is not installed.

Used by `partition()` to encapsulate coping with the possibility the Python
environment it is executing in may not have all dependencies installed for a
particular partitioner.
Used by `partition()` to encapsulate coping with the possibility the Python environment it is
executing in may not have all dependencies installed for a particular partitioner.

Provides `.get()` to access partitioners by file-type, which raises when one or
more dependencies for that partitioner are not installed.
Provides `.get()` to access partitioners by file-type, which raises when one or more
dependencies for that partitioner are not installed.

The error message indicates what extra needs to be installed to enable that
partitioner. This avoids an inconsistent variety of possibly puzzling exceptions
arising from much deeper in the partitioner when access to the missing dependency is
first attempted.
The error message indicates what extra needs to be installed to enable that partitioner. This
avoids an inconsistent variety of possibly puzzling exceptions arising from much deeper in the
partitioner when access to the missing dependency is first attempted.
"""

# -- module-lifetime cache for partitioners once loaded --
@@ -519,8 +324,15 @@ class _PartitionerLoader:
"""Return partitioner for `file_type`.

Raises when one or more package dependencies for that file-type have not been
installed.
installed. Also raises when the file-type is not partitionable.
"""
if not file_type.is_partitionable:
raise UnsupportedFileFormatError(
f"Partitioning is not supported for the {file_type} file type."
)

# -- if the partitioner is not in the cache, load it; note this raises if one or more of
# -- the partitioner's dependencies is not installed.
if file_type not in self._partitioners:
self._partitioners[file_type] = self._load_partitioner(file_type)
@@ -51,7 +51,6 @@ def partition_epub(

return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.EPUB,
metadata_last_modified=metadata_last_modified or last_modified,
@@ -126,14 +126,6 @@ class HtmlPartitionerOptions:
"""Trace of initial partitioner to be included in metadata for debugging purposes."""
return self._detection_origin

@lazyproperty
def encoding(self) -> str | None:
"""Caller-provided encoding used to store HTML character stream as bytes.

`None` when no encoding was provided and encoding should be auto-detected.
"""
return self._encoding

@lazyproperty
def html_text(self) -> str:
"""The HTML document as a string, loaded from wherever the caller specified."""
@@ -42,7 +42,6 @@ def partition_org(

return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.ORG,
metadata_last_modified=metadata_last_modified or last_modified,
@@ -42,7 +42,6 @@ def partition_rst(

return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.RST,
metadata_last_modified=metadata_last_modified or last_modified,
@@ -42,7 +42,6 @@ def partition_rtf(

return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.RTF,
metadata_last_modified=metadata_last_modified or last_modified,