rfctr: prep for pluggable partitioners (#3806)

**Summary**
Prepare auto-partitioning for pluggable partitioners.

Move toward a uniform partitioner call signature in `auto/partition()`
such that a custom or override partitioner can be registered without
requiring code changes.

**Additional Context**
The central job of `auto/partition()` is to detect the file-type of the
given file and use that to dispatch partitioning to the corresponding
partitioner function e.g. `partition_pdf()` or `partition_docx()`.

In the existing code, each partitioner function is called with
parameters "hand-picked" from the available parameters passed to the
`partition()` function. This is unnecessary and couples those
partitioners tightly with the dispatch function. The desired state is
that all available arguments are passed as `kwargs` and the partitioner
function "self-selects" the arguments it will be sensitive to, applies
its own appropriate default values when the argument is omitted, and
simply ignores any arguments it doesn't use. Note that achieving this
requires no changes to partitioner functions because they already do
precisely this.

So the job is to pass all arguments (other than `filename` and `file`)
to the partitioner as `kwargs`. This will allow additional or alternate
partitioners to be registered at runtime and dispatched to, because as
long as they have the signature `partition_x(filename, file, **kwargs) ->
list[Element]` then they can be dispatched to without customization.
This commit is contained in:
Steve Canny 2024-12-10 12:44:34 -08:00 committed by GitHub
parent b981d7197f
commit 3b718ec89a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 114 additions and 338 deletions

View File

@ -1,8 +1,14 @@
## 0.16.11 ## 0.16.12-dev0
### Enhancements
- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.
### Features
### Fixes ### Fixes
- Fix ipv4 regex to correctly include up to three digit octets. ## 0.16.11
### Enhancements ### Enhancements
@ -14,6 +20,8 @@
### Fixes ### Fixes
- Fix ipv4 regex to correctly include up to three digit octets.
## 0.16.10 ## 0.16.10
### Enhancements ### Enhancements

View File

@ -29,6 +29,7 @@ from unstructured.staging.base import elements_to_json
("Title", 0): 4, ("Title", 0): 4,
("Title", 1): 1, ("Title", 1): 1,
("NarrativeText", 0): 3, ("NarrativeText", 0): 3,
("PageBreak", None): 3,
("ListItem", 0): 6, ("ListItem", 0): 6,
("ListItem", 1): 6, ("ListItem", 1): 6,
("ListItem", 2): 3, ("ListItem", 2): 3,

View File

@ -1232,17 +1232,6 @@ class DescribeHtmlPartitionerOptions:
assert opts.detection_origin == detection_origin assert opts.detection_origin == detection_origin
# -- .encoding -------------------------------
@pytest.mark.parametrize("encoding", ["utf-8", None])
def it_knows_the_caller_provided_encoding(
self, encoding: str | None, opts_args: dict[str, Any]
):
opts_args["encoding"] = encoding
opts = HtmlPartitionerOptions(**opts_args)
assert opts.encoding == encoding
# -- .html_text ------------------------------ # -- .html_text ------------------------------
def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]): def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]):

View File

@ -2,7 +2,6 @@
from __future__ import annotations from __future__ import annotations
import io
import json import json
import os import os
import pathlib import pathlib
@ -561,7 +560,6 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
strategy=PartitionStrategy.FAST, strategy=PartitionStrategy.FAST,
languages=None, languages=None,
metadata_filename=None, metadata_filename=None,
include_page_breaks=False,
infer_table_structure=False, infer_table_structure=False,
extract_images_in_pdf=False, extract_images_in_pdf=False,
extract_image_block_types=None, extract_image_block_types=None,
@ -897,7 +895,7 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
with pytest.raises( with pytest.raises(
UnsupportedFileFormatError, UnsupportedFileFormatError,
match="Invalid file made-up.fake. The FileType.UNK file type is not supported in partiti", match="Partitioning is not supported for the FileType.UNK file type.",
): ):
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES) partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
@ -1037,26 +1035,6 @@ def test_auto_partition_forwards_metadata_filename_via_kwargs():
assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements) assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements)
def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
file_path = example_doc_path("fake-text.txt")
with open(file_path, "rb") as f:
elements = partition(file=f, file_filename=file_path)
assert all(e.metadata.filename == "fake-text.txt" for e in elements)
assert caplog.records[0].levelname == "WARNING"
assert "The file_filename kwarg will be deprecated" in caplog.text
def test_auto_partition_raises_when_both_file_filename_and_metadata_filename_args_are_used():
file_path = example_doc_path("fake-text.txt")
with open(file_path, "rb") as f:
file = io.BytesIO(f.read())
with pytest.raises(ValueError, match="Only one of metadata_filename and file_filename is spe"):
partition(file=file, file_filename=file_path, metadata_filename=file_path)
# -- ocr_languages -------------------------------------------------------- # -- ocr_languages --------------------------------------------------------

View File

@ -1 +1 @@
__version__ = "0.16.11" # pragma: no cover __version__ = "0.16.12-dev0" # pragma: no cover

View File

@ -2,9 +2,10 @@
from __future__ import annotations from __future__ import annotations
import copy
import importlib import importlib
import io import io
from typing import IO, Any, Callable, Literal, Optional from typing import IO, Any, Callable, Optional
import requests import requests
from typing_extensions import TypeAlias from typing_extensions import TypeAlias
@ -25,17 +26,15 @@ Partitioner: TypeAlias = Callable[..., list[Element]]
def partition( def partition(
filename: Optional[str] = None, filename: Optional[str] = None,
*, *,
content_type: Optional[str] = None,
file: Optional[IO[bytes]] = None, file: Optional[IO[bytes]] = None,
file_filename: Optional[str] = None,
url: Optional[str] = None,
include_page_breaks: bool = False,
strategy: str = PartitionStrategy.AUTO,
encoding: Optional[str] = None, encoding: Optional[str] = None,
paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None, content_type: Optional[str] = None,
url: Optional[str] = None,
headers: dict[str, str] = {}, headers: dict[str, str] = {},
skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
ssl_verify: bool = True, ssl_verify: bool = True,
request_timeout: Optional[int] = None,
strategy: str = PartitionStrategy.AUTO,
skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
ocr_languages: Optional[str] = None, # changing to optional for deprecation ocr_languages: Optional[str] = None, # changing to optional for deprecation
languages: Optional[list[str]] = None, languages: Optional[list[str]] = None,
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
@ -44,15 +43,13 @@ def partition(
extract_image_block_types: Optional[list[str]] = None, extract_image_block_types: Optional[list[str]] = None,
extract_image_block_output_dir: Optional[str] = None, extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None, data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
request_timeout: Optional[int] = None,
hi_res_model_name: Optional[str] = None, hi_res_model_name: Optional[str] = None,
model_name: Optional[str] = None, # to be deprecated model_name: Optional[str] = None, # to be deprecated
starting_page_number: int = 1, starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
): ) -> list[Element]:
"""Partitions a document into its constituent elements. """Partitions a document into its constituent elements.
Uses libmagic to determine the file's type and route it to the appropriate partitioning Uses libmagic to determine the file's type and route it to the appropriate partitioning
@ -63,30 +60,32 @@ def partition(
---------- ----------
filename filename
A string defining the target filename path. A string defining the target filename path.
content_type
A string defining the file content in MIME type
file file
A file-like object using "rb" mode --> open(filename, "rb"). A file-like object using "rb" mode --> open(filename, "rb").
metadata_filename encoding
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" The character-encoding used to decode the input bytes when drawn from `filename` or `file`.
Defaults to "utf-8".
url url
The url for a remote document. Pass in content_type if you want partition to treat The url for a remote document. Pass in content_type if you want partition to treat
the document as a specific content_type. the document as a specific content_type.
include_page_breaks headers
If True, the output will include page breaks if the filetype supports it The headers to be used in conjunction with the HTTP request if URL is set.
ssl_verify
If the URL parameter is set, determines whether or not partition uses SSL verification
in the HTTP request.
request_timeout
The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
requests will block indefinitely.
content_type
A string defining the file content in MIME type
metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
strategy strategy
The strategy to use for partitioning PDF/image. Uses a layout detection model if set The strategy to use for partitioning PDF/image. Uses a layout detection model if set
to 'hi_res', otherwise partition simply extracts the text from the document to 'hi_res', otherwise partition simply extracts the text from the document
and processes it. and processes it.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
headers
The headers to be used in conjunction with the HTTP request if URL is set.
skip_infer_table_types skip_infer_table_types
The document types that you want to skip table extraction with. The document types that you want to skip table extraction with.
ssl_verify
If the URL parameter is set, determines whether or not partition uses SSL verification
in the HTTP request.
languages languages
The languages present in the document, for use in partitioning and/or OCR. For partitioning The languages present in the document, for use in partitioning and/or OCR. For partitioning
image or pdf documents with Tesseract, you'll first need to install the appropriate image or pdf documents with Tesseract, you'll first need to install the appropriate
@ -124,12 +123,6 @@ def partition(
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`. Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
The filesystem path for saving images of the element type(s) The filesystem path for saving images of the element type(s)
specified in 'extract_image_block_types'. specified in 'extract_image_block_types'.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
request_timeout
The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
requests will block indefinitely.
hi_res_model_name hi_res_model_name
The layout detection model used when partitioning strategy is set to `hi_res`. The layout detection model used when partitioning strategy is set to `hi_res`.
model_name model_name
@ -142,18 +135,6 @@ def partition(
""" """
exactly_one(file=file, filename=filename, url=url) exactly_one(file=file, filename=filename, url=url)
if metadata_filename and file_filename:
raise ValueError(
"Only one of metadata_filename and file_filename is specified. "
"metadata_filename is preferred. file_filename is marked for deprecation.",
)
if file_filename is not None:
metadata_filename = file_filename
logger.warning(
"The file_filename kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filename instead.",
)
kwargs.setdefault("metadata_filename", metadata_filename) kwargs.setdefault("metadata_filename", metadata_filename)
if pdf_infer_table_structure: if pdf_infer_table_structure:
@ -197,80 +178,28 @@ def partition(
partitioner_loader = _PartitionerLoader() partitioner_loader = _PartitionerLoader()
if file_type == FileType.CSV: # -- extracting this post-processing to allow multiple exit-points from function --
partition_csv = partitioner_loader.get(file_type) def augment_metadata(elements: list[Element]) -> list[Element]:
elements = partition_csv( """Add some metadata fields to each element."""
filename=filename, for element in elements:
file=file, element.metadata.url = url
encoding=encoding, element.metadata.data_source = data_source_metadata
infer_table_structure=infer_table_structure, if content_type is not None:
languages=languages, out_filetype = FileType.from_mime_type(content_type)
detect_language_per_element=detect_language_per_element, element.metadata.filetype = out_filetype.mime_type if out_filetype else None
**kwargs, else:
) element.metadata.filetype = file_type.mime_type
elif file_type == FileType.DOC:
partition_doc = partitioner_loader.get(file_type) return elements
elements = partition_doc(
filename=filename, # -- handle PDF/Image partitioning separately because they have a lot of special-case
file=file, # -- parameters. We'll come back to this after sorting out the other file types.
infer_table_structure=infer_table_structure, if file_type == FileType.PDF:
languages=languages, partition_pdf = partitioner_loader.get(file_type)
detect_language_per_element=detect_language_per_element, elements = partition_pdf(
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.DOCX:
partition_docx = partitioner_loader.get(file_type)
elements = partition_docx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.EML:
partition_email = partitioner_loader.get(file_type)
elements = partition_email(
filename=filename,
file=file,
encoding=encoding,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.EPUB:
partition_epub = partitioner_loader.get(file_type)
elements = partition_epub(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.HTML:
partition_html = partitioner_loader.get(file_type)
elements = partition_html(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
encoding=encoding,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type.partitioner_shortname == "image":
partition_image = partitioner_loader.get(file_type)
elements = partition_image(
filename=filename, filename=filename,
file=file, file=file,
url=None, url=None,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
strategy=strategy, strategy=strategy,
languages=languages, languages=languages,
@ -282,7 +211,30 @@ def partition(
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
elif file_type == FileType.JSON: return augment_metadata(elements)
if file_type.partitioner_shortname == "image":
partition_image = partitioner_loader.get(file_type)
elements = partition_image(
filename=filename,
file=file,
url=None,
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
hi_res_model_name=hi_res_model_name or model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs,
)
return augment_metadata(elements)
# -- JSON is a special case because it's not a document format per se and is insensitive to
# -- most of the parameters that apply to other file types.
if file_type == FileType.JSON:
if not is_json_processable(filename=filename, file=file): if not is_json_processable(filename=filename, file=file):
raise ValueError( raise ValueError(
"Detected a JSON file that does not conform to the Unstructured schema. " "Detected a JSON file that does not conform to the Unstructured schema. "
@ -290,173 +242,28 @@ def partition(
) )
partition_json = partitioner_loader.get(file_type) partition_json = partitioner_loader.get(file_type)
elements = partition_json(filename=filename, file=file, **kwargs) elements = partition_json(filename=filename, file=file, **kwargs)
elif file_type == FileType.MD: return augment_metadata(elements)
partition_md = partitioner_loader.get(file_type)
elements = partition_md(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.MSG:
partition_msg = partitioner_loader.get(file_type)
elements = partition_msg(
filename=filename,
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.ODT:
partition_odt = partitioner_loader.get(file_type)
elements = partition_odt(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.ORG:
partition_org = partitioner_loader.get(file_type)
elements = partition_org(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.PDF:
partition_pdf = partitioner_loader.get(file_type)
elements = partition_pdf(
filename=filename,
file=file,
url=None,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
hi_res_model_name=hi_res_model_name or model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs,
)
elif file_type == FileType.PPT:
partition_ppt = partitioner_loader.get(file_type)
elements = partition_ppt(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.PPTX:
partition_pptx = partitioner_loader.get(file_type)
elements = partition_pptx(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif file_type == FileType.RST:
partition_rst = partitioner_loader.get(file_type)
elements = partition_rst(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.RTF:
partition_rtf = partitioner_loader.get(file_type)
elements = partition_rtf(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.TSV:
partition_tsv = partitioner_loader.get(file_type)
elements = partition_tsv(
filename=filename,
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.TXT:
partition_text = partitioner_loader.get(file_type)
elements = partition_text(
filename=filename,
file=file,
encoding=encoding,
paragraph_grouper=paragraph_grouper,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type in (FileType.XLS, FileType.XLSX):
partition_xlsx = partitioner_loader.get(file_type)
elements = partition_xlsx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs,
)
elif file_type == FileType.XML:
partition_xml = partitioner_loader.get(file_type)
elements = partition_xml(
filename=filename,
file=file,
encoding=encoding,
xml_keep_tags=xml_keep_tags,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif file_type == FileType.EMPTY:
elements = []
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise UnsupportedFileFormatError(
f"{msg}. The {file_type} file type is not supported in partition."
)
for element in elements: # -- EMPTY is also a special case because while we can't determine the file type, we can be
element.metadata.url = url # -- sure it doesn't contain any elements.
element.metadata.data_source = data_source_metadata if file_type == FileType.EMPTY:
if content_type is not None: return []
out_filetype = FileType.from_mime_type(content_type)
element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
else:
element.metadata.filetype = file_type.mime_type
return elements # ============================================================================================
# ALL OTHER FILE TYPES
# ============================================================================================
partitioning_kwargs = copy.deepcopy(kwargs)
partitioning_kwargs["detect_language_per_element"] = detect_language_per_element
partitioning_kwargs["encoding"] = encoding
partitioning_kwargs["infer_table_structure"] = infer_table_structure
partitioning_kwargs["languages"] = languages
partitioning_kwargs["starting_page_number"] = starting_page_number
partitioning_kwargs["strategy"] = strategy
partition = partitioner_loader.get(file_type)
elements = partition(filename=filename, file=file, **partitioning_kwargs)
return augment_metadata(elements)
def file_and_type_from_url( def file_and_type_from_url(
@ -499,17 +306,15 @@ def decide_table_extraction(
class _PartitionerLoader: class _PartitionerLoader:
"""Provides uniform helpful error when a partitioner dependency is not installed. """Provides uniform helpful error when a partitioner dependency is not installed.
Used by `partition()` to encapsulate coping with the possibility the Python Used by `partition()` to encapsulate coping with the possibility the Python environment it is
environment it is executing in may not have all dependencies installed for a executing in may not have all dependencies installed for a particular partitioner.
particular partitioner.
Provides `.get()` to access partitioners by file-type, which raises when one or Provides `.get()` to access partitioners by file-type, which raises when one or more
more dependencies for that partitioner are not installed. dependencies for that partitioner are not installed.
The error message indicates what extra needs to be installed to enable that The error message indicates what extra needs to be installed to enable that partitioner. This
partitioner. This avoids an inconsistent variety of possibly puzzling exceptions avoids an inconsistent variety of possibly puzzling exceptions arising from much deeper in the
arising from much deeper in the partitioner when access to the missing dependency is partitioner when access to the missing dependency is first attempted.
first attempted.
""" """
# -- module-lifetime cache for partitioners once loaded -- # -- module-lifetime cache for partitioners once loaded --
@ -519,8 +324,15 @@ class _PartitionerLoader:
"""Return partitioner for `file_type`. """Return partitioner for `file_type`.
Raises when one or more package dependencies for that file-type have not been Raises when one or more package dependencies for that file-type have not been
installed. installed. Also raises when the file-type is not partitionable.
""" """
if not file_type.is_partitionable:
raise UnsupportedFileFormatError(
f"Partitioning is not supported for the {file_type} file type."
)
# -- if the partitioner is not in the cache, load it; note this raises if one or more of
# -- the partitioner's dependencies is not installed.
if file_type not in self._partitioners: if file_type not in self._partitioners:
self._partitioners[file_type] = self._load_partitioner(file_type) self._partitioners[file_type] = self._load_partitioner(file_type)

View File

@ -51,7 +51,6 @@ def partition_epub(
return partition_html( return partition_html(
text=html_text, text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename, metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.EPUB, metadata_file_type=FileType.EPUB,
metadata_last_modified=metadata_last_modified or last_modified, metadata_last_modified=metadata_last_modified or last_modified,

View File

@ -126,14 +126,6 @@ class HtmlPartitionerOptions:
"""Trace of initial partitioner to be included in metadata for debugging purposes.""" """Trace of initial partitioner to be included in metadata for debugging purposes."""
return self._detection_origin return self._detection_origin
@lazyproperty
def encoding(self) -> str | None:
"""Caller-provided encoding used to store HTML character stream as bytes.
`None` when no encoding was provided and encoding should be auto-detected.
"""
return self._encoding
@lazyproperty @lazyproperty
def html_text(self) -> str: def html_text(self) -> str:
"""The HTML document as a string, loaded from wherever the caller specified.""" """The HTML document as a string, loaded from wherever the caller specified."""

View File

@ -42,7 +42,6 @@ def partition_org(
return partition_html( return partition_html(
text=html_text, text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename, metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.ORG, metadata_file_type=FileType.ORG,
metadata_last_modified=metadata_last_modified or last_modified, metadata_last_modified=metadata_last_modified or last_modified,

View File

@ -42,7 +42,6 @@ def partition_rst(
return partition_html( return partition_html(
text=html_text, text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename, metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.RST, metadata_file_type=FileType.RST,
metadata_last_modified=metadata_last_modified or last_modified, metadata_last_modified=metadata_last_modified or last_modified,

View File

@ -42,7 +42,6 @@ def partition_rtf(
return partition_html( return partition_html(
text=html_text, text=html_text,
encoding="unicode",
metadata_filename=metadata_filename or filename, metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.RTF, metadata_file_type=FileType.RTF,
metadata_last_modified=metadata_last_modified or last_modified, metadata_last_modified=metadata_last_modified or last_modified,