mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-25 08:19:22 +00:00
rfctr(auto): add _PartitionerLoader (#3418)
**Summary** Replace conditional explicit import of partitioner modules in `.partition.auto` with the new `_PartitionerLoader` class. This avoids unbound variable warnings and is much less noisy. `_PartitionerLoader` makes use of the new `FileType` property `.importable_package_dependencies` to determine whether all required packages are importable before dispatching the file to its partitioner. It uses `FileType.extra_name` to form a helpful error message when a dependency is not installed, so the caller knows which `pip install` extra to specify to remedy the error. `_PartitionerLoader` uses the `FileType` properties `.partitioner_module_qname` and `.partitioner_function_name` to load the partitioner once its dependencies are verified. Loaded partitioners are cached with module lifetime scope for efficiency.
This commit is contained in:
parent
ec59abfabc
commit
49c4bd34be
@ -1,3 +1,11 @@
|
||||
## 0.15.1-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.15.0
|
||||
|
||||
### Enhancements
|
||||
|
@ -50,6 +50,68 @@ class DescribeFileType:
|
||||
def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
|
||||
assert FileType.from_mime_type(mime_type) is None
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "expected_value"),
|
||||
[
|
||||
(FileType.BMP, ("unstructured_inference",)),
|
||||
(FileType.CSV, ("pandas",)),
|
||||
(FileType.DOC, ("docx",)),
|
||||
(FileType.EMPTY, ()),
|
||||
(FileType.HTML, ()),
|
||||
(FileType.ODT, ("docx", "pypandoc")),
|
||||
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
|
||||
(FileType.UNK, ()),
|
||||
(FileType.WAV, ()),
|
||||
(FileType.ZIP, ()),
|
||||
],
|
||||
)
|
||||
def it_knows_which_importable_packages_its_partitioner_depends_on(
|
||||
self, file_type: FileType, expected_value: tuple[str, ...]
|
||||
):
|
||||
assert file_type.importable_package_dependencies == expected_value
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "expected_value"),
|
||||
[
|
||||
(FileType.BMP, "image"),
|
||||
(FileType.DOC, "doc"),
|
||||
(FileType.DOCX, "docx"),
|
||||
(FileType.EML, None),
|
||||
(FileType.EMPTY, None),
|
||||
(FileType.MSG, "msg"),
|
||||
(FileType.PDF, "pdf"),
|
||||
(FileType.XLS, "xlsx"),
|
||||
(FileType.UNK, None),
|
||||
(FileType.WAV, None),
|
||||
(FileType.ZIP, None),
|
||||
],
|
||||
)
|
||||
def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies(
|
||||
self, file_type: FileType, expected_value: str | None
|
||||
):
|
||||
assert file_type.extra_name == expected_value
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "expected_value"),
|
||||
[
|
||||
(FileType.BMP, True),
|
||||
(FileType.CSV, True),
|
||||
(FileType.DOC, True),
|
||||
(FileType.EML, True),
|
||||
(FileType.JPG, True),
|
||||
(FileType.PDF, True),
|
||||
(FileType.PPTX, True),
|
||||
(FileType.WAV, False),
|
||||
(FileType.ZIP, False),
|
||||
(FileType.EMPTY, False),
|
||||
(FileType.UNK, False),
|
||||
],
|
||||
)
|
||||
def it_knows_whether_files_of_its_type_are_directly_partitionable(
|
||||
self, file_type: FileType, expected_value: str
|
||||
):
|
||||
assert file_type.is_partitionable is expected_value
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "mime_type"),
|
||||
[
|
||||
@ -68,3 +130,72 @@ class DescribeFileType:
|
||||
)
|
||||
def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
|
||||
assert file_type.mime_type == mime_type
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "expected_value"),
|
||||
[
|
||||
(FileType.BMP, "partition_image"),
|
||||
(FileType.CSV, "partition_csv"),
|
||||
(FileType.DOC, "partition_doc"),
|
||||
(FileType.DOCX, "partition_docx"),
|
||||
(FileType.JPG, "partition_image"),
|
||||
(FileType.PNG, "partition_image"),
|
||||
(FileType.TIFF, "partition_image"),
|
||||
],
|
||||
)
|
||||
def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
|
||||
assert file_type.partitioner_function_name == expected_value
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
|
||||
)
|
||||
def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
|
||||
self, file_type: FileType
|
||||
):
|
||||
with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
|
||||
file_type.partitioner_function_name
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "expected_value"),
|
||||
[
|
||||
(FileType.BMP, "unstructured.partition.image"),
|
||||
(FileType.CSV, "unstructured.partition.csv"),
|
||||
(FileType.DOC, "unstructured.partition.doc"),
|
||||
(FileType.DOCX, "unstructured.partition.docx"),
|
||||
(FileType.JPG, "unstructured.partition.image"),
|
||||
(FileType.PNG, "unstructured.partition.image"),
|
||||
(FileType.TIFF, "unstructured.partition.image"),
|
||||
],
|
||||
)
|
||||
def it_knows_the_fully_qualified_name_of_its_partitioner_module(
|
||||
self, file_type: FileType, expected_value: str
|
||||
):
|
||||
assert file_type.partitioner_module_qname == expected_value
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
|
||||
)
|
||||
def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
|
||||
self, file_type: FileType
|
||||
):
|
||||
with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
|
||||
file_type.partitioner_module_qname
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file_type", "expected_value"),
|
||||
[
|
||||
(FileType.BMP, "image"),
|
||||
(FileType.CSV, "csv"),
|
||||
(FileType.DOC, "doc"),
|
||||
(FileType.DOCX, "docx"),
|
||||
(FileType.JPG, "image"),
|
||||
(FileType.PNG, "image"),
|
||||
(FileType.TIFF, "image"),
|
||||
(FileType.XLS, "xlsx"),
|
||||
(FileType.XLSX, "xlsx"),
|
||||
],
|
||||
)
|
||||
def it_provides_access_to_the_partitioner_shortname(
|
||||
self, file_type: FileType, expected_value: str
|
||||
):
|
||||
assert file_type.partitioner_shortname == expected_value
|
||||
|
@ -10,7 +10,7 @@ import sys
|
||||
import tempfile
|
||||
import warnings
|
||||
from importlib import import_module
|
||||
from typing import Callable, Iterator, cast
|
||||
from typing import Iterator, cast
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@ -27,7 +27,6 @@ from test_unstructured.unit_utils import (
|
||||
ANY,
|
||||
FixtureRequest,
|
||||
LogCaptureFixture,
|
||||
MonkeyPatch,
|
||||
example_doc_path,
|
||||
function_mock,
|
||||
method_mock,
|
||||
@ -46,8 +45,7 @@ from unstructured.documents.elements import (
|
||||
Title,
|
||||
)
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition import auto
|
||||
from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition
|
||||
from unstructured.partition.auto import _PartitionerLoader, partition
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
||||
|
||||
@ -570,16 +568,21 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
|
||||
assert e.text.startswith("Zejiang Shen")
|
||||
|
||||
|
||||
def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
|
||||
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
|
||||
partition_pdf_ = function_mock(
|
||||
request,
|
||||
"unstructured.partition.pdf.partition_pdf",
|
||||
return_value=[NarrativeText("Hello there!")],
|
||||
)
|
||||
partitioner_loader_get_ = method_mock(
|
||||
request, _PartitionerLoader, "get", return_value=partition_pdf_
|
||||
)
|
||||
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
|
||||
mock_return = [NarrativeText("Hello there!")]
|
||||
with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
|
||||
mock_partition_with_extras_map = {"pdf": mock_partition}
|
||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
|
||||
partition(filename=file_path, strategy=PartitionStrategy.FAST)
|
||||
partition(file_path, strategy=PartitionStrategy.FAST)
|
||||
|
||||
mock_partition.assert_called_once_with(
|
||||
partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
|
||||
partition_pdf_.assert_called_once_with(
|
||||
filename=file_path,
|
||||
file=None,
|
||||
url=None,
|
||||
@ -919,10 +922,10 @@ def test_auto_partition_xml_from_file_with_tags():
|
||||
|
||||
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
|
||||
detect_filetype_ = function_mock(
|
||||
request, "unstructured.partition.auto.detect_filetype", return_value=None
|
||||
request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Invalid file made-up.fake. The None file type is not "):
|
||||
with pytest.raises(ValueError, match="Invalid file made-up.fake. The FileType.UNK file type "):
|
||||
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
|
||||
|
||||
detect_filetype_.assert_called_once_with(
|
||||
@ -1026,23 +1029,7 @@ def test_auto_partition_respects_detect_language_per_element_arg():
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"file_extension",
|
||||
[
|
||||
"doc",
|
||||
"docx",
|
||||
"eml",
|
||||
"epub",
|
||||
"html",
|
||||
"md",
|
||||
"odt",
|
||||
"org",
|
||||
"ppt",
|
||||
"pptx",
|
||||
"rst",
|
||||
"rtf",
|
||||
"txt",
|
||||
"xml",
|
||||
],
|
||||
"file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split()
|
||||
)
|
||||
def test_auto_partition_respects_language_arg(file_extension: str):
|
||||
elements = partition(
|
||||
@ -1167,7 +1154,7 @@ def test_auto_partition_respects_skip_infer_table_types(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("content_type", "filetype_shortname", "expected_value"),
|
||||
("content_type", "shortname", "expected_value"),
|
||||
[
|
||||
("text/csv", "csv", "text/csv"),
|
||||
("text/html", "html", "text/html"),
|
||||
@ -1177,22 +1164,23 @@ def test_auto_partition_respects_skip_infer_table_types(
|
||||
def test_auto_partition_adds_filetype_to_metadata(
|
||||
request: FixtureRequest,
|
||||
content_type: str,
|
||||
filetype_shortname: str,
|
||||
shortname: str,
|
||||
expected_value: str | None,
|
||||
monkeypatch: MonkeyPatch,
|
||||
):
|
||||
partition_fn_ = function_mock(
|
||||
request,
|
||||
f"unstructured.partition.auto.partition_{filetype_shortname}",
|
||||
f"unstructured.partition.{shortname}.partition_{shortname}",
|
||||
return_value=[Text("text 1"), Text("text 2")],
|
||||
)
|
||||
mock_partition_with_extras_map = {filetype_shortname: partition_fn_}
|
||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
|
||||
partitioner_loader_get_ = method_mock(
|
||||
request, _PartitionerLoader, "get", return_value=partition_fn_
|
||||
)
|
||||
|
||||
elements = partition(
|
||||
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
|
||||
)
|
||||
|
||||
partitioner_loader_get_.assert_called_once()
|
||||
assert len(elements) == 2
|
||||
assert all(e.metadata.filetype == expected_value for e in elements)
|
||||
|
||||
@ -1207,20 +1195,23 @@ def test_auto_partition_adds_filetype_to_metadata(
|
||||
],
|
||||
)
|
||||
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
|
||||
request: FixtureRequest, content_type: str | None, monkeypatch: MonkeyPatch
|
||||
request: FixtureRequest, content_type: str | None
|
||||
):
|
||||
metadata = ElementMetadata(filetype="imapdf")
|
||||
partition_pdf_ = function_mock(
|
||||
request,
|
||||
"unstructured.partition.auto.partition_pdf",
|
||||
"unstructured.partition.pdf.partition_pdf",
|
||||
return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)],
|
||||
)
|
||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})
|
||||
partitioner_loader_get_ = method_mock(
|
||||
request, _PartitionerLoader, "get", return_value=partition_pdf_
|
||||
)
|
||||
|
||||
elements = partition(
|
||||
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
|
||||
)
|
||||
|
||||
partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
|
||||
assert len(elements) == 2
|
||||
assert all(e.metadata.filetype == "application/pdf" for e in elements)
|
||||
|
||||
@ -1231,7 +1222,7 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
|
||||
t
|
||||
for t in FileType
|
||||
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
|
||||
and t not in IMAGE_FILETYPES
|
||||
and t.partitioner_function_name != "partition_image"
|
||||
],
|
||||
)
|
||||
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
|
||||
@ -1305,10 +1296,18 @@ def test_auto_partition_from_file_works_on_empty_file():
|
||||
assert partition(file=f) == []
|
||||
|
||||
|
||||
def test_auto_partition_requiring_extras_prompts_to_install_missing_dependencies():
|
||||
partition_with_extras_map: dict[str, Callable[..., list[Element]]] = {}
|
||||
with pytest.raises(ImportError, match="partition_pdf is not available. Install the pdf depen"):
|
||||
_get_partition_with_extras("pdf", partition_with_extras_map)
|
||||
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
|
||||
request: FixtureRequest,
|
||||
):
|
||||
_PartitionerLoader._partitioners.pop(FileType.PDF, None)
|
||||
dependency_exists_ = function_mock(
|
||||
request, "unstructured.partition.auto.dependency_exists", return_value=False
|
||||
)
|
||||
match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"
|
||||
with pytest.raises(ImportError, match=match):
|
||||
partition(example_doc_path("layout-parser-paper-fast.pdf"))
|
||||
|
||||
dependency_exists_.assert_called_once_with("pdf2image")
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.0" # pragma: no cover
|
||||
__version__ = "0.15.1-dev0" # pragma: no cover
|
||||
|
@ -32,7 +32,7 @@ def detect_filetype(
|
||||
file: Optional[IO[bytes]] = None,
|
||||
file_filename: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> Optional[FileType]:
|
||||
) -> FileType:
|
||||
"""Use libmagic to determine a file's type.
|
||||
|
||||
Helps determine which partition brick to use for a given file. A return value of None indicates
|
||||
@ -122,7 +122,7 @@ def detect_filetype(
|
||||
".tsv",
|
||||
".json",
|
||||
]:
|
||||
return FileType.from_extension(extension)
|
||||
return FileType.from_extension(extension) or FileType.TXT
|
||||
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
|
@ -12,7 +12,17 @@ class FileType(enum.Enum):
|
||||
Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner.
|
||||
"""
|
||||
|
||||
_partitioner_shortname: str | None
|
||||
"""Like "docx", from which partitioner module and function-name can be derived via template."""
|
||||
|
||||
_importable_package_dependencies: tuple[str, ...]
|
||||
"""Packages that must be available for import for this file-type's partitioner to work."""
|
||||
|
||||
_extra_name: str | None
|
||||
"""`pip install` extra that provides package dependencies for this file-type."""
|
||||
|
||||
_extensions: tuple[str, ...]
|
||||
"""Filename-extensions recognized as this file-type. Use for secondary identification only."""
|
||||
|
||||
_canonical_mime_type: str
|
||||
"""The MIME-type used as `.metadata.filetype` for this file-type."""
|
||||
@ -23,12 +33,18 @@ class FileType(enum.Enum):
|
||||
def __new__(
|
||||
cls,
|
||||
value: str,
|
||||
partitioner_shortname: str | None,
|
||||
importable_package_dependencies: Iterable[str],
|
||||
extra_name: str | None,
|
||||
extensions: Iterable[str],
|
||||
canonical_mime_type: str,
|
||||
alias_mime_types: Iterable[str],
|
||||
):
|
||||
self = object.__new__(cls)
|
||||
self._value_ = value
|
||||
self._partitioner_shortname = partitioner_shortname
|
||||
self._importable_package_dependencies = tuple(importable_package_dependencies)
|
||||
self._extra_name = extra_name
|
||||
self._extensions = tuple(extensions)
|
||||
self._canonical_mime_type = canonical_mime_type
|
||||
self._alias_mime_types = tuple(alias_mime_types)
|
||||
@ -41,8 +57,150 @@ class FileType(enum.Enum):
|
||||
"""
|
||||
return self.name < other.name
|
||||
|
||||
BMP = ("bmp", [".bmp"], "image/bmp", cast(list[str], []))
|
||||
@classmethod
|
||||
def from_extension(cls, extension: str | None) -> FileType | None:
|
||||
"""Select a FileType member based on an extension.
|
||||
|
||||
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
|
||||
secondary file-type identification method but is unreliable for primary identification.
|
||||
|
||||
Returns `None` when `extension` is not registered for any supported file-type.
|
||||
"""
|
||||
if extension in (None, "", "."):
|
||||
return None
|
||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||
# -- limitations on defining a class variable on an Enum.
|
||||
for m in cls.__members__.values():
|
||||
if extension in m._extensions:
|
||||
return m
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def from_mime_type(cls, mime_type: str) -> FileType | None:
|
||||
"""Select a FileType member based on a MIME-type.
|
||||
|
||||
Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
|
||||
`FileType` member or one of its alias MIME-types.
|
||||
"""
|
||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||
# -- limitations on defining a class variable on an Enum.
|
||||
for m in cls.__members__.values():
|
||||
if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types:
|
||||
return m
|
||||
return None
|
||||
|
||||
@property
|
||||
def extra_name(self) -> str | None:
|
||||
"""The `pip` "extra" that must be installed to provide this file-type's dependencies.
|
||||
|
||||
Like "image" for PNG, as in `pip install "unstructured[image]"`.
|
||||
|
||||
`None` when partitioning this file-type requires only the base `unstructured` install.
|
||||
"""
|
||||
return self._extra_name
|
||||
|
||||
@property
|
||||
def importable_package_dependencies(self) -> tuple[str, ...]:
|
||||
"""Packages that must be importable for this file-type's partitioner to work.
|
||||
|
||||
In general, these are the packages provided by the `pip install` "extra" for this file-type,
|
||||
like `pip install "unstructured[docx]"` loads the `python-docx` package.
|
||||
|
||||
Note that these names are the ones used in an `import` statement, which is not necessarily
|
||||
the same as the _distribution_ package name used by `pip`. For example, the DOCX
|
||||
distribution package name is `"python-docx"` whereas the _importable_ package name is
|
||||
`"docx"`. This latter name as it appears like `import docx` is what is provided by this
|
||||
property.
|
||||
|
||||
The return value is an empty tuple for file-types that do not require optional dependencies.
|
||||
|
||||
Note this property does not complain when accessed on a non-partitionable file-type, it
|
||||
simply returns an empty tuple because file-types that are not partitionable require no
|
||||
optional dependencies.
|
||||
"""
|
||||
return self._importable_package_dependencies
|
||||
|
||||
@property
|
||||
def is_partitionable(self) -> bool:
|
||||
"""True when there is a partitioner for this file-type.
|
||||
|
||||
Note this does not check whether the dependencies for this file-type are installed so
|
||||
attempting to partition a file of this type may still fail. This is meant for
|
||||
distinguishing file-types like WAV, ZIP, EMPTY, and UNK which are legitimate file-types
|
||||
but have no associated partitioner.
|
||||
"""
|
||||
return bool(self._partitioner_shortname)
|
||||
|
||||
@property
|
||||
def mime_type(self) -> str:
|
||||
"""The canonical MIME-type for this file-type, suitable for use in metadata.
|
||||
|
||||
This value is used in `.metadata.filetype` for elements partitioned from files of this
|
||||
type. In general it is the "official", "recommended", or "de facto standard" MIME-type for
|
||||
files of this type, in that order, as available.
|
||||
"""
|
||||
return self._canonical_mime_type
|
||||
|
||||
@property
|
||||
def partitioner_function_name(self) -> str:
|
||||
"""Name of partitioner function for this file-type. Like "partition_docx".
|
||||
|
||||
Raises when this property is accessed on a file-type that is not partitionable. Use
|
||||
`.is_partitionable` to avoid exceptions when partitionability is unknown.
|
||||
"""
|
||||
# -- Raise when this property is accessed on a FileType member that has no partitioner
|
||||
# -- shortname. This prevents a harder-to-find bug from appearing far away from this call
|
||||
# -- when code would try to `getattr(module, None)` or whatever.
|
||||
if (shortname := self._partitioner_shortname) is None:
|
||||
raise ValueError(
|
||||
f"`.partitioner_function_name` is undefined because FileType.{self.name} is not"
|
||||
f" partitionable. Use `.is_partitionable` to determine whether a `FileType`"
|
||||
f" is partitionable."
|
||||
)
|
||||
return f"partition_{shortname}"
|
||||
|
||||
@property
|
||||
def partitioner_module_qname(self) -> str:
|
||||
"""Fully-qualified name of module providing partitioner for this file-type.
|
||||
|
||||
e.g. "unstructured.partition.docx" for FileType.DOCX.
|
||||
"""
|
||||
# -- Raise when this property is accessed on a FileType member that has no partitioner
|
||||
# -- shortname. This prevents a harder-to-find bug from appearing far away from this call
|
||||
# -- when code would try to `importlib.import_module(None)` or whatever.
|
||||
if (shortname := self._partitioner_shortname) is None:
|
||||
raise ValueError(
|
||||
f"`.partitioner_module_qname` is undefined because FileType.{self.name} is not"
|
||||
f" partitionable. Use `.is_partitionable` to determine whether a `FileType`"
|
||||
f" is partitionable."
|
||||
)
|
||||
return f"unstructured.partition.{shortname}"
|
||||
|
||||
@property
|
||||
def partitioner_shortname(self) -> str | None:
|
||||
"""Familiar name of partitioner, like "image" for file-types that use `partition_image()`.
|
||||
|
||||
One use is to determine whether a file-type is one of the five image types, all of which
|
||||
are processed by `partition_image()`.
|
||||
|
||||
`None` for file-types that are not partitionable, although `.is_partitionable` is the
|
||||
preferred way of discovering that.
|
||||
"""
|
||||
return self._partitioner_shortname
|
||||
|
||||
BMP = (
|
||||
"bmp", # -- value for this Enum member, like BMP = "bmp" in a simple enum --
|
||||
"image", # -- partitioner_shortname --
|
||||
["unstructured_inference"], # -- importable_package_dependencies --
|
||||
"image", # -- extra_name - like `pip install "unstructured[image]"` in this case --
|
||||
[".bmp"], # -- extensions - filename extensions that map to this file-type --
|
||||
"image/bmp", # -- canonical_mime_type - MIME-type written to `.metadata.filetype` --
|
||||
cast(list[str], []), # -- alias_mime-types - other MIME-types that map to this file-type --
|
||||
)
|
||||
CSV = (
|
||||
"csv",
|
||||
"csv",
|
||||
["pandas"],
|
||||
"csv",
|
||||
[".csv"],
|
||||
"text/csv",
|
||||
@ -54,38 +212,143 @@ class FileType(enum.Enum):
|
||||
"text/x-csv",
|
||||
],
|
||||
)
|
||||
DOC = ("doc", [".doc"], "application/msword", cast(list[str], []))
|
||||
DOC = ("doc", "doc", ["docx"], "doc", [".doc"], "application/msword", cast(list[str], []))
|
||||
DOCX = (
|
||||
"docx",
|
||||
"docx",
|
||||
["docx"],
|
||||
"docx",
|
||||
[".docx"],
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
cast(list[str], []),
|
||||
)
|
||||
EML = ("eml", [".eml", ".p7s"], "message/rfc822", cast(list[str], []))
|
||||
EPUB = ("epub", [".epub"], "application/epub", ["application/epub+zip"])
|
||||
HEIC = ("heic", [".heic"], "image/heic", cast(list[str], []))
|
||||
HTML = ("html", [".html", ".htm"], "text/html", cast(list[str], []))
|
||||
JPG = ("jpg", [".jpeg", ".jpg"], "image/jpeg", cast(list[str], []))
|
||||
JSON = ("json", [".json"], "application/json", cast(list[str], []))
|
||||
MD = ("md", [".md"], "text/markdown", ["text/x-markdown"])
|
||||
MSG = ("msg", [".msg"], "application/vnd.ms-outlook", ["application/x-ole-storage"])
|
||||
ODT = ("odt", [".odt"], "application/vnd.oasis.opendocument.text", cast(list[str], []))
|
||||
ORG = ("org", [".org"], "text/org", cast(list[str], []))
|
||||
PDF = ("pdf", [".pdf"], "application/pdf", cast(list[str], []))
|
||||
PNG = ("png", [".png"], "image/png", cast(list[str], []))
|
||||
PPT = ("ppt", [".ppt"], "application/vnd.ms-powerpoint", cast(list[str], []))
|
||||
EML = (
|
||||
"eml",
|
||||
"email",
|
||||
cast(list[str], []),
|
||||
None,
|
||||
[".eml", ".p7s"],
|
||||
"message/rfc822",
|
||||
cast(list[str], []),
|
||||
)
|
||||
EPUB = (
|
||||
"epub",
|
||||
"epub",
|
||||
["pypandoc"],
|
||||
"epub",
|
||||
[".epub"],
|
||||
"application/epub",
|
||||
["application/epub+zip"],
|
||||
)
|
||||
HEIC = (
|
||||
"heic",
|
||||
"image",
|
||||
["unstructured_inference"],
|
||||
"image",
|
||||
[".heic"],
|
||||
"image/heic",
|
||||
cast(list[str], []),
|
||||
)
|
||||
HTML = (
|
||||
"html",
|
||||
"html",
|
||||
cast(list[str], []),
|
||||
None,
|
||||
[".html", ".htm"],
|
||||
"text/html",
|
||||
cast(list[str], []),
|
||||
)
|
||||
JPG = (
|
||||
"jpg",
|
||||
"image",
|
||||
["unstructured_inference"],
|
||||
"image",
|
||||
[".jpeg", ".jpg"],
|
||||
"image/jpeg",
|
||||
cast(list[str], []),
|
||||
)
|
||||
JSON = (
|
||||
"json",
|
||||
"json",
|
||||
cast(list[str], []),
|
||||
None,
|
||||
[".json"],
|
||||
"application/json",
|
||||
cast(list[str], []),
|
||||
)
|
||||
MD = ("md", "md", ["markdown"], "md", [".md"], "text/markdown", ["text/x-markdown"])
|
||||
MSG = (
|
||||
"msg",
|
||||
"msg",
|
||||
["oxmsg"],
|
||||
"msg",
|
||||
[".msg"],
|
||||
"application/vnd.ms-outlook",
|
||||
["application/x-ole-storage"],
|
||||
)
|
||||
ODT = (
|
||||
"odt",
|
||||
"odt",
|
||||
["docx", "pypandoc"],
|
||||
"odt",
|
||||
[".odt"],
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
cast(list[str], []),
|
||||
)
|
||||
ORG = ("org", "org", ["pypandoc"], "org", [".org"], "text/org", cast(list[str], []))
|
||||
PDF = (
|
||||
"pdf",
|
||||
"pdf",
|
||||
["pdf2image", "pdfminer", "PIL"],
|
||||
"pdf",
|
||||
[".pdf"],
|
||||
"application/pdf",
|
||||
cast(list[str], []),
|
||||
)
|
||||
PNG = (
|
||||
"png",
|
||||
"image",
|
||||
["unstructured_inference"],
|
||||
"image",
|
||||
[".png"],
|
||||
"image/png",
|
||||
cast(list[str], []),
|
||||
)
|
||||
PPT = (
|
||||
"ppt",
|
||||
"ppt",
|
||||
["pptx"],
|
||||
"ppt",
|
||||
[".ppt"],
|
||||
"application/vnd.ms-powerpoint",
|
||||
cast(list[str], []),
|
||||
)
|
||||
PPTX = (
|
||||
"pptx",
|
||||
"pptx",
|
||||
["pptx"],
|
||||
"pptx",
|
||||
[".pptx"],
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
cast(list[str], []),
|
||||
)
|
||||
RST = ("rst", [".rst"], "text/x-rst", cast(list[str], []))
|
||||
RTF = ("rtf", [".rtf"], "text/rtf", ["application/rtf"])
|
||||
TIFF = ("tiff", [".tiff"], "image/tiff", cast(list[str], []))
|
||||
TSV = ("tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
|
||||
RST = ("rst", "rst", ["pypandoc"], "rst", [".rst"], "text/x-rst", cast(list[str], []))
|
||||
RTF = ("rtf", "rtf", ["pypandoc"], "rtf", [".rtf"], "text/rtf", ["application/rtf"])
|
||||
TIFF = (
|
||||
"tiff",
|
||||
"image",
|
||||
["unstructured_inference"],
|
||||
"image",
|
||||
[".tiff"],
|
||||
"image/tiff",
|
||||
cast(list[str], []),
|
||||
)
|
||||
TSV = ("tsv", "tsv", ["pandas"], "tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
|
||||
TXT = (
|
||||
"txt",
|
||||
"text",
|
||||
cast(list[str], []),
|
||||
None,
|
||||
[
|
||||
".txt",
|
||||
".text",
|
||||
@ -119,6 +382,9 @@ class FileType(enum.Enum):
|
||||
)
|
||||
WAV = (
|
||||
"wav",
|
||||
None,
|
||||
cast(list[str], []),
|
||||
None,
|
||||
[".wav"],
|
||||
"audio/wav",
|
||||
[
|
||||
@ -129,60 +395,45 @@ class FileType(enum.Enum):
|
||||
"audio/x-wav",
|
||||
],
|
||||
)
|
||||
XLS = ("xls", [".xls"], "application/vnd.ms-excel", cast(list[str], []))
|
||||
XLS = (
|
||||
"xls",
|
||||
"xlsx",
|
||||
["pandas", "openpyxl"],
|
||||
"xlsx",
|
||||
[".xls"],
|
||||
"application/vnd.ms-excel",
|
||||
cast(list[str], []),
|
||||
)
|
||||
XLSX = (
|
||||
"xlsx",
|
||||
"xlsx",
|
||||
["pandas", "openpyxl"],
|
||||
"xlsx",
|
||||
[".xlsx"],
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
cast(list[str], []),
|
||||
)
|
||||
XML = ("xml", [".xml"], "application/xml", ["text/xml"])
|
||||
ZIP = ("zip", [".zip"], "application/zip", cast(list[str], []))
|
||||
XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"])
|
||||
ZIP = ("zip", None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], []))
|
||||
|
||||
UNK = ("unk", cast(list[str], []), "application/octet-stream", cast(list[str], []))
|
||||
EMPTY = ("empty", cast(list[str], []), "inode/x-empty", cast(list[str], []))
|
||||
|
||||
@classmethod
|
||||
def from_extension(cls, extension: str | None) -> FileType | None:
|
||||
"""Select a FileType member based on an extension.
|
||||
|
||||
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
|
||||
secondary file-type identification method but is unreliable for primary identification..
|
||||
|
||||
Returns `None` when `extension` is not registered for any supported file-type.
|
||||
"""
|
||||
if extension in (None, "", "."):
|
||||
return None
|
||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||
# -- limitations on defining a class variable on an Enum.
|
||||
for m in cls.__members__.values():
|
||||
if extension in m._extensions:
|
||||
return m
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def from_mime_type(cls, mime_type: str) -> FileType | None:
|
||||
"""Select a FileType member based on a MIME-type.
|
||||
|
||||
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
|
||||
secondary file-type identification method but is unreliable for primary identification..
|
||||
"""
|
||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||
# -- limitations on defining a class variable on an Enum.
|
||||
for m in cls.__members__.values():
|
||||
if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types:
|
||||
return m
|
||||
return None
|
||||
|
||||
@property
def mime_type(self) -> str:
    """Canonical MIME-type of this file-type, suitable for use in metadata.

    Elements partitioned from a file of this type carry this value in `.metadata.filetype`.
    In general it is the "official", "recommended", or "de facto standard" MIME-type for files
    of this type, in that order of preference, as available.
    """
    return self._canonical_mime_type
|
||||
UNK = (
|
||||
"unk",
|
||||
None,
|
||||
cast(list[str], []),
|
||||
None,
|
||||
cast(list[str], []),
|
||||
"application/octet-stream",
|
||||
cast(list[str], []),
|
||||
)
|
||||
EMPTY = (
|
||||
"empty",
|
||||
None,
|
||||
cast(list[str], []),
|
||||
None,
|
||||
cast(list[str], []),
|
||||
"inode/x-empty",
|
||||
cast(list[str], []),
|
||||
)
|
||||
|
||||
|
||||
PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()
|
||||
|
@ -2,131 +2,28 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import io
|
||||
from typing import IO, Any, Callable, Literal, Optional
|
||||
|
||||
import requests
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from unstructured.documents.elements import DataSourceMetadata, Element
|
||||
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.partition.lang import check_language_args
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
from unstructured.partition.xml import partition_xml
|
||||
from unstructured.utils import dependency_exists
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP: dict[str, Callable[..., list[Element]]] = {}
|
||||
|
||||
if dependency_exists("pandas"):
|
||||
from unstructured.partition.csv import partition_csv
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv
|
||||
PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv
|
||||
|
||||
|
||||
if dependency_exists("docx"):
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc
|
||||
PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx
|
||||
|
||||
|
||||
if dependency_exists("docx") and dependency_exists("pypandoc"):
|
||||
from unstructured.partition.odt import partition_odt
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt
|
||||
|
||||
|
||||
if dependency_exists("pypandoc"):
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub
|
||||
|
||||
|
||||
if dependency_exists("pypandoc"):
|
||||
from unstructured.partition.org import partition_org
|
||||
from unstructured.partition.rst import partition_rst
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["org"] = partition_org
|
||||
PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst
|
||||
PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf
|
||||
|
||||
|
||||
if dependency_exists("markdown"):
|
||||
from unstructured.partition.md import partition_md
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["md"] = partition_md
|
||||
|
||||
|
||||
if dependency_exists("oxmsg"):
|
||||
from unstructured.partition.msg import partition_msg
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg
|
||||
|
||||
|
||||
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
|
||||
if all(dependency_exists(dep) for dep in pdf_imports):
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf
|
||||
|
||||
|
||||
if dependency_exists("unstructured_inference"):
|
||||
from unstructured.partition.image import partition_image
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["image"] = partition_image
|
||||
|
||||
|
||||
if dependency_exists("pptx"):
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt
|
||||
PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx
|
||||
|
||||
|
||||
if dependency_exists("pandas") and dependency_exists("openpyxl"):
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
|
||||
PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx
|
||||
|
||||
|
||||
# -- file-types `partition()` dispatches to the "image" partitioner --
IMAGE_FILETYPES = [FileType.HEIC, FileType.PNG, FileType.JPG, FileType.TIFF, FileType.BMP]
|
||||
|
||||
|
||||
def _get_partition_with_extras(
    doc_type: str,
    partition_with_extras_map: Optional[dict[str, Callable[..., list[Element]]]] = None,
):
    """Return the partitioner registered under `doc_type`.

    The lookup is made in `partition_with_extras_map` when one is provided, otherwise in the
    module-level `PARTITION_WITH_EXTRAS_MAP`.

    Raises `ImportError` when nothing is registered under `doc_type`; the message names the
    `pip install` extra to use for that document type.
    """
    registry = partition_with_extras_map
    if registry is None:
        registry = PARTITION_WITH_EXTRAS_MAP

    partitioner = registry.get(doc_type)
    if partitioner is not None:
        return partitioner

    raise ImportError(
        f"partition_{doc_type} is not available. "
        f"Install the {doc_type} dependencies with "
        f'pip install "unstructured[{doc_type}]"',
    )
|
||||
# -- type of a partitioner function as returned by `_PartitionerLoader.get()`: a callable
# -- taking arbitrary arguments and returning a list of elements --
Partitioner: TypeAlias = Callable[..., list[Element]]
|
||||
|
||||
|
||||
def partition(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
content_type: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
file_filename: Optional[str] = None,
|
||||
@ -156,10 +53,11 @@ def partition(
|
||||
starting_page_number: int = 1,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
||||
parameters for each partitioning function. Use the document-type specific partitioning
|
||||
functions if you need access to additional kwarg options.
|
||||
"""Partitions a document into its constituent elements.
|
||||
|
||||
Uses libmagic to determine the file's type and route it to the appropriate partitioning
|
||||
function. Applies the default parameters for each partitioning function. Use the document-type
|
||||
specific partitioning functions if you need access to additional kwarg options.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@ -272,7 +170,7 @@ def partition(
|
||||
languages = check_language_args(languages or [], ocr_languages)
|
||||
|
||||
if url is not None:
|
||||
file, filetype = file_and_type_from_url(
|
||||
file, file_type = file_and_type_from_url(
|
||||
url=url,
|
||||
content_type=content_type,
|
||||
headers=headers,
|
||||
@ -285,7 +183,7 @@ def partition(
|
||||
"The headers kwarg is set but the url kwarg is not. "
|
||||
"The headers kwarg will be ignored.",
|
||||
)
|
||||
filetype = detect_filetype(
|
||||
file_type = detect_filetype(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file_filename=metadata_filename,
|
||||
@ -297,14 +195,16 @@ def partition(
|
||||
file.seek(0)
|
||||
|
||||
infer_table_structure = decide_table_extraction(
|
||||
filetype,
|
||||
file_type,
|
||||
skip_infer_table_types,
|
||||
pdf_infer_table_structure,
|
||||
)
|
||||
|
||||
if filetype == FileType.CSV:
|
||||
_partition_csv = _get_partition_with_extras("csv")
|
||||
elements = _partition_csv(
|
||||
partitioner_loader = _PartitionerLoader()
|
||||
|
||||
if file_type == FileType.CSV:
|
||||
partition_csv = partitioner_loader.get(file_type)
|
||||
elements = partition_csv(
|
||||
filename=filename,
|
||||
file=file,
|
||||
infer_table_structure=infer_table_structure,
|
||||
@ -312,9 +212,9 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.DOC:
|
||||
_partition_doc = _get_partition_with_extras("doc")
|
||||
elements = _partition_doc(
|
||||
elif file_type == FileType.DOC:
|
||||
partition_doc = partitioner_loader.get(file_type)
|
||||
elements = partition_doc(
|
||||
filename=filename,
|
||||
file=file,
|
||||
infer_table_structure=infer_table_structure,
|
||||
@ -324,9 +224,9 @@ def partition(
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.DOCX:
|
||||
_partition_docx = _get_partition_with_extras("docx")
|
||||
elements = _partition_docx(
|
||||
elif file_type == FileType.DOCX:
|
||||
partition_docx = partitioner_loader.get(file_type)
|
||||
elements = partition_docx(
|
||||
filename=filename,
|
||||
file=file,
|
||||
infer_table_structure=infer_table_structure,
|
||||
@ -336,7 +236,8 @@ def partition(
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.EML:
|
||||
elif file_type == FileType.EML:
|
||||
partition_email = partitioner_loader.get(file_type)
|
||||
elements = partition_email(
|
||||
filename=filename,
|
||||
file=file,
|
||||
@ -345,9 +246,9 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.EPUB:
|
||||
_partition_epub = _get_partition_with_extras("epub")
|
||||
elements = _partition_epub(
|
||||
elif file_type == FileType.EPUB:
|
||||
partition_epub = partitioner_loader.get(file_type)
|
||||
elements = partition_epub(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -356,7 +257,8 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.HTML:
|
||||
elif file_type == FileType.HTML:
|
||||
partition_html = partitioner_loader.get(file_type)
|
||||
elements = partition_html(
|
||||
filename=filename,
|
||||
file=file,
|
||||
@ -366,9 +268,9 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype in IMAGE_FILETYPES:
|
||||
_partition_image = _get_partition_with_extras("image")
|
||||
elements = _partition_image(
|
||||
elif file_type.partitioner_shortname == "image":
|
||||
partition_image = partitioner_loader.get(file_type)
|
||||
elements = partition_image(
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=None,
|
||||
@ -384,16 +286,17 @@ def partition(
|
||||
starting_page_number=starting_page_number,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.JSON:
|
||||
elif file_type == FileType.JSON:
|
||||
if not is_json_processable(filename=filename, file=file):
|
||||
raise ValueError(
|
||||
"Detected a JSON file that does not conform to the Unstructured schema. "
|
||||
"partition_json currently only processes serialized Unstructured output.",
|
||||
)
|
||||
partition_json = partitioner_loader.get(file_type)
|
||||
elements = partition_json(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.MD:
|
||||
_partition_md = _get_partition_with_extras("md")
|
||||
elements = _partition_md(
|
||||
elif file_type == FileType.MD:
|
||||
partition_md = partitioner_loader.get(file_type)
|
||||
elements = partition_md(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -402,18 +305,18 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.MSG:
|
||||
_partition_msg = _get_partition_with_extras("msg")
|
||||
elements = _partition_msg(
|
||||
elif file_type == FileType.MSG:
|
||||
partition_msg = partitioner_loader.get(file_type)
|
||||
elements = partition_msg(
|
||||
filename=filename,
|
||||
file=file,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.ODT:
|
||||
_partition_odt = _get_partition_with_extras("odt")
|
||||
elements = _partition_odt(
|
||||
elif file_type == FileType.ODT:
|
||||
partition_odt = partitioner_loader.get(file_type)
|
||||
elements = partition_odt(
|
||||
filename=filename,
|
||||
file=file,
|
||||
infer_table_structure=infer_table_structure,
|
||||
@ -423,9 +326,9 @@ def partition(
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.ORG:
|
||||
_partition_org = _get_partition_with_extras("org")
|
||||
elements = _partition_org(
|
||||
elif file_type == FileType.ORG:
|
||||
partition_org = partitioner_loader.get(file_type)
|
||||
elements = partition_org(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -433,9 +336,9 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.PDF:
|
||||
_partition_pdf = _get_partition_with_extras("pdf")
|
||||
elements = _partition_pdf(
|
||||
elif file_type == FileType.PDF:
|
||||
partition_pdf = partitioner_loader.get(file_type)
|
||||
elements = partition_pdf(
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=None,
|
||||
@ -451,9 +354,9 @@ def partition(
|
||||
starting_page_number=starting_page_number,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.PPT:
|
||||
_partition_ppt = _get_partition_with_extras("ppt")
|
||||
elements = _partition_ppt(
|
||||
elif file_type == FileType.PPT:
|
||||
partition_ppt = partitioner_loader.get(file_type)
|
||||
elements = partition_ppt(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -463,9 +366,9 @@ def partition(
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.PPTX:
|
||||
_partition_pptx = _get_partition_with_extras("pptx")
|
||||
elements = _partition_pptx(
|
||||
elif file_type == FileType.PPTX:
|
||||
partition_pptx = partitioner_loader.get(file_type)
|
||||
elements = partition_pptx(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -476,9 +379,9 @@ def partition(
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.RST:
|
||||
_partition_rst = _get_partition_with_extras("rst")
|
||||
elements = _partition_rst(
|
||||
elif file_type == FileType.RST:
|
||||
partition_rst = partitioner_loader.get(file_type)
|
||||
elements = partition_rst(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -487,9 +390,9 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.RTF:
|
||||
_partition_rtf = _get_partition_with_extras("rtf")
|
||||
elements = _partition_rtf(
|
||||
elif file_type == FileType.RTF:
|
||||
partition_rtf = partitioner_loader.get(file_type)
|
||||
elements = partition_rtf(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
@ -498,16 +401,17 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.TSV:
|
||||
_partition_tsv = _get_partition_with_extras("tsv")
|
||||
elements = _partition_tsv(
|
||||
elif file_type == FileType.TSV:
|
||||
partition_tsv = partitioner_loader.get(file_type)
|
||||
elements = partition_tsv(
|
||||
filename=filename,
|
||||
file=file,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.TXT:
|
||||
elif file_type == FileType.TXT:
|
||||
partition_text = partitioner_loader.get(file_type)
|
||||
elements = partition_text(
|
||||
filename=filename,
|
||||
file=file,
|
||||
@ -517,9 +421,9 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype in (FileType.XLS, FileType.XLSX):
|
||||
_partition_xlsx = _get_partition_with_extras("xlsx")
|
||||
elements = _partition_xlsx(
|
||||
elif file_type in (FileType.XLS, FileType.XLSX):
|
||||
partition_xlsx = partitioner_loader.get(file_type)
|
||||
elements = partition_xlsx(
|
||||
filename=filename,
|
||||
file=file,
|
||||
infer_table_structure=infer_table_structure,
|
||||
@ -528,7 +432,8 @@ def partition(
|
||||
starting_page_number=starting_page_number,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.XML:
|
||||
elif file_type == FileType.XML:
|
||||
partition_xml = partitioner_loader.get(file_type)
|
||||
elements = partition_xml(
|
||||
filename=filename,
|
||||
file=file,
|
||||
@ -538,11 +443,11 @@ def partition(
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.EMPTY:
|
||||
elif file_type == FileType.EMPTY:
|
||||
elements = []
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||
raise ValueError(f"{msg}. The {file_type} file type is not supported in partition.")
|
||||
|
||||
for element in elements:
|
||||
element.metadata.url = url
|
||||
@ -551,7 +456,7 @@ def partition(
|
||||
out_filetype = FileType.from_mime_type(content_type)
|
||||
element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
|
||||
else:
|
||||
element.metadata.filetype = filetype.mime_type
|
||||
element.metadata.filetype = file_type.mime_type
|
||||
|
||||
return elements
|
||||
|
||||
@ -562,7 +467,7 @@ def file_and_type_from_url(
|
||||
headers: dict[str, str] = {},
|
||||
ssl_verify: bool = True,
|
||||
request_timeout: Optional[int] = None,
|
||||
) -> tuple[io.BytesIO, Optional[FileType]]:
|
||||
) -> tuple[io.BytesIO, FileType]:
|
||||
response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
|
||||
file = io.BytesIO(response.content)
|
||||
|
||||
@ -590,3 +495,51 @@ def decide_table_extraction(
|
||||
return pdf_infer_table_structure or doc_type not in skip_infer_table_types
|
||||
|
||||
return doc_type not in skip_infer_table_types
|
||||
|
||||
|
||||
class _PartitionerLoader:
    """Loads the partitioner for a file-type after checking its dependencies are installed.

    `partition()` uses this to cope with the possibility that the Python environment it is
    running in does not have every package a particular partitioner requires.

    `.get()` provides access to partitioners by file-type and raises `ImportError` when one or
    more packages that partitioner depends on are not installed. The message states which extra
    to `pip install` to enable the partitioner, which avoids an inconsistent variety of possibly
    puzzling exceptions arising from much deeper in the partitioner when the missing package is
    first accessed.
    """

    # -- partitioners loaded so far, keyed by file-type. This is a class attribute, so the
    # -- cache is shared by every instance and outlives any one of them --
    _partitioners: dict[FileType, Partitioner] = {}

    def get(self, file_type: FileType) -> Partitioner:
        """Return the partitioner for `file_type`, loading it on first request.

        Raises `ImportError` when one or more package dependencies for that file-type have not
        been installed.
        """
        loaded = self._partitioners
        if file_type in loaded:
            return loaded[file_type]

        partitioner = self._load_partitioner(file_type)
        loaded[file_type] = partitioner
        return partitioner

    def _load_partitioner(self, file_type: FileType) -> Partitioner:
        """Import and return the partitioner for `file_type` once its dependencies check out."""
        # -- every package dependency must be installed; checking stops at the first missing --
        dependencies_are_installed = all(
            dependency_exists(pkg_name) for pkg_name in file_type.importable_package_dependencies
        )
        if not dependencies_are_installed:
            raise ImportError(
                f"{file_type.partitioner_function_name}() is not available because one or"
                " more dependencies are not installed. Use:"
                f' pip install "unstructured[{file_type.extra_name}]" (including quotes)'
                " to install the required dependencies",
            )

        # -- asking for the partitioner of a non-partitionable file-type is a programming error --
        assert file_type.is_partitionable

        module = importlib.import_module(file_type.partitioner_module_qname)
        return getattr(module, file_type.partitioner_function_name)
|
||||
|
Loading…
x
Reference in New Issue
Block a user