mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-26 08:53:15 +00:00
rfctr(auto): add _PartitionerLoader (#3418)
**Summary** Replace conditional explicit import of partitioner modules in `.partition.auto` with the new `_PartitionerLoader` class. This avoids unbound variable warnings and is much less noisy. `_PartitionerLoader` makes use of the new `FileType` property `.importable_package_dependencies` to determine whether all required packages are importable before dispatching the file to its partitioner. It uses `FileType.extra_name` to form a helpful error message when a dependency is not installed, so the caller knows which `pip install` extra to specify to remedy the error. `_PartitionerLoader` uses the `FileType` properties `.partitioner_module_qname` and `.partitioner_function_name` to load the partitioner once its dependencies are verified. Loaded partitioners are cached with module lifetime scope for efficiency.
This commit is contained in:
parent
ec59abfabc
commit
49c4bd34be
@ -1,3 +1,11 @@
|
|||||||
|
## 0.15.1-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.15.0
|
## 0.15.0
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -50,6 +50,68 @@ class DescribeFileType:
|
|||||||
def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
|
def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
|
||||||
assert FileType.from_mime_type(mime_type) is None
|
assert FileType.from_mime_type(mime_type) is None
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_type", "expected_value"),
|
||||||
|
[
|
||||||
|
(FileType.BMP, ("unstructured_inference",)),
|
||||||
|
(FileType.CSV, ("pandas",)),
|
||||||
|
(FileType.DOC, ("docx",)),
|
||||||
|
(FileType.EMPTY, ()),
|
||||||
|
(FileType.HTML, ()),
|
||||||
|
(FileType.ODT, ("docx", "pypandoc")),
|
||||||
|
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
|
||||||
|
(FileType.UNK, ()),
|
||||||
|
(FileType.WAV, ()),
|
||||||
|
(FileType.ZIP, ()),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_knows_which_importable_packages_its_partitioner_depends_on(
|
||||||
|
self, file_type: FileType, expected_value: tuple[str, ...]
|
||||||
|
):
|
||||||
|
assert file_type.importable_package_dependencies == expected_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_type", "expected_value"),
|
||||||
|
[
|
||||||
|
(FileType.BMP, "image"),
|
||||||
|
(FileType.DOC, "doc"),
|
||||||
|
(FileType.DOCX, "docx"),
|
||||||
|
(FileType.EML, None),
|
||||||
|
(FileType.EMPTY, None),
|
||||||
|
(FileType.MSG, "msg"),
|
||||||
|
(FileType.PDF, "pdf"),
|
||||||
|
(FileType.XLS, "xlsx"),
|
||||||
|
(FileType.UNK, None),
|
||||||
|
(FileType.WAV, None),
|
||||||
|
(FileType.ZIP, None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies(
|
||||||
|
self, file_type: FileType, expected_value: str | None
|
||||||
|
):
|
||||||
|
assert file_type.extra_name == expected_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_type", "expected_value"),
|
||||||
|
[
|
||||||
|
(FileType.BMP, True),
|
||||||
|
(FileType.CSV, True),
|
||||||
|
(FileType.DOC, True),
|
||||||
|
(FileType.EML, True),
|
||||||
|
(FileType.JPG, True),
|
||||||
|
(FileType.PDF, True),
|
||||||
|
(FileType.PPTX, True),
|
||||||
|
(FileType.WAV, False),
|
||||||
|
(FileType.ZIP, False),
|
||||||
|
(FileType.EMPTY, False),
|
||||||
|
(FileType.UNK, False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_knows_whether_files_of_its_type_are_directly_partitionable(
|
||||||
|
self, file_type: FileType, expected_value: str
|
||||||
|
):
|
||||||
|
assert file_type.is_partitionable is expected_value
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("file_type", "mime_type"),
|
("file_type", "mime_type"),
|
||||||
[
|
[
|
||||||
@ -68,3 +130,72 @@ class DescribeFileType:
|
|||||||
)
|
)
|
||||||
def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
|
def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
|
||||||
assert file_type.mime_type == mime_type
|
assert file_type.mime_type == mime_type
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_type", "expected_value"),
|
||||||
|
[
|
||||||
|
(FileType.BMP, "partition_image"),
|
||||||
|
(FileType.CSV, "partition_csv"),
|
||||||
|
(FileType.DOC, "partition_doc"),
|
||||||
|
(FileType.DOCX, "partition_docx"),
|
||||||
|
(FileType.JPG, "partition_image"),
|
||||||
|
(FileType.PNG, "partition_image"),
|
||||||
|
(FileType.TIFF, "partition_image"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
|
||||||
|
assert file_type.partitioner_function_name == expected_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
|
||||||
|
)
|
||||||
|
def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
|
||||||
|
self, file_type: FileType
|
||||||
|
):
|
||||||
|
with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
|
||||||
|
file_type.partitioner_function_name
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_type", "expected_value"),
|
||||||
|
[
|
||||||
|
(FileType.BMP, "unstructured.partition.image"),
|
||||||
|
(FileType.CSV, "unstructured.partition.csv"),
|
||||||
|
(FileType.DOC, "unstructured.partition.doc"),
|
||||||
|
(FileType.DOCX, "unstructured.partition.docx"),
|
||||||
|
(FileType.JPG, "unstructured.partition.image"),
|
||||||
|
(FileType.PNG, "unstructured.partition.image"),
|
||||||
|
(FileType.TIFF, "unstructured.partition.image"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_knows_the_fully_qualified_name_of_its_partitioner_module(
|
||||||
|
self, file_type: FileType, expected_value: str
|
||||||
|
):
|
||||||
|
assert file_type.partitioner_module_qname == expected_value
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
|
||||||
|
)
|
||||||
|
def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
|
||||||
|
self, file_type: FileType
|
||||||
|
):
|
||||||
|
with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
|
||||||
|
file_type.partitioner_module_qname
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("file_type", "expected_value"),
|
||||||
|
[
|
||||||
|
(FileType.BMP, "image"),
|
||||||
|
(FileType.CSV, "csv"),
|
||||||
|
(FileType.DOC, "doc"),
|
||||||
|
(FileType.DOCX, "docx"),
|
||||||
|
(FileType.JPG, "image"),
|
||||||
|
(FileType.PNG, "image"),
|
||||||
|
(FileType.TIFF, "image"),
|
||||||
|
(FileType.XLS, "xlsx"),
|
||||||
|
(FileType.XLSX, "xlsx"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_provides_access_to_the_partitioner_shortname(
|
||||||
|
self, file_type: FileType, expected_value: str
|
||||||
|
):
|
||||||
|
assert file_type.partitioner_shortname == expected_value
|
||||||
|
@ -10,7 +10,7 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
from typing import Callable, Iterator, cast
|
from typing import Iterator, cast
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -27,7 +27,6 @@ from test_unstructured.unit_utils import (
|
|||||||
ANY,
|
ANY,
|
||||||
FixtureRequest,
|
FixtureRequest,
|
||||||
LogCaptureFixture,
|
LogCaptureFixture,
|
||||||
MonkeyPatch,
|
|
||||||
example_doc_path,
|
example_doc_path,
|
||||||
function_mock,
|
function_mock,
|
||||||
method_mock,
|
method_mock,
|
||||||
@ -46,8 +45,7 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition import auto
|
from unstructured.partition.auto import _PartitionerLoader, partition
|
||||||
from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition
|
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
from unstructured.partition.utils.constants import PartitionStrategy
|
||||||
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
||||||
|
|
||||||
@ -570,16 +568,21 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
|
|||||||
assert e.text.startswith("Zejiang Shen")
|
assert e.text.startswith("Zejiang Shen")
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
|
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
|
||||||
|
partition_pdf_ = function_mock(
|
||||||
|
request,
|
||||||
|
"unstructured.partition.pdf.partition_pdf",
|
||||||
|
return_value=[NarrativeText("Hello there!")],
|
||||||
|
)
|
||||||
|
partitioner_loader_get_ = method_mock(
|
||||||
|
request, _PartitionerLoader, "get", return_value=partition_pdf_
|
||||||
|
)
|
||||||
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||||
|
|
||||||
mock_return = [NarrativeText("Hello there!")]
|
partition(file_path, strategy=PartitionStrategy.FAST)
|
||||||
with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
|
|
||||||
mock_partition_with_extras_map = {"pdf": mock_partition}
|
|
||||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
|
|
||||||
partition(filename=file_path, strategy=PartitionStrategy.FAST)
|
|
||||||
|
|
||||||
mock_partition.assert_called_once_with(
|
partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
|
||||||
|
partition_pdf_.assert_called_once_with(
|
||||||
filename=file_path,
|
filename=file_path,
|
||||||
file=None,
|
file=None,
|
||||||
url=None,
|
url=None,
|
||||||
@ -919,10 +922,10 @@ def test_auto_partition_xml_from_file_with_tags():
|
|||||||
|
|
||||||
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
|
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
|
||||||
detect_filetype_ = function_mock(
|
detect_filetype_ = function_mock(
|
||||||
request, "unstructured.partition.auto.detect_filetype", return_value=None
|
request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
|
||||||
)
|
)
|
||||||
|
|
||||||
with pytest.raises(ValueError, match="Invalid file made-up.fake. The None file type is not "):
|
with pytest.raises(ValueError, match="Invalid file made-up.fake. The FileType.UNK file type "):
|
||||||
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
|
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
|
||||||
|
|
||||||
detect_filetype_.assert_called_once_with(
|
detect_filetype_.assert_called_once_with(
|
||||||
@ -1026,23 +1029,7 @@ def test_auto_partition_respects_detect_language_per_element_arg():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"file_extension",
|
"file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split()
|
||||||
[
|
|
||||||
"doc",
|
|
||||||
"docx",
|
|
||||||
"eml",
|
|
||||||
"epub",
|
|
||||||
"html",
|
|
||||||
"md",
|
|
||||||
"odt",
|
|
||||||
"org",
|
|
||||||
"ppt",
|
|
||||||
"pptx",
|
|
||||||
"rst",
|
|
||||||
"rtf",
|
|
||||||
"txt",
|
|
||||||
"xml",
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
def test_auto_partition_respects_language_arg(file_extension: str):
|
def test_auto_partition_respects_language_arg(file_extension: str):
|
||||||
elements = partition(
|
elements = partition(
|
||||||
@ -1167,7 +1154,7 @@ def test_auto_partition_respects_skip_infer_table_types(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("content_type", "filetype_shortname", "expected_value"),
|
("content_type", "shortname", "expected_value"),
|
||||||
[
|
[
|
||||||
("text/csv", "csv", "text/csv"),
|
("text/csv", "csv", "text/csv"),
|
||||||
("text/html", "html", "text/html"),
|
("text/html", "html", "text/html"),
|
||||||
@ -1177,22 +1164,23 @@ def test_auto_partition_respects_skip_infer_table_types(
|
|||||||
def test_auto_partition_adds_filetype_to_metadata(
|
def test_auto_partition_adds_filetype_to_metadata(
|
||||||
request: FixtureRequest,
|
request: FixtureRequest,
|
||||||
content_type: str,
|
content_type: str,
|
||||||
filetype_shortname: str,
|
shortname: str,
|
||||||
expected_value: str | None,
|
expected_value: str | None,
|
||||||
monkeypatch: MonkeyPatch,
|
|
||||||
):
|
):
|
||||||
partition_fn_ = function_mock(
|
partition_fn_ = function_mock(
|
||||||
request,
|
request,
|
||||||
f"unstructured.partition.auto.partition_{filetype_shortname}",
|
f"unstructured.partition.{shortname}.partition_{shortname}",
|
||||||
return_value=[Text("text 1"), Text("text 2")],
|
return_value=[Text("text 1"), Text("text 2")],
|
||||||
)
|
)
|
||||||
mock_partition_with_extras_map = {filetype_shortname: partition_fn_}
|
partitioner_loader_get_ = method_mock(
|
||||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
|
request, _PartitionerLoader, "get", return_value=partition_fn_
|
||||||
|
)
|
||||||
|
|
||||||
elements = partition(
|
elements = partition(
|
||||||
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
|
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
|
||||||
)
|
)
|
||||||
|
|
||||||
|
partitioner_loader_get_.assert_called_once()
|
||||||
assert len(elements) == 2
|
assert len(elements) == 2
|
||||||
assert all(e.metadata.filetype == expected_value for e in elements)
|
assert all(e.metadata.filetype == expected_value for e in elements)
|
||||||
|
|
||||||
@ -1207,20 +1195,23 @@ def test_auto_partition_adds_filetype_to_metadata(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
|
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
|
||||||
request: FixtureRequest, content_type: str | None, monkeypatch: MonkeyPatch
|
request: FixtureRequest, content_type: str | None
|
||||||
):
|
):
|
||||||
metadata = ElementMetadata(filetype="imapdf")
|
metadata = ElementMetadata(filetype="imapdf")
|
||||||
partition_pdf_ = function_mock(
|
partition_pdf_ = function_mock(
|
||||||
request,
|
request,
|
||||||
"unstructured.partition.auto.partition_pdf",
|
"unstructured.partition.pdf.partition_pdf",
|
||||||
return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)],
|
return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)],
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})
|
partitioner_loader_get_ = method_mock(
|
||||||
|
request, _PartitionerLoader, "get", return_value=partition_pdf_
|
||||||
|
)
|
||||||
|
|
||||||
elements = partition(
|
elements = partition(
|
||||||
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
|
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
|
||||||
)
|
)
|
||||||
|
|
||||||
|
partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
|
||||||
assert len(elements) == 2
|
assert len(elements) == 2
|
||||||
assert all(e.metadata.filetype == "application/pdf" for e in elements)
|
assert all(e.metadata.filetype == "application/pdf" for e in elements)
|
||||||
|
|
||||||
@ -1231,7 +1222,7 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
|
|||||||
t
|
t
|
||||||
for t in FileType
|
for t in FileType
|
||||||
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
|
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
|
||||||
and t not in IMAGE_FILETYPES
|
and t.partitioner_function_name != "partition_image"
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
|
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
|
||||||
@ -1305,10 +1296,18 @@ def test_auto_partition_from_file_works_on_empty_file():
|
|||||||
assert partition(file=f) == []
|
assert partition(file=f) == []
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_requiring_extras_prompts_to_install_missing_dependencies():
|
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
|
||||||
partition_with_extras_map: dict[str, Callable[..., list[Element]]] = {}
|
request: FixtureRequest,
|
||||||
with pytest.raises(ImportError, match="partition_pdf is not available. Install the pdf depen"):
|
):
|
||||||
_get_partition_with_extras("pdf", partition_with_extras_map)
|
_PartitionerLoader._partitioners.pop(FileType.PDF, None)
|
||||||
|
dependency_exists_ = function_mock(
|
||||||
|
request, "unstructured.partition.auto.dependency_exists", return_value=False
|
||||||
|
)
|
||||||
|
match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"
|
||||||
|
with pytest.raises(ImportError, match=match):
|
||||||
|
partition(example_doc_path("layout-parser-paper-fast.pdf"))
|
||||||
|
|
||||||
|
dependency_exists_.assert_called_once_with("pdf2image")
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.15.0" # pragma: no cover
|
__version__ = "0.15.1-dev0" # pragma: no cover
|
||||||
|
@ -32,7 +32,7 @@ def detect_filetype(
|
|||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
file_filename: Optional[str] = None,
|
file_filename: Optional[str] = None,
|
||||||
encoding: Optional[str] = "utf-8",
|
encoding: Optional[str] = "utf-8",
|
||||||
) -> Optional[FileType]:
|
) -> FileType:
|
||||||
"""Use libmagic to determine a file's type.
|
"""Use libmagic to determine a file's type.
|
||||||
|
|
||||||
Helps determine which partition brick to use for a given file. A return value of None indicates
|
Helps determine which partition brick to use for a given file. A return value of None indicates
|
||||||
@ -122,7 +122,7 @@ def detect_filetype(
|
|||||||
".tsv",
|
".tsv",
|
||||||
".json",
|
".json",
|
||||||
]:
|
]:
|
||||||
return FileType.from_extension(extension)
|
return FileType.from_extension(extension) or FileType.TXT
|
||||||
|
|
||||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||||
|
@ -12,7 +12,17 @@ class FileType(enum.Enum):
|
|||||||
Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner.
|
Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
_partitioner_shortname: str | None
|
||||||
|
"""Like "docx", from which partitioner module and function-name can be derived via template."""
|
||||||
|
|
||||||
|
_importable_package_dependencies: tuple[str, ...]
|
||||||
|
"""Packages that must be available for import for this file-type's partitioner to work."""
|
||||||
|
|
||||||
|
_extra_name: str | None
|
||||||
|
"""`pip install` extra that provides package dependencies for this file-type."""
|
||||||
|
|
||||||
_extensions: tuple[str, ...]
|
_extensions: tuple[str, ...]
|
||||||
|
"""Filename-extensions recognized as this file-type. Use for secondary identification only."""
|
||||||
|
|
||||||
_canonical_mime_type: str
|
_canonical_mime_type: str
|
||||||
"""The MIME-type used as `.metadata.filetype` for this file-type."""
|
"""The MIME-type used as `.metadata.filetype` for this file-type."""
|
||||||
@ -23,12 +33,18 @@ class FileType(enum.Enum):
|
|||||||
def __new__(
|
def __new__(
|
||||||
cls,
|
cls,
|
||||||
value: str,
|
value: str,
|
||||||
|
partitioner_shortname: str | None,
|
||||||
|
importable_package_dependencies: Iterable[str],
|
||||||
|
extra_name: str | None,
|
||||||
extensions: Iterable[str],
|
extensions: Iterable[str],
|
||||||
canonical_mime_type: str,
|
canonical_mime_type: str,
|
||||||
alias_mime_types: Iterable[str],
|
alias_mime_types: Iterable[str],
|
||||||
):
|
):
|
||||||
self = object.__new__(cls)
|
self = object.__new__(cls)
|
||||||
self._value_ = value
|
self._value_ = value
|
||||||
|
self._partitioner_shortname = partitioner_shortname
|
||||||
|
self._importable_package_dependencies = tuple(importable_package_dependencies)
|
||||||
|
self._extra_name = extra_name
|
||||||
self._extensions = tuple(extensions)
|
self._extensions = tuple(extensions)
|
||||||
self._canonical_mime_type = canonical_mime_type
|
self._canonical_mime_type = canonical_mime_type
|
||||||
self._alias_mime_types = tuple(alias_mime_types)
|
self._alias_mime_types = tuple(alias_mime_types)
|
||||||
@ -41,8 +57,150 @@ class FileType(enum.Enum):
|
|||||||
"""
|
"""
|
||||||
return self.name < other.name
|
return self.name < other.name
|
||||||
|
|
||||||
BMP = ("bmp", [".bmp"], "image/bmp", cast(list[str], []))
|
@classmethod
|
||||||
|
def from_extension(cls, extension: str | None) -> FileType | None:
|
||||||
|
"""Select a FileType member based on an extension.
|
||||||
|
|
||||||
|
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
|
||||||
|
secondary file-type identification method but is unreliable for primary identification.
|
||||||
|
|
||||||
|
Returns `None` when `extension` is not registered for any supported file-type.
|
||||||
|
"""
|
||||||
|
if extension in (None, "", "."):
|
||||||
|
return None
|
||||||
|
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||||
|
# -- limitations on defining a class variable on an Enum.
|
||||||
|
for m in cls.__members__.values():
|
||||||
|
if extension in m._extensions:
|
||||||
|
return m
|
||||||
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_mime_type(cls, mime_type: str) -> FileType | None:
|
||||||
|
"""Select a FileType member based on a MIME-type.
|
||||||
|
|
||||||
|
Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
|
||||||
|
`FileType` member or one of its alias MIME-types.
|
||||||
|
"""
|
||||||
|
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||||
|
# -- limitations on defining a class variable on an Enum.
|
||||||
|
for m in cls.__members__.values():
|
||||||
|
if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types:
|
||||||
|
return m
|
||||||
|
return None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def extra_name(self) -> str | None:
|
||||||
|
"""The `pip` "extra" that must be installed to provide this file-type's dependencies.
|
||||||
|
|
||||||
|
Like "image" for PNG, as in `pip install "unstructured[image]"`.
|
||||||
|
|
||||||
|
`None` when partitioning this file-type requires only the base `unstructured` install.
|
||||||
|
"""
|
||||||
|
return self._extra_name
|
||||||
|
|
||||||
|
@property
|
||||||
|
def importable_package_dependencies(self) -> tuple[str, ...]:
|
||||||
|
"""Packages that must be importable for this file-type's partitioner to work.
|
||||||
|
|
||||||
|
In general, these are the packages provided by the `pip install` "extra" for this file-type,
|
||||||
|
like `pip install "unstructured[docx]"` loads the `python-docx` package.
|
||||||
|
|
||||||
|
Note that these names are the ones used in an `import` statement, which is not necessarily
|
||||||
|
the same as the _distribution_ package name used by `pip`. For example, the DOCX
|
||||||
|
distribution package name is `"python-docx"` whereas the _importable_ package name is
|
||||||
|
`"docx"`. This latter name as it appears like `import docx` is what is provided by this
|
||||||
|
property.
|
||||||
|
|
||||||
|
The return value is an empty tuple for file-types that do not require optional dependencies.
|
||||||
|
|
||||||
|
Note this property does not complain when accessed on a non-partitionable file-type, it
|
||||||
|
simply returns an empty tuple because file-types that are not partitionable require no
|
||||||
|
optional dependencies.
|
||||||
|
"""
|
||||||
|
return self._importable_package_dependencies
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_partitionable(self) -> bool:
|
||||||
|
"""True when there is a partitioner for this file-type.
|
||||||
|
|
||||||
|
Note this does not check whether the dependencies for this file-type are installed so
|
||||||
|
attempting to partition a file of this type may still fail. This is meant for
|
||||||
|
distinguishing file-types like WAV, ZIP, EMPTY, and UNK which are legitimate file-types
|
||||||
|
but have no associated partitioner.
|
||||||
|
"""
|
||||||
|
return bool(self._partitioner_shortname)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def mime_type(self) -> str:
|
||||||
|
"""The canonical MIME-type for this file-type, suitable for use in metadata.
|
||||||
|
|
||||||
|
This value is used in `.metadata.filetype` for elements partitioned from files of this
|
||||||
|
type. In general it is the "official", "recommended", or "defacto-standard" MIME-type for
|
||||||
|
files of this type, in that order, as available.
|
||||||
|
"""
|
||||||
|
return self._canonical_mime_type
|
||||||
|
|
||||||
|
@property
|
||||||
|
def partitioner_function_name(self) -> str:
|
||||||
|
"""Name of partitioner function for this file-type. Like "partition_docx".
|
||||||
|
|
||||||
|
Raises when this property is accessed on a file-type that is not partitionable. Use
|
||||||
|
`.is_partitionable` to avoid exceptions when partitionability is unknown.
|
||||||
|
"""
|
||||||
|
# -- Raise when this property is accessed on a FileType member that has no partitioner
|
||||||
|
# -- shortname. This prevents a harder-to-find bug from appearing far away from this call
|
||||||
|
# -- when code would try to `getattr(module, None)` or whatever.
|
||||||
|
if (shortname := self._partitioner_shortname) is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"`.partitioner_function_name` is undefined because FileType.{self.name} is not"
|
||||||
|
f" partitionable. Use `.is_partitionable` to determine whether a `FileType`"
|
||||||
|
f" is partitionable."
|
||||||
|
)
|
||||||
|
return f"partition_{shortname}"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def partitioner_module_qname(self) -> str:
|
||||||
|
"""Fully-qualified name of module providing partitioner for this file-type.
|
||||||
|
|
||||||
|
e.g. "unstructured.partition.docx" for FileType.DOCX.
|
||||||
|
"""
|
||||||
|
# -- Raise when this property is accessed on a FileType member that has no partitioner
|
||||||
|
# -- shortname. This prevents a harder-to-find bug from appearing far away from this call
|
||||||
|
# -- when code would try to `importlib.import_module(None)` or whatever.
|
||||||
|
if (shortname := self._partitioner_shortname) is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"`.partitioner_module_qname` is undefined because FileType.{self.name} is not"
|
||||||
|
f" partitionable. Use `.is_partitionable` to determine whether a `FileType`"
|
||||||
|
f" is partitionable."
|
||||||
|
)
|
||||||
|
return f"unstructured.partition.{shortname}"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def partitioner_shortname(self) -> str | None:
|
||||||
|
"""Familiar name of partitioner, like "image" for file-types that use `partition_image()`.
|
||||||
|
|
||||||
|
One use is to determine whether a file-type is one of the five image types, all of which
|
||||||
|
are processed by `partition_image()`.
|
||||||
|
|
||||||
|
`None` for file-types that are not partitionable, although `.is_partitionable` is the
|
||||||
|
preferred way of discovering that.
|
||||||
|
"""
|
||||||
|
return self._partitioner_shortname
|
||||||
|
|
||||||
|
BMP = (
|
||||||
|
"bmp", # -- value for this Enum member, like BMP = "bmp" in a simple enum --
|
||||||
|
"image", # -- partitioner_shortname --
|
||||||
|
["unstructured_inference"], # -- importable_package_dependencies --
|
||||||
|
"image", # -- extra_name - like `pip install "unstructured[image]"` in this case --
|
||||||
|
[".bmp"], # -- extensions - filename extensions that map to this file-type --
|
||||||
|
"image/bmp", # -- canonical_mime_type - MIME-type written to `.metadata.filetype` --
|
||||||
|
cast(list[str], []), # -- alias_mime_types - other MIME-types that map to this file-type --
|
||||||
|
)
|
||||||
CSV = (
|
CSV = (
|
||||||
|
"csv",
|
||||||
|
"csv",
|
||||||
|
["pandas"],
|
||||||
"csv",
|
"csv",
|
||||||
[".csv"],
|
[".csv"],
|
||||||
"text/csv",
|
"text/csv",
|
||||||
@ -54,38 +212,143 @@ class FileType(enum.Enum):
|
|||||||
"text/x-csv",
|
"text/x-csv",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
DOC = ("doc", [".doc"], "application/msword", cast(list[str], []))
|
DOC = ("doc", "doc", ["docx"], "doc", [".doc"], "application/msword", cast(list[str], []))
|
||||||
DOCX = (
|
DOCX = (
|
||||||
|
"docx",
|
||||||
|
"docx",
|
||||||
|
["docx"],
|
||||||
"docx",
|
"docx",
|
||||||
[".docx"],
|
[".docx"],
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
cast(list[str], []),
|
cast(list[str], []),
|
||||||
)
|
)
|
||||||
EML = ("eml", [".eml", ".p7s"], "message/rfc822", cast(list[str], []))
|
EML = (
|
||||||
EPUB = ("epub", [".epub"], "application/epub", ["application/epub+zip"])
|
"eml",
|
||||||
HEIC = ("heic", [".heic"], "image/heic", cast(list[str], []))
|
"email",
|
||||||
HTML = ("html", [".html", ".htm"], "text/html", cast(list[str], []))
|
cast(list[str], []),
|
||||||
JPG = ("jpg", [".jpeg", ".jpg"], "image/jpeg", cast(list[str], []))
|
None,
|
||||||
JSON = ("json", [".json"], "application/json", cast(list[str], []))
|
[".eml", ".p7s"],
|
||||||
MD = ("md", [".md"], "text/markdown", ["text/x-markdown"])
|
"message/rfc822",
|
||||||
MSG = ("msg", [".msg"], "application/vnd.ms-outlook", ["application/x-ole-storage"])
|
cast(list[str], []),
|
||||||
ODT = ("odt", [".odt"], "application/vnd.oasis.opendocument.text", cast(list[str], []))
|
)
|
||||||
ORG = ("org", [".org"], "text/org", cast(list[str], []))
|
EPUB = (
|
||||||
PDF = ("pdf", [".pdf"], "application/pdf", cast(list[str], []))
|
"epub",
|
||||||
PNG = ("png", [".png"], "image/png", cast(list[str], []))
|
"epub",
|
||||||
PPT = ("ppt", [".ppt"], "application/vnd.ms-powerpoint", cast(list[str], []))
|
["pypandoc"],
|
||||||
|
"epub",
|
||||||
|
[".epub"],
|
||||||
|
"application/epub",
|
||||||
|
["application/epub+zip"],
|
||||||
|
)
|
||||||
|
HEIC = (
|
||||||
|
"heic",
|
||||||
|
"image",
|
||||||
|
["unstructured_inference"],
|
||||||
|
"image",
|
||||||
|
[".heic"],
|
||||||
|
"image/heic",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
HTML = (
|
||||||
|
"html",
|
||||||
|
"html",
|
||||||
|
cast(list[str], []),
|
||||||
|
None,
|
||||||
|
[".html", ".htm"],
|
||||||
|
"text/html",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
JPG = (
|
||||||
|
"jpg",
|
||||||
|
"image",
|
||||||
|
["unstructured_inference"],
|
||||||
|
"image",
|
||||||
|
[".jpeg", ".jpg"],
|
||||||
|
"image/jpeg",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
JSON = (
|
||||||
|
"json",
|
||||||
|
"json",
|
||||||
|
cast(list[str], []),
|
||||||
|
None,
|
||||||
|
[".json"],
|
||||||
|
"application/json",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
MD = ("md", "md", ["markdown"], "md", [".md"], "text/markdown", ["text/x-markdown"])
|
||||||
|
MSG = (
|
||||||
|
"msg",
|
||||||
|
"msg",
|
||||||
|
["oxmsg"],
|
||||||
|
"msg",
|
||||||
|
[".msg"],
|
||||||
|
"application/vnd.ms-outlook",
|
||||||
|
["application/x-ole-storage"],
|
||||||
|
)
|
||||||
|
ODT = (
|
||||||
|
"odt",
|
||||||
|
"odt",
|
||||||
|
["docx", "pypandoc"],
|
||||||
|
"odt",
|
||||||
|
[".odt"],
|
||||||
|
"application/vnd.oasis.opendocument.text",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
ORG = ("org", "org", ["pypandoc"], "org", [".org"], "text/org", cast(list[str], []))
|
||||||
|
PDF = (
|
||||||
|
"pdf",
|
||||||
|
"pdf",
|
||||||
|
["pdf2image", "pdfminer", "PIL"],
|
||||||
|
"pdf",
|
||||||
|
[".pdf"],
|
||||||
|
"application/pdf",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
PNG = (
|
||||||
|
"png",
|
||||||
|
"image",
|
||||||
|
["unstructured_inference"],
|
||||||
|
"image",
|
||||||
|
[".png"],
|
||||||
|
"image/png",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
PPT = (
|
||||||
|
"ppt",
|
||||||
|
"ppt",
|
||||||
|
["pptx"],
|
||||||
|
"ppt",
|
||||||
|
[".ppt"],
|
||||||
|
"application/vnd.ms-powerpoint",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
PPTX = (
|
PPTX = (
|
||||||
|
"pptx",
|
||||||
|
"pptx",
|
||||||
|
["pptx"],
|
||||||
"pptx",
|
"pptx",
|
||||||
[".pptx"],
|
[".pptx"],
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
cast(list[str], []),
|
cast(list[str], []),
|
||||||
)
|
)
|
||||||
RST = ("rst", [".rst"], "text/x-rst", cast(list[str], []))
|
RST = ("rst", "rst", ["pypandoc"], "rst", [".rst"], "text/x-rst", cast(list[str], []))
|
||||||
RTF = ("rtf", [".rtf"], "text/rtf", ["application/rtf"])
|
RTF = ("rtf", "rtf", ["pypandoc"], "rtf", [".rtf"], "text/rtf", ["application/rtf"])
|
||||||
TIFF = ("tiff", [".tiff"], "image/tiff", cast(list[str], []))
|
TIFF = (
|
||||||
TSV = ("tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
|
"tiff",
|
||||||
|
"image",
|
||||||
|
["unstructured_inference"],
|
||||||
|
"image",
|
||||||
|
[".tiff"],
|
||||||
|
"image/tiff",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
|
TSV = ("tsv", "tsv", ["pandas"], "tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
|
||||||
TXT = (
|
TXT = (
|
||||||
"txt",
|
"txt",
|
||||||
|
"text",
|
||||||
|
cast(list[str], []),
|
||||||
|
None,
|
||||||
[
|
[
|
||||||
".txt",
|
".txt",
|
||||||
".text",
|
".text",
|
||||||
@ -119,6 +382,9 @@ class FileType(enum.Enum):
|
|||||||
)
|
)
|
||||||
WAV = (
|
WAV = (
|
||||||
"wav",
|
"wav",
|
||||||
|
None,
|
||||||
|
cast(list[str], []),
|
||||||
|
None,
|
||||||
[".wav"],
|
[".wav"],
|
||||||
"audio/wav",
|
"audio/wav",
|
||||||
[
|
[
|
||||||
@ -129,60 +395,45 @@ class FileType(enum.Enum):
|
|||||||
"audio/x-wav",
|
"audio/x-wav",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
XLS = ("xls", [".xls"], "application/vnd.ms-excel", cast(list[str], []))
|
XLS = (
|
||||||
|
"xls",
|
||||||
|
"xlsx",
|
||||||
|
["pandas", "openpyxl"],
|
||||||
|
"xlsx",
|
||||||
|
[".xls"],
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
cast(list[str], []),
|
||||||
|
)
|
||||||
XLSX = (
|
XLSX = (
|
||||||
|
"xlsx",
|
||||||
|
"xlsx",
|
||||||
|
["pandas", "openpyxl"],
|
||||||
"xlsx",
|
"xlsx",
|
||||||
[".xlsx"],
|
[".xlsx"],
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
cast(list[str], []),
|
cast(list[str], []),
|
||||||
)
|
)
|
||||||
XML = ("xml", [".xml"], "application/xml", ["text/xml"])
|
XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"])
|
||||||
ZIP = ("zip", [".zip"], "application/zip", cast(list[str], []))
|
ZIP = ("zip", None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], []))
|
||||||
|
|
||||||
UNK = ("unk", cast(list[str], []), "application/octet-stream", cast(list[str], []))
|
UNK = (
|
||||||
EMPTY = ("empty", cast(list[str], []), "inode/x-empty", cast(list[str], []))
|
"unk",
|
||||||
|
None,
|
||||||
@classmethod
|
cast(list[str], []),
|
||||||
def from_extension(cls, extension: str | None) -> FileType | None:
|
None,
|
||||||
"""Select a FileType member based on an extension.
|
cast(list[str], []),
|
||||||
|
"application/octet-stream",
|
||||||
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
|
cast(list[str], []),
|
||||||
secondary file-type identification method but is unreliable for primary identification..
|
)
|
||||||
|
EMPTY = (
|
||||||
Returns `None` when `extension` is not registered for any supported file-type.
|
"empty",
|
||||||
"""
|
None,
|
||||||
if extension in (None, "", "."):
|
cast(list[str], []),
|
||||||
return None
|
None,
|
||||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
cast(list[str], []),
|
||||||
# -- limitations on defining a class variable on an Enum.
|
"inode/x-empty",
|
||||||
for m in cls.__members__.values():
|
cast(list[str], []),
|
||||||
if extension in m._extensions:
|
)
|
||||||
return m
|
|
||||||
return None
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_mime_type(cls, mime_type: str) -> FileType | None:
|
|
||||||
"""Select a FileType member based on a MIME-type.
|
|
||||||
|
|
||||||
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
|
|
||||||
secondary file-type identification method but is unreliable for primary identification..
|
|
||||||
"""
|
|
||||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
|
||||||
# -- limitations on defining a class variable on an Enum.
|
|
||||||
for m in cls.__members__.values():
|
|
||||||
if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types:
|
|
||||||
return m
|
|
||||||
return None
|
|
||||||
|
|
||||||
@property
|
|
||||||
def mime_type(self) -> str:
|
|
||||||
"""The canonical MIME-type for this file-type, suitable for use in metadata.
|
|
||||||
|
|
||||||
This value is used in `.metadata.filetype` for elements partitioned from files of this
|
|
||||||
type. In general it is the "offical", "recommended", or "defacto-standard" MIME-type for
|
|
||||||
files of this type, in that order, as available.
|
|
||||||
"""
|
|
||||||
return self._canonical_mime_type
|
|
||||||
|
|
||||||
|
|
||||||
PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()
|
PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()
|
||||||
|
@ -2,131 +2,28 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib
|
||||||
import io
|
import io
|
||||||
from typing import IO, Any, Callable, Literal, Optional
|
from typing import IO, Any, Callable, Literal, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from typing_extensions import TypeAlias
|
||||||
|
|
||||||
from unstructured.documents.elements import DataSourceMetadata, Element
|
from unstructured.documents.elements import DataSourceMetadata, Element
|
||||||
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common import exactly_one
|
||||||
from unstructured.partition.email import partition_email
|
|
||||||
from unstructured.partition.html import partition_html
|
|
||||||
from unstructured.partition.json import partition_json
|
|
||||||
from unstructured.partition.lang import check_language_args
|
from unstructured.partition.lang import check_language_args
|
||||||
from unstructured.partition.text import partition_text
|
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
from unstructured.partition.utils.constants import PartitionStrategy
|
||||||
from unstructured.partition.xml import partition_xml
|
|
||||||
from unstructured.utils import dependency_exists
|
from unstructured.utils import dependency_exists
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP: dict[str, Callable[..., list[Element]]] = {}
|
Partitioner: TypeAlias = Callable[..., list[Element]]
|
||||||
|
|
||||||
if dependency_exists("pandas"):
|
|
||||||
from unstructured.partition.csv import partition_csv
|
|
||||||
from unstructured.partition.tsv import partition_tsv
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("docx"):
|
|
||||||
from unstructured.partition.doc import partition_doc
|
|
||||||
from unstructured.partition.docx import partition_docx
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("docx") and dependency_exists("pypandoc"):
|
|
||||||
from unstructured.partition.odt import partition_odt
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("pypandoc"):
|
|
||||||
from unstructured.partition.epub import partition_epub
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("pypandoc"):
|
|
||||||
from unstructured.partition.org import partition_org
|
|
||||||
from unstructured.partition.rst import partition_rst
|
|
||||||
from unstructured.partition.rtf import partition_rtf
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["org"] = partition_org
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("markdown"):
|
|
||||||
from unstructured.partition.md import partition_md
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["md"] = partition_md
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("oxmsg"):
|
|
||||||
from unstructured.partition.msg import partition_msg
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg
|
|
||||||
|
|
||||||
|
|
||||||
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
|
|
||||||
if all(dependency_exists(dep) for dep in pdf_imports):
|
|
||||||
from unstructured.partition.pdf import partition_pdf
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("unstructured_inference"):
|
|
||||||
from unstructured.partition.image import partition_image
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["image"] = partition_image
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("pptx"):
|
|
||||||
from unstructured.partition.ppt import partition_ppt
|
|
||||||
from unstructured.partition.pptx import partition_pptx
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx
|
|
||||||
|
|
||||||
|
|
||||||
if dependency_exists("pandas") and dependency_exists("openpyxl"):
|
|
||||||
from unstructured.partition.xlsx import partition_xlsx
|
|
||||||
|
|
||||||
PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx
|
|
||||||
|
|
||||||
|
|
||||||
IMAGE_FILETYPES = [
|
|
||||||
FileType.HEIC,
|
|
||||||
FileType.PNG,
|
|
||||||
FileType.JPG,
|
|
||||||
FileType.TIFF,
|
|
||||||
FileType.BMP,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _get_partition_with_extras(
|
|
||||||
doc_type: str,
|
|
||||||
partition_with_extras_map: Optional[dict[str, Callable[..., list[Element]]]] = None,
|
|
||||||
):
|
|
||||||
if partition_with_extras_map is None:
|
|
||||||
partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP
|
|
||||||
_partition_func = partition_with_extras_map.get(doc_type)
|
|
||||||
if _partition_func is None:
|
|
||||||
raise ImportError(
|
|
||||||
f"partition_{doc_type} is not available. "
|
|
||||||
f"Install the {doc_type} dependencies with "
|
|
||||||
f'pip install "unstructured[{doc_type}]"',
|
|
||||||
)
|
|
||||||
return _partition_func
|
|
||||||
|
|
||||||
|
|
||||||
def partition(
|
def partition(
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
|
*,
|
||||||
content_type: Optional[str] = None,
|
content_type: Optional[str] = None,
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
file_filename: Optional[str] = None,
|
file_filename: Optional[str] = None,
|
||||||
@ -156,10 +53,11 @@ def partition(
|
|||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
"""Partitions a document into its constituent elements.
|
||||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
|
||||||
parameters for each partitioning function. Use the document-type specific partitioning
|
Uses libmagic to determine the file's type and route it to the appropriate partitioning
|
||||||
functions if you need access to additional kwarg options.
|
function. Applies the default parameters for each partitioning function. Use the document-type
|
||||||
|
specific partitioning functions if you need access to additional kwarg options.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@ -272,7 +170,7 @@ def partition(
|
|||||||
languages = check_language_args(languages or [], ocr_languages)
|
languages = check_language_args(languages or [], ocr_languages)
|
||||||
|
|
||||||
if url is not None:
|
if url is not None:
|
||||||
file, filetype = file_and_type_from_url(
|
file, file_type = file_and_type_from_url(
|
||||||
url=url,
|
url=url,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
@ -285,7 +183,7 @@ def partition(
|
|||||||
"The headers kwarg is set but the url kwarg is not. "
|
"The headers kwarg is set but the url kwarg is not. "
|
||||||
"The headers kwarg will be ignored.",
|
"The headers kwarg will be ignored.",
|
||||||
)
|
)
|
||||||
filetype = detect_filetype(
|
file_type = detect_filetype(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
file_filename=metadata_filename,
|
file_filename=metadata_filename,
|
||||||
@ -297,14 +195,16 @@ def partition(
|
|||||||
file.seek(0)
|
file.seek(0)
|
||||||
|
|
||||||
infer_table_structure = decide_table_extraction(
|
infer_table_structure = decide_table_extraction(
|
||||||
filetype,
|
file_type,
|
||||||
skip_infer_table_types,
|
skip_infer_table_types,
|
||||||
pdf_infer_table_structure,
|
pdf_infer_table_structure,
|
||||||
)
|
)
|
||||||
|
|
||||||
if filetype == FileType.CSV:
|
partitioner_loader = _PartitionerLoader()
|
||||||
_partition_csv = _get_partition_with_extras("csv")
|
|
||||||
elements = _partition_csv(
|
if file_type == FileType.CSV:
|
||||||
|
partition_csv = partitioner_loader.get(file_type)
|
||||||
|
elements = partition_csv(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
@ -312,9 +212,9 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.DOC:
|
elif file_type == FileType.DOC:
|
||||||
_partition_doc = _get_partition_with_extras("doc")
|
partition_doc = partitioner_loader.get(file_type)
|
||||||
elements = _partition_doc(
|
elements = partition_doc(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
@ -324,9 +224,9 @@ def partition(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.DOCX:
|
elif file_type == FileType.DOCX:
|
||||||
_partition_docx = _get_partition_with_extras("docx")
|
partition_docx = partitioner_loader.get(file_type)
|
||||||
elements = _partition_docx(
|
elements = partition_docx(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
@ -336,7 +236,8 @@ def partition(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.EML:
|
elif file_type == FileType.EML:
|
||||||
|
partition_email = partitioner_loader.get(file_type)
|
||||||
elements = partition_email(
|
elements = partition_email(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
@ -345,9 +246,9 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.EPUB:
|
elif file_type == FileType.EPUB:
|
||||||
_partition_epub = _get_partition_with_extras("epub")
|
partition_epub = partitioner_loader.get(file_type)
|
||||||
elements = _partition_epub(
|
elements = partition_epub(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -356,7 +257,8 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.HTML:
|
elif file_type == FileType.HTML:
|
||||||
|
partition_html = partitioner_loader.get(file_type)
|
||||||
elements = partition_html(
|
elements = partition_html(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
@ -366,9 +268,9 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype in IMAGE_FILETYPES:
|
elif file_type.partitioner_shortname == "image":
|
||||||
_partition_image = _get_partition_with_extras("image")
|
partition_image = partitioner_loader.get(file_type)
|
||||||
elements = _partition_image(
|
elements = partition_image(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
url=None,
|
url=None,
|
||||||
@ -384,16 +286,17 @@ def partition(
|
|||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.JSON:
|
elif file_type == FileType.JSON:
|
||||||
if not is_json_processable(filename=filename, file=file):
|
if not is_json_processable(filename=filename, file=file):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Detected a JSON file that does not conform to the Unstructured schema. "
|
"Detected a JSON file that does not conform to the Unstructured schema. "
|
||||||
"partition_json currently only processes serialized Unstructured output.",
|
"partition_json currently only processes serialized Unstructured output.",
|
||||||
)
|
)
|
||||||
|
partition_json = partitioner_loader.get(file_type)
|
||||||
elements = partition_json(filename=filename, file=file, **kwargs)
|
elements = partition_json(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.MD:
|
elif file_type == FileType.MD:
|
||||||
_partition_md = _get_partition_with_extras("md")
|
partition_md = partitioner_loader.get(file_type)
|
||||||
elements = _partition_md(
|
elements = partition_md(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -402,18 +305,18 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.MSG:
|
elif file_type == FileType.MSG:
|
||||||
_partition_msg = _get_partition_with_extras("msg")
|
partition_msg = partitioner_loader.get(file_type)
|
||||||
elements = _partition_msg(
|
elements = partition_msg(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.ODT:
|
elif file_type == FileType.ODT:
|
||||||
_partition_odt = _get_partition_with_extras("odt")
|
partition_odt = partitioner_loader.get(file_type)
|
||||||
elements = _partition_odt(
|
elements = partition_odt(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
@ -423,9 +326,9 @@ def partition(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.ORG:
|
elif file_type == FileType.ORG:
|
||||||
_partition_org = _get_partition_with_extras("org")
|
partition_org = partitioner_loader.get(file_type)
|
||||||
elements = _partition_org(
|
elements = partition_org(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -433,9 +336,9 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.PDF:
|
elif file_type == FileType.PDF:
|
||||||
_partition_pdf = _get_partition_with_extras("pdf")
|
partition_pdf = partitioner_loader.get(file_type)
|
||||||
elements = _partition_pdf(
|
elements = partition_pdf(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
url=None,
|
url=None,
|
||||||
@ -451,9 +354,9 @@ def partition(
|
|||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.PPT:
|
elif file_type == FileType.PPT:
|
||||||
_partition_ppt = _get_partition_with_extras("ppt")
|
partition_ppt = partitioner_loader.get(file_type)
|
||||||
elements = _partition_ppt(
|
elements = partition_ppt(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -463,9 +366,9 @@ def partition(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.PPTX:
|
elif file_type == FileType.PPTX:
|
||||||
_partition_pptx = _get_partition_with_extras("pptx")
|
partition_pptx = partitioner_loader.get(file_type)
|
||||||
elements = _partition_pptx(
|
elements = partition_pptx(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -476,9 +379,9 @@ def partition(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.RST:
|
elif file_type == FileType.RST:
|
||||||
_partition_rst = _get_partition_with_extras("rst")
|
partition_rst = partitioner_loader.get(file_type)
|
||||||
elements = _partition_rst(
|
elements = partition_rst(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -487,9 +390,9 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.RTF:
|
elif file_type == FileType.RTF:
|
||||||
_partition_rtf = _get_partition_with_extras("rtf")
|
partition_rtf = partitioner_loader.get(file_type)
|
||||||
elements = _partition_rtf(
|
elements = partition_rtf(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
@ -498,16 +401,17 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.TSV:
|
elif file_type == FileType.TSV:
|
||||||
_partition_tsv = _get_partition_with_extras("tsv")
|
partition_tsv = partitioner_loader.get(file_type)
|
||||||
elements = _partition_tsv(
|
elements = partition_tsv(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.TXT:
|
elif file_type == FileType.TXT:
|
||||||
|
partition_text = partitioner_loader.get(file_type)
|
||||||
elements = partition_text(
|
elements = partition_text(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
@ -517,9 +421,9 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype in (FileType.XLS, FileType.XLSX):
|
elif file_type in (FileType.XLS, FileType.XLSX):
|
||||||
_partition_xlsx = _get_partition_with_extras("xlsx")
|
partition_xlsx = partitioner_loader.get(file_type)
|
||||||
elements = _partition_xlsx(
|
elements = partition_xlsx(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
@ -528,7 +432,8 @@ def partition(
|
|||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.XML:
|
elif file_type == FileType.XML:
|
||||||
|
partition_xml = partitioner_loader.get(file_type)
|
||||||
elements = partition_xml(
|
elements = partition_xml(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
@ -538,11 +443,11 @@ def partition(
|
|||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.EMPTY:
|
elif file_type == FileType.EMPTY:
|
||||||
elements = []
|
elements = []
|
||||||
else:
|
else:
|
||||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
raise ValueError(f"{msg}. The {file_type} file type is not supported in partition.")
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
element.metadata.url = url
|
element.metadata.url = url
|
||||||
@ -551,7 +456,7 @@ def partition(
|
|||||||
out_filetype = FileType.from_mime_type(content_type)
|
out_filetype = FileType.from_mime_type(content_type)
|
||||||
element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
|
element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
|
||||||
else:
|
else:
|
||||||
element.metadata.filetype = filetype.mime_type
|
element.metadata.filetype = file_type.mime_type
|
||||||
|
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
@ -562,7 +467,7 @@ def file_and_type_from_url(
|
|||||||
headers: dict[str, str] = {},
|
headers: dict[str, str] = {},
|
||||||
ssl_verify: bool = True,
|
ssl_verify: bool = True,
|
||||||
request_timeout: Optional[int] = None,
|
request_timeout: Optional[int] = None,
|
||||||
) -> tuple[io.BytesIO, Optional[FileType]]:
|
) -> tuple[io.BytesIO, FileType]:
|
||||||
response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
|
response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
|
||||||
file = io.BytesIO(response.content)
|
file = io.BytesIO(response.content)
|
||||||
|
|
||||||
@ -590,3 +495,51 @@ def decide_table_extraction(
|
|||||||
return pdf_infer_table_structure or doc_type not in skip_infer_table_types
|
return pdf_infer_table_structure or doc_type not in skip_infer_table_types
|
||||||
|
|
||||||
return doc_type not in skip_infer_table_types
|
return doc_type not in skip_infer_table_types
|
||||||
|
|
||||||
|
|
||||||
|
class _PartitionerLoader:
    """Hands out partitioner functions by file-type, failing clearly on missing dependencies.

    `partition()` uses this to cope with a Python environment that may lack some of the
    packages a particular partitioner needs.

    `.get()` is the access point. It raises `ImportError` when one or more dependencies of
    the requested partitioner are not installed, and the message names the `pip install`
    extra that remedies the problem. Checking up front replaces the assortment of possibly
    puzzling exceptions that would otherwise surface from deep inside the partitioner the
    first time the missing package is touched.
    """

    # -- class-level cache of partitioners already loaded, keyed by file-type; being a class
    # -- attribute, it is shared by every instance of the loader
    _partitioners: dict[FileType, Partitioner] = {}

    def get(self, file_type: FileType) -> Partitioner:
        """Return the partitioner for `file_type`, loading it on first request.

        Raises `ImportError` when one or more package dependencies for that file-type are
        not installed.
        """
        cache = self._partitioners

        # -- cache hit: partitioner was loaded by an earlier call --
        if file_type in cache:
            return cache[file_type]

        # -- cache miss: load it (may raise) and remember it for next time --
        partitioner = self._load_partitioner(file_type)
        cache[file_type] = partitioner
        return partitioner

    def _load_partitioner(self, file_type: FileType) -> Partitioner:
        """Import and return the partitioner for `file_type` once its dependencies check out.

        Raises `ImportError` if any package named in
        `file_type.importable_package_dependencies` fails the `dependency_exists()` check.
        """
        # -- `all()` stops at the first missing package, so later ones are not probed --
        dependencies = file_type.importable_package_dependencies
        if not all(dependency_exists(pkg_name) for pkg_name in dependencies):
            raise ImportError(
                f"{file_type.partitioner_function_name}() is not available because one or"
                f" more dependencies are not installed. Use:"
                f' pip install "unstructured[{file_type.extra_name}]" (including quotes)'
                f" to install the required dependencies",
            )

        # -- a non-partitionable file-type reaching this point is a programming error --
        assert file_type.is_partitionable

        # -- import the module by qualified name, then pull the partitioner function off it --
        module = importlib.import_module(file_type.partitioner_module_qname)
        return getattr(module, file_type.partitioner_function_name)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user