rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
This commit is contained in:
Steve Canny 2024-07-21 23:03:55 -07:00 committed by GitHub
parent ec59abfabc
commit 49c4bd34be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 626 additions and 284 deletions

View File

@ -1,3 +1,11 @@
## 0.15.1-dev0
### Enhancements
### Features
### Fixes
## 0.15.0
### Enhancements

View File

@ -50,6 +50,68 @@ class DescribeFileType:
def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
assert FileType.from_mime_type(mime_type) is None
# -- `.importable_package_dependencies` is checked against a sample of file-types, including
# -- several (EMPTY, HTML, UNK, WAV, ZIP) that are expected to report an empty tuple --
@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, ("unstructured_inference",)),
(FileType.CSV, ("pandas",)),
(FileType.DOC, ("docx",)),
(FileType.EMPTY, ()),
(FileType.HTML, ()),
(FileType.ODT, ("docx", "pypandoc")),
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
(FileType.UNK, ()),
(FileType.WAV, ()),
(FileType.ZIP, ()),
],
)
def it_knows_which_importable_packages_its_partitioner_depends_on(
self, file_type: FileType, expected_value: tuple[str, ...]
):
# -- the property is compared by equality, so both order and tuple type matter here --
assert file_type.importable_package_dependencies == expected_value
# -- `.extra_name` is the name used in `pip install "unstructured[<extra>]"`; the cases include
# -- file-types expected to report `None` and one (XLS) whose extra differs from its own name --
@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, "image"),
(FileType.DOC, "doc"),
(FileType.DOCX, "docx"),
(FileType.EML, None),
(FileType.EMPTY, None),
(FileType.MSG, "msg"),
(FileType.PDF, "pdf"),
(FileType.XLS, "xlsx"),
(FileType.UNK, None),
(FileType.WAV, None),
(FileType.ZIP, None),
],
)
def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies(
self, file_type: FileType, expected_value: str | None
):
assert file_type.extra_name == expected_value
# -- `.is_partitionable` is expected to be True for file-types that have a partitioner and False
# -- for WAV, ZIP, EMPTY, and UNK --
@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, True),
(FileType.CSV, True),
(FileType.DOC, True),
(FileType.EML, True),
(FileType.JPG, True),
(FileType.PDF, True),
(FileType.PPTX, True),
(FileType.WAV, False),
(FileType.ZIP, False),
(FileType.EMPTY, False),
(FileType.UNK, False),
],
)
def it_knows_whether_files_of_its_type_are_directly_partitionable(
self, file_type: FileType, expected_value: bool
):
# -- identity comparison: the property must return an actual `bool`, not merely a truthy or
# -- falsy value --
assert file_type.is_partitionable is expected_value
@pytest.mark.parametrize(
("file_type", "mime_type"),
[
@ -68,3 +130,72 @@ class DescribeFileType:
)
def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
assert file_type.mime_type == mime_type
# -- `.partitioner_function_name` is "partition_" plus the partitioner shortname; the image
# -- file-types in these cases (BMP, JPG, PNG, TIFF) all map to `partition_image` --
@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, "partition_image"),
(FileType.CSV, "partition_csv"),
(FileType.DOC, "partition_doc"),
(FileType.DOCX, "partition_docx"),
(FileType.JPG, "partition_image"),
(FileType.PNG, "partition_image"),
(FileType.TIFF, "partition_image"),
],
)
def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
assert file_type.partitioner_function_name == expected_value
# -- file-types with no partitioner must raise `ValueError` on `.partitioner_function_name` --
@pytest.mark.parametrize(
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
)
def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
self, file_type: FileType
):
with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
# -- bare attribute access is intentional; reading the property is what triggers the raise --
file_type.partitioner_function_name
# -- `.partitioner_module_qname` is "unstructured.partition." plus the partitioner shortname, so
# -- the image file-types in these cases all resolve to the same module --
@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, "unstructured.partition.image"),
(FileType.CSV, "unstructured.partition.csv"),
(FileType.DOC, "unstructured.partition.doc"),
(FileType.DOCX, "unstructured.partition.docx"),
(FileType.JPG, "unstructured.partition.image"),
(FileType.PNG, "unstructured.partition.image"),
(FileType.TIFF, "unstructured.partition.image"),
],
)
def it_knows_the_fully_qualified_name_of_its_partitioner_module(
self, file_type: FileType, expected_value: str
):
assert file_type.partitioner_module_qname == expected_value
# -- file-types with no partitioner must raise `ValueError` on `.partitioner_module_qname` --
@pytest.mark.parametrize(
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
)
def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
self, file_type: FileType
):
with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
# -- bare attribute access is intentional; reading the property is what triggers the raise --
file_type.partitioner_module_qname
# -- `.partitioner_shortname` need not match the enum value: image file-types share "image" and
# -- both XLS and XLSX report "xlsx" --
@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, "image"),
(FileType.CSV, "csv"),
(FileType.DOC, "doc"),
(FileType.DOCX, "docx"),
(FileType.JPG, "image"),
(FileType.PNG, "image"),
(FileType.TIFF, "image"),
(FileType.XLS, "xlsx"),
(FileType.XLSX, "xlsx"),
],
)
def it_provides_access_to_the_partitioner_shortname(
self, file_type: FileType, expected_value: str
):
assert file_type.partitioner_shortname == expected_value

View File

@ -10,7 +10,7 @@ import sys
import tempfile
import warnings
from importlib import import_module
from typing import Callable, Iterator, cast
from typing import Iterator, cast
from unittest.mock import patch
import pytest
@ -27,7 +27,6 @@ from test_unstructured.unit_utils import (
ANY,
FixtureRequest,
LogCaptureFixture,
MonkeyPatch,
example_doc_path,
function_mock,
method_mock,
@ -46,8 +45,7 @@ from unstructured.documents.elements import (
Title,
)
from unstructured.file_utils.model import FileType
from unstructured.partition import auto
from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition
from unstructured.partition.auto import _PartitionerLoader, partition
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
@ -570,16 +568,21 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
assert e.text.startswith("Zejiang Shen")
def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
partition_pdf_ = function_mock(
request,
"unstructured.partition.pdf.partition_pdf",
return_value=[NarrativeText("Hello there!")],
)
partitioner_loader_get_ = method_mock(
request, _PartitionerLoader, "get", return_value=partition_pdf_
)
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
mock_return = [NarrativeText("Hello there!")]
with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
mock_partition_with_extras_map = {"pdf": mock_partition}
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
partition(filename=file_path, strategy=PartitionStrategy.FAST)
partition(file_path, strategy=PartitionStrategy.FAST)
mock_partition.assert_called_once_with(
partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
partition_pdf_.assert_called_once_with(
filename=file_path,
file=None,
url=None,
@ -919,10 +922,10 @@ def test_auto_partition_xml_from_file_with_tags():
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
detect_filetype_ = function_mock(
request, "unstructured.partition.auto.detect_filetype", return_value=None
request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
)
with pytest.raises(ValueError, match="Invalid file made-up.fake. The None file type is not "):
with pytest.raises(ValueError, match="Invalid file made-up.fake. The FileType.UNK file type "):
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
detect_filetype_.assert_called_once_with(
@ -1026,23 +1029,7 @@ def test_auto_partition_respects_detect_language_per_element_arg():
@pytest.mark.parametrize(
"file_extension",
[
"doc",
"docx",
"eml",
"epub",
"html",
"md",
"odt",
"org",
"ppt",
"pptx",
"rst",
"rtf",
"txt",
"xml",
],
"file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split()
)
def test_auto_partition_respects_language_arg(file_extension: str):
elements = partition(
@ -1167,7 +1154,7 @@ def test_auto_partition_respects_skip_infer_table_types(
@pytest.mark.parametrize(
("content_type", "filetype_shortname", "expected_value"),
("content_type", "shortname", "expected_value"),
[
("text/csv", "csv", "text/csv"),
("text/html", "html", "text/html"),
@ -1177,22 +1164,23 @@ def test_auto_partition_respects_skip_infer_table_types(
def test_auto_partition_adds_filetype_to_metadata(
request: FixtureRequest,
content_type: str,
filetype_shortname: str,
shortname: str,
expected_value: str | None,
monkeypatch: MonkeyPatch,
):
partition_fn_ = function_mock(
request,
f"unstructured.partition.auto.partition_{filetype_shortname}",
f"unstructured.partition.{shortname}.partition_{shortname}",
return_value=[Text("text 1"), Text("text 2")],
)
mock_partition_with_extras_map = {filetype_shortname: partition_fn_}
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
partitioner_loader_get_ = method_mock(
request, _PartitionerLoader, "get", return_value=partition_fn_
)
elements = partition(
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
)
partitioner_loader_get_.assert_called_once()
assert len(elements) == 2
assert all(e.metadata.filetype == expected_value for e in elements)
@ -1207,20 +1195,23 @@ def test_auto_partition_adds_filetype_to_metadata(
],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
request: FixtureRequest, content_type: str | None, monkeypatch: MonkeyPatch
request: FixtureRequest, content_type: str | None
):
metadata = ElementMetadata(filetype="imapdf")
partition_pdf_ = function_mock(
request,
"unstructured.partition.auto.partition_pdf",
"unstructured.partition.pdf.partition_pdf",
return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)],
)
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})
partitioner_loader_get_ = method_mock(
request, _PartitionerLoader, "get", return_value=partition_pdf_
)
elements = partition(
example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
)
partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
assert len(elements) == 2
assert all(e.metadata.filetype == "application/pdf" for e in elements)
@ -1231,7 +1222,7 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
t
for t in FileType
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
and t not in IMAGE_FILETYPES
and t.partitioner_function_name != "partition_image"
],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
@ -1305,10 +1296,18 @@ def test_auto_partition_from_file_works_on_empty_file():
assert partition(file=f) == []
def test_auto_partition_requiring_extras_prompts_to_install_missing_dependencies():
partition_with_extras_map: dict[str, Callable[..., list[Element]]] = {}
with pytest.raises(ImportError, match="partition_pdf is not available. Install the pdf depen"):
_get_partition_with_extras("pdf", partition_with_extras_map)
# -- Verifies `partition()` raises `ImportError` when `dependency_exists()` reports a missing
# -- package for the PDF partitioner --
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
request: FixtureRequest,
):
# -- drop any PDF entry from the loader's class-level `_partitioners` mapping; presumably this
# -- is the cache of already-loaded partitioners and a cached entry would bypass the dependency
# -- check - TODO confirm against `_PartitionerLoader` --
_PartitionerLoader._partitioners.pop(FileType.PDF, None)
# -- make every dependency look uninstalled --
dependency_exists_ = function_mock(
request, "unstructured.partition.auto.dependency_exists", return_value=False
)
match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"
with pytest.raises(ImportError, match=match):
# -- NOTE(review): other tests in this module use "pdf/layout-parser-paper-fast.pdf"; confirm
# -- this un-prefixed path also exists in the example-docs directory --
partition(example_doc_path("layout-parser-paper-fast.pdf"))
# -- "pdf2image" is the first PDF dependency listed on `FileType.PDF`; a single call implies the
# -- check stops at the first missing package --
dependency_exists_.assert_called_once_with("pdf2image")
# ================================================================================================

View File

@ -1 +1 @@
__version__ = "0.15.0" # pragma: no cover
__version__ = "0.15.1-dev0" # pragma: no cover

View File

@ -32,7 +32,7 @@ def detect_filetype(
file: Optional[IO[bytes]] = None,
file_filename: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> Optional[FileType]:
) -> FileType:
"""Use libmagic to determine a file's type.
Helps determine which partition brick to use for a given file. A return value of None indicates
@ -122,7 +122,7 @@ def detect_filetype(
".tsv",
".json",
]:
return FileType.from_extension(extension)
return FileType.from_extension(extension) or FileType.TXT
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"

View File

@ -12,7 +12,17 @@ class FileType(enum.Enum):
Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner.
"""
_partitioner_shortname: str | None
"""Like "docx", from which partitioner module and function-name can be derived via template."""
_importable_package_dependencies: tuple[str, ...]
"""Packages that must be available for import for this file-type's partitioner to work."""
_extra_name: str | None
"""`pip install` extra that provides package dependencies for this file-type."""
_extensions: tuple[str, ...]
"""Filename-extensions recognized as this file-type. Use for secondary identification only."""
_canonical_mime_type: str
"""The MIME-type used as `.metadata.filetype` for this file-type."""
@ -23,12 +33,18 @@ class FileType(enum.Enum):
def __new__(
cls,
value: str,
partitioner_shortname: str | None,
importable_package_dependencies: Iterable[str],
extra_name: str | None,
extensions: Iterable[str],
canonical_mime_type: str,
alias_mime_types: Iterable[str],
):
self = object.__new__(cls)
self._value_ = value
self._partitioner_shortname = partitioner_shortname
self._importable_package_dependencies = tuple(importable_package_dependencies)
self._extra_name = extra_name
self._extensions = tuple(extensions)
self._canonical_mime_type = canonical_mime_type
self._alias_mime_types = tuple(alias_mime_types)
@ -41,8 +57,150 @@ class FileType(enum.Enum):
"""
return self.name < other.name
BMP = ("bmp", [".bmp"], "image/bmp", cast(list[str], []))
@classmethod
def from_extension(cls, extension: str | None) -> FileType | None:
    """Look up the `FileType` member registered for a filename extension.

    `extension` must carry its leading period, like `".pdf"`. An extension is a reasonable
    secondary signal for identifying a file-type but is not reliable enough to be the primary
    one.

    Returns `None` when `extension` is `None`, empty, a bare period, or is not registered for
    any file-type.
    """
    if extension in (None, "", "."):
        return None

    # -- a linear scan over the members is plenty fast for once-or-twice-per-file use and
    # -- sidesteps the restrictions on declaring a lookup table as a class variable on an Enum
    return next(
        (member for member in cls.__members__.values() if extension in member._extensions),
        None,
    )
@classmethod
def from_mime_type(cls, mime_type: str | None) -> FileType | None:
    """Select a FileType member based on a MIME-type.

    Both the canonical MIME-type and the alias MIME-types of each member are considered; the
    first member (in definition order) that matches is returned.

    Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
    `FileType` member or one of its alias MIME-types.
    """
    # -- `None` is a documented input, so the annotation admits it and it is handled up front
    # -- rather than relying on it failing to match any member below
    if mime_type is None:
        return None

    # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
    # -- limitations on defining a class variable on an Enum.
    for member in cls.__members__.values():
        if mime_type == member._canonical_mime_type or mime_type in member._alias_mime_types:
            return member
    return None
@property
def extra_name(self) -> str | None:
    """Name of the `pip` "extra" that supplies the dependencies for this file-type.

    For example "image" for PNG, which is installed with `pip install "unstructured[image]"`.

    `None` when the base `unstructured` install is all this file-type needs.
    """
    extra = self._extra_name
    return extra
@property
def importable_package_dependencies(self) -> tuple[str, ...]:
    """Names of the packages that must import cleanly for this file-type's partitioner to work.

    These are generally the packages installed by the `pip install` "extra" for the file-type,
    e.g. `pip install "unstructured[docx]"` installs the `python-docx` package.

    The names are _import_ names, which can differ from the _distribution_ name `pip` uses.
    For DOCX the distribution is `"python-docx"` but the import name is `"docx"` (as in
    `import docx`), and `"docx"` is what appears here.

    An empty tuple means no optional dependencies are required. That includes file-types that
    are not partitionable; accessing this property on one of those does not raise.
    """
    packages = self._importable_package_dependencies
    return packages
@property
def is_partitionable(self) -> bool:
    """True when a partitioner exists for this file-type.

    This says nothing about whether that partitioner's dependencies are installed, so
    partitioning a file of this type can still fail. Its purpose is to single out file-types
    such as WAV, ZIP, EMPTY, and UNK, which are legitimate file-types with no partitioner.
    """
    # -- a file-type is partitionable exactly when it has a (non-empty) partitioner shortname
    return True if self._partitioner_shortname else False
@property
def mime_type(self) -> str:
    """Canonical MIME-type of this file-type, the value to record in metadata.

    Elements partitioned from a file of this type carry this value in `.metadata.filetype`.
    It is generally the "official", "recommended", or "de facto standard" MIME-type for such
    files, preferred in that order according to which of them exist.
    """
    canonical = self._canonical_mime_type
    return canonical
@property
def partitioner_function_name(self) -> str:
    """The name of the function that partitions this file-type, e.g. "partition_docx".

    Raises `ValueError` when this file-type has no partitioner shortname, i.e. it is not
    partitionable. Check `.is_partitionable` first when that is not known in advance.
    """
    shortname = self._partitioner_shortname
    if shortname is not None:
        return f"partition_{shortname}"

    # -- Fail right here rather than hand back a bogus name. Otherwise the mistake would only
    # -- show up later, far from this call, when code tries `getattr(module, None)` or similar.
    raise ValueError(
        f"`.partitioner_function_name` is undefined because FileType.{self.name} is not"
        " partitionable. Use `.is_partitionable` to determine whether a `FileType`"
        " is partitionable."
    )
@property
def partitioner_module_qname(self) -> str:
    """Fully-qualified name of the module that provides this file-type's partitioner.

    For example "unstructured.partition.docx" for FileType.DOCX.

    Raises `ValueError` when this file-type has no partitioner shortname, i.e. it is not
    partitionable. Check `.is_partitionable` first when that is not known in advance.
    """
    shortname = self._partitioner_shortname
    if shortname is not None:
        return f"unstructured.partition.{shortname}"

    # -- Fail right here rather than hand back a bogus module name. Otherwise the mistake would
    # -- only show up later, far from this call, when code tries
    # -- `importlib.import_module(None)` or similar.
    raise ValueError(
        f"`.partitioner_module_qname` is undefined because FileType.{self.name} is not"
        " partitionable. Use `.is_partitionable` to determine whether a `FileType`"
        " is partitionable."
    )
@property
def partitioner_shortname(self) -> str | None:
    """Short, familiar name of the partitioner, e.g. "image" where `partition_image()` is used.

    Handy for telling whether a file-type is one of the image types, since those all share
    `partition_image()` and therefore the same shortname.

    `None` for a file-type that is not partitionable, though `.is_partitionable` is the
    preferred way to find that out.
    """
    shortname = self._partitioner_shortname
    return shortname
BMP = (
"bmp", # -- value for this Enum member, like BMP = "bmp" in a simple enum --
"image", # -- partitioner_shortname --
["unstructured_inference"], # -- importable_package_dependencies --
"image", # -- extra_name - like `pip install "unstructured[image]"` in this case --
[".bmp"], # -- extensions - filename extensions that map to this file-type --
"image/bmp", # -- canonical_mime_type - MIME-type written to `.metadata.filetype` --
cast(list[str], []), # -- alias_mime-types - other MIME-types that map to this file-type --
)
CSV = (
"csv",
"csv",
["pandas"],
"csv",
[".csv"],
"text/csv",
@ -54,38 +212,143 @@ class FileType(enum.Enum):
"text/x-csv",
],
)
DOC = ("doc", [".doc"], "application/msword", cast(list[str], []))
DOC = ("doc", "doc", ["docx"], "doc", [".doc"], "application/msword", cast(list[str], []))
DOCX = (
"docx",
"docx",
["docx"],
"docx",
[".docx"],
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
cast(list[str], []),
)
EML = ("eml", [".eml", ".p7s"], "message/rfc822", cast(list[str], []))
EPUB = ("epub", [".epub"], "application/epub", ["application/epub+zip"])
HEIC = ("heic", [".heic"], "image/heic", cast(list[str], []))
HTML = ("html", [".html", ".htm"], "text/html", cast(list[str], []))
JPG = ("jpg", [".jpeg", ".jpg"], "image/jpeg", cast(list[str], []))
JSON = ("json", [".json"], "application/json", cast(list[str], []))
MD = ("md", [".md"], "text/markdown", ["text/x-markdown"])
MSG = ("msg", [".msg"], "application/vnd.ms-outlook", ["application/x-ole-storage"])
ODT = ("odt", [".odt"], "application/vnd.oasis.opendocument.text", cast(list[str], []))
ORG = ("org", [".org"], "text/org", cast(list[str], []))
PDF = ("pdf", [".pdf"], "application/pdf", cast(list[str], []))
PNG = ("png", [".png"], "image/png", cast(list[str], []))
PPT = ("ppt", [".ppt"], "application/vnd.ms-powerpoint", cast(list[str], []))
EML = (
"eml",
"email",
cast(list[str], []),
None,
[".eml", ".p7s"],
"message/rfc822",
cast(list[str], []),
)
EPUB = (
"epub",
"epub",
["pypandoc"],
"epub",
[".epub"],
"application/epub",
["application/epub+zip"],
)
HEIC = (
"heic",
"image",
["unstructured_inference"],
"image",
[".heic"],
"image/heic",
cast(list[str], []),
)
HTML = (
"html",
"html",
cast(list[str], []),
None,
[".html", ".htm"],
"text/html",
cast(list[str], []),
)
JPG = (
"jpg",
"image",
["unstructured_inference"],
"image",
[".jpeg", ".jpg"],
"image/jpeg",
cast(list[str], []),
)
JSON = (
"json",
"json",
cast(list[str], []),
None,
[".json"],
"application/json",
cast(list[str], []),
)
MD = ("md", "md", ["markdown"], "md", [".md"], "text/markdown", ["text/x-markdown"])
MSG = (
"msg",
"msg",
["oxmsg"],
"msg",
[".msg"],
"application/vnd.ms-outlook",
["application/x-ole-storage"],
)
ODT = (
"odt",
"odt",
["docx", "pypandoc"],
"odt",
[".odt"],
"application/vnd.oasis.opendocument.text",
cast(list[str], []),
)
ORG = ("org", "org", ["pypandoc"], "org", [".org"], "text/org", cast(list[str], []))
PDF = (
"pdf",
"pdf",
["pdf2image", "pdfminer", "PIL"],
"pdf",
[".pdf"],
"application/pdf",
cast(list[str], []),
)
PNG = (
"png",
"image",
["unstructured_inference"],
"image",
[".png"],
"image/png",
cast(list[str], []),
)
PPT = (
"ppt",
"ppt",
["pptx"],
"ppt",
[".ppt"],
"application/vnd.ms-powerpoint",
cast(list[str], []),
)
PPTX = (
"pptx",
"pptx",
["pptx"],
"pptx",
[".pptx"],
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
cast(list[str], []),
)
RST = ("rst", [".rst"], "text/x-rst", cast(list[str], []))
RTF = ("rtf", [".rtf"], "text/rtf", ["application/rtf"])
TIFF = ("tiff", [".tiff"], "image/tiff", cast(list[str], []))
TSV = ("tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
RST = ("rst", "rst", ["pypandoc"], "rst", [".rst"], "text/x-rst", cast(list[str], []))
RTF = ("rtf", "rtf", ["pypandoc"], "rtf", [".rtf"], "text/rtf", ["application/rtf"])
TIFF = (
"tiff",
"image",
["unstructured_inference"],
"image",
[".tiff"],
"image/tiff",
cast(list[str], []),
)
TSV = ("tsv", "tsv", ["pandas"], "tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
TXT = (
"txt",
"text",
cast(list[str], []),
None,
[
".txt",
".text",
@ -119,6 +382,9 @@ class FileType(enum.Enum):
)
WAV = (
"wav",
None,
cast(list[str], []),
None,
[".wav"],
"audio/wav",
[
@ -129,60 +395,45 @@ class FileType(enum.Enum):
"audio/x-wav",
],
)
XLS = ("xls", [".xls"], "application/vnd.ms-excel", cast(list[str], []))
XLS = (
"xls",
"xlsx",
["pandas", "openpyxl"],
"xlsx",
[".xls"],
"application/vnd.ms-excel",
cast(list[str], []),
)
XLSX = (
"xlsx",
"xlsx",
["pandas", "openpyxl"],
"xlsx",
[".xlsx"],
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
cast(list[str], []),
)
XML = ("xml", [".xml"], "application/xml", ["text/xml"])
ZIP = ("zip", [".zip"], "application/zip", cast(list[str], []))
XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"])
ZIP = ("zip", None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], []))
UNK = ("unk", cast(list[str], []), "application/octet-stream", cast(list[str], []))
EMPTY = ("empty", cast(list[str], []), "inode/x-empty", cast(list[str], []))
@classmethod
def from_extension(cls, extension: str | None) -> FileType | None:
"""Select a FileType member based on an extension.
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
secondary file-type identification method but is unreliable for primary identification..
Returns `None` when `extension` is not registered for any supported file-type.
"""
if extension in (None, "", "."):
return None
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
# -- limitations on defining a class variable on an Enum.
for m in cls.__members__.values():
if extension in m._extensions:
return m
return None
@classmethod
def from_mime_type(cls, mime_type: str) -> FileType | None:
"""Select a FileType member based on a MIME-type.
`extension` must include the leading period, like `".pdf"`. Extension is suitable as a
secondary file-type identification method but is unreliable for primary identification..
"""
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
# -- limitations on defining a class variable on an Enum.
for m in cls.__members__.values():
if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types:
return m
return None
@property
def mime_type(self) -> str:
"""The canonical MIME-type for this file-type, suitable for use in metadata.
This value is used in `.metadata.filetype` for elements partitioned from files of this
type. In general it is the "offical", "recommended", or "defacto-standard" MIME-type for
files of this type, in that order, as available.
"""
return self._canonical_mime_type
UNK = (
"unk",
None,
cast(list[str], []),
None,
cast(list[str], []),
"application/octet-stream",
cast(list[str], []),
)
EMPTY = (
"empty",
None,
cast(list[str], []),
None,
cast(list[str], []),
"inode/x-empty",
cast(list[str], []),
)
PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()

View File

@ -2,131 +2,28 @@
from __future__ import annotations
import importlib
import io
from typing import IO, Any, Callable, Literal, Optional
import requests
from typing_extensions import TypeAlias
from unstructured.documents.elements import DataSourceMetadata, Element
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.json import partition_json
from unstructured.partition.lang import check_language_args
from unstructured.partition.text import partition_text
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.partition.xml import partition_xml
from unstructured.utils import dependency_exists
PARTITION_WITH_EXTRAS_MAP: dict[str, Callable[..., list[Element]]] = {}
if dependency_exists("pandas"):
from unstructured.partition.csv import partition_csv
from unstructured.partition.tsv import partition_tsv
PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv
PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv
if dependency_exists("docx"):
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc
PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx
if dependency_exists("docx") and dependency_exists("pypandoc"):
from unstructured.partition.odt import partition_odt
PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt
if dependency_exists("pypandoc"):
from unstructured.partition.epub import partition_epub
PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub
if dependency_exists("pypandoc"):
from unstructured.partition.org import partition_org
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf
PARTITION_WITH_EXTRAS_MAP["org"] = partition_org
PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst
PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf
if dependency_exists("markdown"):
from unstructured.partition.md import partition_md
PARTITION_WITH_EXTRAS_MAP["md"] = partition_md
if dependency_exists("oxmsg"):
from unstructured.partition.msg import partition_msg
PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
if all(dependency_exists(dep) for dep in pdf_imports):
from unstructured.partition.pdf import partition_pdf
PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf
if dependency_exists("unstructured_inference"):
from unstructured.partition.image import partition_image
PARTITION_WITH_EXTRAS_MAP["image"] = partition_image
if dependency_exists("pptx"):
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt
PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx
if dependency_exists("pandas") and dependency_exists("openpyxl"):
from unstructured.partition.xlsx import partition_xlsx
PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx
IMAGE_FILETYPES = [
FileType.HEIC,
FileType.PNG,
FileType.JPG,
FileType.TIFF,
FileType.BMP,
]
def _get_partition_with_extras(
doc_type: str,
partition_with_extras_map: Optional[dict[str, Callable[..., list[Element]]]] = None,
):
if partition_with_extras_map is None:
partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP
_partition_func = partition_with_extras_map.get(doc_type)
if _partition_func is None:
raise ImportError(
f"partition_{doc_type} is not available. "
f"Install the {doc_type} dependencies with "
f'pip install "unstructured[{doc_type}]"',
)
return _partition_func
Partitioner: TypeAlias = Callable[..., list[Element]]
def partition(
filename: Optional[str] = None,
*,
content_type: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file_filename: Optional[str] = None,
@ -156,10 +53,11 @@ def partition(
starting_page_number: int = 1,
**kwargs: Any,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default
parameters for each partitioning function. Use the document-type specific partitioning
functions if you need access to additional kwarg options.
"""Partitions a document into its constituent elements.
Uses libmagic to determine the file's type and route it to the appropriate partitioning
function. Applies the default parameters for each partitioning function. Use the document-type
specific partitioning functions if you need access to additional kwarg options.
Parameters
----------
@ -272,7 +170,7 @@ def partition(
languages = check_language_args(languages or [], ocr_languages)
if url is not None:
file, filetype = file_and_type_from_url(
file, file_type = file_and_type_from_url(
url=url,
content_type=content_type,
headers=headers,
@ -285,7 +183,7 @@ def partition(
"The headers kwarg is set but the url kwarg is not. "
"The headers kwarg will be ignored.",
)
filetype = detect_filetype(
file_type = detect_filetype(
filename=filename,
file=file,
file_filename=metadata_filename,
@ -297,14 +195,16 @@ def partition(
file.seek(0)
infer_table_structure = decide_table_extraction(
filetype,
file_type,
skip_infer_table_types,
pdf_infer_table_structure,
)
if filetype == FileType.CSV:
_partition_csv = _get_partition_with_extras("csv")
elements = _partition_csv(
partitioner_loader = _PartitionerLoader()
if file_type == FileType.CSV:
partition_csv = partitioner_loader.get(file_type)
elements = partition_csv(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
@ -312,9 +212,9 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.DOC:
_partition_doc = _get_partition_with_extras("doc")
elements = _partition_doc(
elif file_type == FileType.DOC:
partition_doc = partitioner_loader.get(file_type)
elements = partition_doc(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
@ -324,9 +224,9 @@ def partition(
strategy=strategy,
**kwargs,
)
elif filetype == FileType.DOCX:
_partition_docx = _get_partition_with_extras("docx")
elements = _partition_docx(
elif file_type == FileType.DOCX:
partition_docx = partitioner_loader.get(file_type)
elements = partition_docx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
@ -336,7 +236,8 @@ def partition(
strategy=strategy,
**kwargs,
)
elif filetype == FileType.EML:
elif file_type == FileType.EML:
partition_email = partitioner_loader.get(file_type)
elements = partition_email(
filename=filename,
file=file,
@ -345,9 +246,9 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.EPUB:
_partition_epub = _get_partition_with_extras("epub")
elements = _partition_epub(
elif file_type == FileType.EPUB:
partition_epub = partitioner_loader.get(file_type)
elements = partition_epub(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -356,7 +257,8 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.HTML:
elif file_type == FileType.HTML:
partition_html = partitioner_loader.get(file_type)
elements = partition_html(
filename=filename,
file=file,
@ -366,9 +268,9 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype in IMAGE_FILETYPES:
_partition_image = _get_partition_with_extras("image")
elements = _partition_image(
elif file_type.partitioner_shortname == "image":
partition_image = partitioner_loader.get(file_type)
elements = partition_image(
filename=filename,
file=file,
url=None,
@ -384,16 +286,17 @@ def partition(
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.JSON:
elif file_type == FileType.JSON:
if not is_json_processable(filename=filename, file=file):
raise ValueError(
"Detected a JSON file that does not conform to the Unstructured schema. "
"partition_json currently only processes serialized Unstructured output.",
)
partition_json = partitioner_loader.get(file_type)
elements = partition_json(filename=filename, file=file, **kwargs)
elif filetype == FileType.MD:
_partition_md = _get_partition_with_extras("md")
elements = _partition_md(
elif file_type == FileType.MD:
partition_md = partitioner_loader.get(file_type)
elements = partition_md(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -402,18 +305,18 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.MSG:
_partition_msg = _get_partition_with_extras("msg")
elements = _partition_msg(
elif file_type == FileType.MSG:
partition_msg = partitioner_loader.get(file_type)
elements = partition_msg(
filename=filename,
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.ODT:
_partition_odt = _get_partition_with_extras("odt")
elements = _partition_odt(
elif file_type == FileType.ODT:
partition_odt = partitioner_loader.get(file_type)
elements = partition_odt(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
@ -423,9 +326,9 @@ def partition(
strategy=strategy,
**kwargs,
)
elif filetype == FileType.ORG:
_partition_org = _get_partition_with_extras("org")
elements = _partition_org(
elif file_type == FileType.ORG:
partition_org = partitioner_loader.get(file_type)
elements = partition_org(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -433,9 +336,9 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.PDF:
_partition_pdf = _get_partition_with_extras("pdf")
elements = _partition_pdf(
elif file_type == FileType.PDF:
partition_pdf = partitioner_loader.get(file_type)
elements = partition_pdf(
filename=filename,
file=file,
url=None,
@ -451,9 +354,9 @@ def partition(
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.PPT:
_partition_ppt = _get_partition_with_extras("ppt")
elements = _partition_ppt(
elif file_type == FileType.PPT:
partition_ppt = partitioner_loader.get(file_type)
elements = partition_ppt(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -463,9 +366,9 @@ def partition(
strategy=strategy,
**kwargs,
)
elif filetype == FileType.PPTX:
_partition_pptx = _get_partition_with_extras("pptx")
elements = _partition_pptx(
elif file_type == FileType.PPTX:
partition_pptx = partitioner_loader.get(file_type)
elements = partition_pptx(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -476,9 +379,9 @@ def partition(
strategy=strategy,
**kwargs,
)
elif filetype == FileType.RST:
_partition_rst = _get_partition_with_extras("rst")
elements = _partition_rst(
elif file_type == FileType.RST:
partition_rst = partitioner_loader.get(file_type)
elements = partition_rst(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -487,9 +390,9 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.RTF:
_partition_rtf = _get_partition_with_extras("rtf")
elements = _partition_rtf(
elif file_type == FileType.RTF:
partition_rtf = partitioner_loader.get(file_type)
elements = partition_rtf(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
@ -498,16 +401,17 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.TSV:
_partition_tsv = _get_partition_with_extras("tsv")
elements = _partition_tsv(
elif file_type == FileType.TSV:
partition_tsv = partitioner_loader.get(file_type)
elements = partition_tsv(
filename=filename,
file=file,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.TXT:
elif file_type == FileType.TXT:
partition_text = partitioner_loader.get(file_type)
elements = partition_text(
filename=filename,
file=file,
@ -517,9 +421,9 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype in (FileType.XLS, FileType.XLSX):
_partition_xlsx = _get_partition_with_extras("xlsx")
elements = _partition_xlsx(
elif file_type in (FileType.XLS, FileType.XLSX):
partition_xlsx = partitioner_loader.get(file_type)
elements = partition_xlsx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
@ -528,7 +432,8 @@ def partition(
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.XML:
elif file_type == FileType.XML:
partition_xml = partitioner_loader.get(file_type)
elements = partition_xml(
filename=filename,
file=file,
@ -538,11 +443,11 @@ def partition(
detect_language_per_element=detect_language_per_element,
**kwargs,
)
elif filetype == FileType.EMPTY:
elif file_type == FileType.EMPTY:
elements = []
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
raise ValueError(f"{msg}. The {file_type} file type is not supported in partition.")
for element in elements:
element.metadata.url = url
@ -551,7 +456,7 @@ def partition(
out_filetype = FileType.from_mime_type(content_type)
element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None
else:
element.metadata.filetype = filetype.mime_type
element.metadata.filetype = file_type.mime_type
return elements
@ -562,7 +467,7 @@ def file_and_type_from_url(
headers: dict[str, str] = {},
ssl_verify: bool = True,
request_timeout: Optional[int] = None,
) -> tuple[io.BytesIO, Optional[FileType]]:
) -> tuple[io.BytesIO, FileType]:
response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
file = io.BytesIO(response.content)
@ -590,3 +495,51 @@ def decide_table_extraction(
return pdf_infer_table_structure or doc_type not in skip_infer_table_types
return doc_type not in skip_infer_table_types
class _PartitionerLoader:
    """Looks up the partitioner function for a file-type, verifying its dependencies first.

    `partition()` uses this so it does not need to know whether the Python environment it is
    running in has every optional dependency installed for a particular partitioner.

    `.get()` returns the partitioner for a file-type and raises `ImportError` when one or more
    of the packages that partitioner depends on are not installed. The error message names the
    `pip install` extra that enables the partitioner, in place of whatever exception would
    otherwise surface deeper in the partitioner when the missing package is first accessed.
    """

    # -- cache of loaded partitioners; a class attribute, so it is shared by every instance --
    _partitioners: dict[FileType, Partitioner] = {}

    def get(self, file_type: FileType) -> Partitioner:
        """Return the partitioner function for `file_type`, loading it on first request.

        Raises `ImportError` when a package dependency of that file-type is not installed.
        """
        cache = self._partitioners
        if file_type in cache:
            return cache[file_type]

        # -- not cached yet: load it, and remember it only when loading succeeded --
        partitioner = cache[file_type] = self._load_partitioner(file_type)
        return partitioner

    def _load_partitioner(self, file_type: FileType) -> Partitioner:
        """Import and return the partitioner for `file_type` once its dependencies check out."""
        # -- `all()` stops at the first package that is not importable --
        dependencies = file_type.importable_package_dependencies
        if not all(dependency_exists(pkg_name) for pkg_name in dependencies):
            raise ImportError(
                f"{file_type.partitioner_function_name}() is not available because one or"
                f" more dependencies are not installed. Use:"
                f' pip install "unstructured[{file_type.extra_name}]" (including quotes)'
                f" to install the required dependencies",
            )

        # -- a non-partitionable file-type reaching this point is a programming error --
        assert file_type.is_partitionable

        # -- import the partitioner's module by name and pull the function off it --
        module = importlib.import_module(file_type.partitioner_module_qname)
        return getattr(module, file_type.partitioner_function_name)