mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

**Summary** Replace conditional explicit import of partitioner modules in `.partition.auto` with the new `_PartitionerLoader` class. This avoids unbound variable warnings and is much less noisy. `_PartitionerLoader` makes use of the new `FileType` property `.importable_package_dependencies` to determine whether all required packages are importable before dispatching the file to its partitioner. It uses `FileType.extra_name` to form a helpful error message when a dependency is not installed, so the caller knows which `pip install` extra to specify to remedy the error. `_PartitionerLoader` uses the `FileType` properties `.partitioner_module_qname` and `.partitioner_function_name` to load the partitioner once its dependencies are verified. Loaded partitioners are cached with module lifetime scope for efficiency.
202 lines
7.2 KiB
Python
202 lines
7.2 KiB
Python
"""Test suite for `unstructured.file_utils.filetype`."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from unstructured.file_utils.model import FileType
|
|
|
|
|
|
class DescribeFileType:
    """Unit-test suite for `unstructured.file_utils.model.FileType`.

    Each test is parametrized over a sample of `FileType` members and checks one
    classmethod or property of the enum against its expected value.
    """

    @pytest.mark.parametrize(
        ("ext", "file_type"),
        [
            (".bmp", FileType.BMP),
            (".html", FileType.HTML),
            (".eml", FileType.EML),
            (".p7s", FileType.EML),
            (".java", FileType.TXT),
        ],
    )
    def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType):
        """`.from_extension()` maps a dotted extension to its `FileType` member.

        More than one extension may map to the same member (`.eml` and `.p7s` are
        both expected to give `FileType.EML`).
        """
        assert FileType.from_extension(ext) is file_type

    @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."])
    def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str):
        """`.from_extension()` is expected to give `None` for these extensions."""
        assert FileType.from_extension(ext) is None

    @pytest.mark.parametrize(
        ("mime_type", "file_type"),
        [
            ("image/bmp", FileType.BMP),
            ("text/x-csv", FileType.CSV),
            ("application/msword", FileType.DOC),
            ("message/rfc822", FileType.EML),
            ("text/plain", FileType.TXT),
            ("text/yaml", FileType.TXT),
            ("application/xml", FileType.XML),
            ("text/xml", FileType.XML),
            ("inode/x-empty", FileType.EMPTY),
        ],
    )
    def it_can_recognize_a_file_type_from_a_mime_type(
        self, mime_type: str, file_type: FileType
    ):
        """`.from_mime_type()` maps a MIME-type string to its `FileType` member.

        The cases include MIME-types other than the one `.mime_type` reports for the
        member (e.g. `text/x-csv` for CSV and `text/xml` for XML).
        """
        assert FileType.from_mime_type(mime_type) is file_type

    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"])
    def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
        """`.from_mime_type()` is expected to give `None` for these MIME-types."""
        assert FileType.from_mime_type(mime_type) is None

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, ("unstructured_inference",)),
            (FileType.CSV, ("pandas",)),
            (FileType.DOC, ("docx",)),
            (FileType.EMPTY, ()),
            (FileType.HTML, ()),
            (FileType.ODT, ("docx", "pypandoc")),
            (FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
            (FileType.UNK, ()),
            (FileType.WAV, ()),
            (FileType.ZIP, ()),
        ],
    )
    def it_knows_which_importable_packages_its_partitioner_depends_on(
        self, file_type: FileType, expected_value: tuple[str, ...]
    ):
        """`.importable_package_dependencies` is a tuple of package names.

        The tuple is compared with `==`, so both membership and order are checked. An
        empty tuple is expected for members with no listed dependencies.
        """
        assert file_type.importable_package_dependencies == expected_value

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "image"),
            (FileType.DOC, "doc"),
            (FileType.DOCX, "docx"),
            (FileType.EML, None),
            (FileType.EMPTY, None),
            (FileType.MSG, "msg"),
            (FileType.PDF, "pdf"),
            (FileType.XLS, "xlsx"),
            (FileType.UNK, None),
            (FileType.WAV, None),
            (FileType.ZIP, None),
        ],
    )
    def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies(
        self, file_type: FileType, expected_value: str | None
    ):
        """`.extra_name` is the name of a pip extra, or `None` when there is none."""
        assert file_type.extra_name == expected_value

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, True),
            (FileType.CSV, True),
            (FileType.DOC, True),
            (FileType.EML, True),
            (FileType.JPG, True),
            (FileType.PDF, True),
            (FileType.PPTX, True),
            (FileType.WAV, False),
            (FileType.ZIP, False),
            (FileType.EMPTY, False),
            (FileType.UNK, False),
        ],
    )
    def it_knows_whether_files_of_its_type_are_directly_partitionable(
        self, file_type: FileType, expected_value: bool
    ):
        """`.is_partitionable` is a real `bool`.

        The identity comparison (`is`) means a merely truthy or falsy value would fail.
        """
        assert file_type.is_partitionable is expected_value

    @pytest.mark.parametrize(
        ("file_type", "mime_type"),
        [
            (FileType.BMP, "image/bmp"),
            (FileType.CSV, "text/csv"),
            (FileType.DOC, "application/msword"),
            (FileType.EML, "message/rfc822"),
            (FileType.HTML, "text/html"),
            (FileType.JPG, "image/jpeg"),
            (FileType.PDF, "application/pdf"),
            (FileType.TXT, "text/plain"),
            (FileType.XML, "application/xml"),
            (FileType.EMPTY, "inode/x-empty"),
            (FileType.UNK, "application/octet-stream"),
        ],
    )
    def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
        """`.mime_type` is the single MIME-type string reported for the member."""
        assert file_type.mime_type == mime_type

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "partition_image"),
            (FileType.CSV, "partition_csv"),
            (FileType.DOC, "partition_doc"),
            (FileType.DOCX, "partition_docx"),
            (FileType.JPG, "partition_image"),
            (FileType.PNG, "partition_image"),
            (FileType.TIFF, "partition_image"),
        ],
    )
    def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
        """`.partitioner_function_name` is the partitioner function's name.

        All of the image members in the cases are expected to share `partition_image`.
        """
        assert file_type.partitioner_function_name == expected_value

    @pytest.mark.parametrize(
        "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
    )
    def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
        self, file_type: FileType
    ):
        """Accessing `.partitioner_function_name` on these members raises `ValueError`."""
        with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
            # -- the bare attribute access is the operation under test --
            file_type.partitioner_function_name

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "unstructured.partition.image"),
            (FileType.CSV, "unstructured.partition.csv"),
            (FileType.DOC, "unstructured.partition.doc"),
            (FileType.DOCX, "unstructured.partition.docx"),
            (FileType.JPG, "unstructured.partition.image"),
            (FileType.PNG, "unstructured.partition.image"),
            (FileType.TIFF, "unstructured.partition.image"),
        ],
    )
    def it_knows_the_fully_qualified_name_of_its_partitioner_module(
        self, file_type: FileType, expected_value: str
    ):
        """`.partitioner_module_qname` is the dotted name of the partitioner module."""
        assert file_type.partitioner_module_qname == expected_value

    @pytest.mark.parametrize(
        "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
    )
    def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
        self, file_type: FileType
    ):
        """Accessing `.partitioner_module_qname` on these members raises `ValueError`."""
        with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
            # -- the bare attribute access is the operation under test --
            file_type.partitioner_module_qname

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "image"),
            (FileType.CSV, "csv"),
            (FileType.DOC, "doc"),
            (FileType.DOCX, "docx"),
            (FileType.JPG, "image"),
            (FileType.PNG, "image"),
            (FileType.TIFF, "image"),
            (FileType.XLS, "xlsx"),
            (FileType.XLSX, "xlsx"),
        ],
    )
    def it_provides_access_to_the_partitioner_shortname(
        self, file_type: FileType, expected_value: str
    ):
        """`.partitioner_shortname` is the short name of the member's partitioner.

        In the cases where both properties are exercised, it matches the last
        component of the expected `.partitioner_module_qname` value.
        """
        assert file_type.partitioner_shortname == expected_value
|