mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00

The purpose of this PR is to enable registering new file types dynamically. It does so through two primary functions: 1. `unstructured.file_utils.model.create_file_type`, which registers a new `FileType` enum member so that the rest of unstructured can understand the new type of file; 2. `unstructured.file_utils.model.register_partitioner`, a decorator that registers a partitioner function to run for a file type. --------- Co-authored-by: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
246 lines
8.7 KiB
Python
246 lines
8.7 KiB
Python
"""Test suite for `unstructured.file_utils.filetype`."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
|
|
|
|
|
|
class DescribeFileType:
    """Unit-test suite for `unstructured.file_utils.model.FileType`.

    Test methods use BDD-style names (`it_...`, `but_...`, `and_...`) instead of the `test_`
    prefix. NOTE(review): presumably the project's pytest configuration is set up to collect
    these names -- confirm if tests in this class are not being picked up.
    """

    # -- .__lt__() ----------------------------------------------

    def it_is_a_collection_ordered_by_name_and_can_be_sorted(self):
        """FileType is a total order on name, e.g. FileType.A < FileType.B."""
        assert FileType.EML < FileType.HTML < FileType.XML

    # -- .from_extension() --------------------------------------

    @pytest.mark.parametrize(
        ("ext", "file_type"),
        [
            (".bmp", FileType.BMP),
            (".html", FileType.HTML),
            (".eml", FileType.EML),
            (".p7s", FileType.EML),
            (".java", FileType.TXT),
        ],
    )
    def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType):
        # -- every case above maps to a concrete member; the `None` outcomes are covered by the
        # -- companion test below
        assert FileType.from_extension(ext) is file_type

    @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", ".", None])
    def but_not_when_that_extension_is_empty_or_None_or_not_registered(self, ext: str | None):
        assert FileType.from_extension(ext) is None

    # -- .from_mime_type() --------------------------------------

    @pytest.mark.parametrize(
        ("mime_type", "file_type"),
        [
            ("image/bmp", FileType.BMP),
            ("text/x-csv", FileType.CSV),
            ("application/msword", FileType.DOC),
            ("message/rfc822", FileType.EML),
            ("text/plain", FileType.TXT),
            ("text/yaml", FileType.TXT),
            ("application/xml", FileType.XML),
            ("text/xml", FileType.XML),
            ("inode/x-empty", FileType.EMPTY),
        ],
    )
    def it_can_recognize_a_file_type_from_a_mime_type(self, mime_type: str, file_type: FileType):
        # -- every case above maps to a concrete member; the `None` outcomes are covered by the
        # -- companion test below
        assert FileType.from_mime_type(mime_type) is file_type

    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None])
    def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
        self, mime_type: str | None
    ):
        assert FileType.from_mime_type(mime_type) is None

    # -- .extra_name --------------------------------------------

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "image"),
            (FileType.DOC, "doc"),
            (FileType.DOCX, "docx"),
            (FileType.EML, None),
            (FileType.EMPTY, None),
            (FileType.MSG, "msg"),
            (FileType.PDF, "pdf"),
            (FileType.XLS, "xlsx"),
            (FileType.UNK, None),
            (FileType.WAV, None),
            (FileType.ZIP, None),
        ],
    )
    def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies(
        self, file_type: FileType, expected_value: str | None
    ):
        assert file_type.extra_name == expected_value

    # -- .importable_package_dependencies -----------------------

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, ("unstructured_inference",)),
            (FileType.CSV, ("pandas",)),
            (FileType.DOC, ("docx",)),
            (FileType.EMPTY, ()),
            (FileType.HTML, ()),
            (FileType.ODT, ("docx", "pypandoc")),
            (FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
            (FileType.UNK, ()),
            (FileType.WAV, ()),
            (FileType.ZIP, ()),
        ],
    )
    def it_knows_which_importable_packages_its_partitioner_depends_on(
        self, file_type: FileType, expected_value: tuple[str, ...]
    ):
        assert file_type.importable_package_dependencies == expected_value

    # -- .is_partitionable --------------------------------------

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, True),
            (FileType.CSV, True),
            (FileType.DOC, True),
            (FileType.EML, True),
            (FileType.JPG, True),
            (FileType.PDF, True),
            (FileType.PPTX, True),
            (FileType.WAV, False),
            (FileType.ZIP, False),
            (FileType.EMPTY, False),
            (FileType.UNK, False),
        ],
    )
    def it_knows_whether_files_of_its_type_are_directly_partitionable(
        self, file_type: FileType, expected_value: bool
    ):
        # -- identity comparison: the property must yield the `True`/`False` singletons, not
        # -- merely a truthy/falsy value
        assert file_type.is_partitionable is expected_value

    # -- .mime_type ---------------------------------------------

    @pytest.mark.parametrize(
        ("file_type", "mime_type"),
        [
            (FileType.BMP, "image/bmp"),
            (FileType.CSV, "text/csv"),
            (FileType.DOC, "application/msword"),
            (FileType.EML, "message/rfc822"),
            (FileType.HTML, "text/html"),
            (FileType.JPG, "image/jpeg"),
            (FileType.PDF, "application/pdf"),
            (FileType.TXT, "text/plain"),
            (FileType.XML, "application/xml"),
            (FileType.EMPTY, "inode/x-empty"),
            (FileType.UNK, "application/octet-stream"),
        ],
    )
    def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
        assert file_type.mime_type == mime_type

    # -- .partitioner_function_name -----------------------------

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "partition_image"),
            (FileType.CSV, "partition_csv"),
            (FileType.DOC, "partition_doc"),
            (FileType.DOCX, "partition_docx"),
            (FileType.JPG, "partition_image"),
            (FileType.PNG, "partition_image"),
            (FileType.TIFF, "partition_image"),
        ],
    )
    def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
        assert file_type.partitioner_function_name == expected_value

    @pytest.mark.parametrize(
        "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
    )
    def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
        self, file_type: FileType
    ):
        with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
            # -- merely reading the property is expected to raise --
            file_type.partitioner_function_name

    # -- .partitioner_module_qname ------------------------------

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "unstructured.partition.image"),
            (FileType.CSV, "unstructured.partition.csv"),
            (FileType.DOC, "unstructured.partition.doc"),
            (FileType.DOCX, "unstructured.partition.docx"),
            (FileType.JPG, "unstructured.partition.image"),
            (FileType.PNG, "unstructured.partition.image"),
            (FileType.TIFF, "unstructured.partition.image"),
        ],
    )
    def it_knows_the_fully_qualified_name_of_its_partitioner_module(
        self, file_type: FileType, expected_value: str
    ):
        assert file_type.partitioner_module_qname == expected_value

    @pytest.mark.parametrize(
        "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
    )
    def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
        self, file_type: FileType
    ):
        with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
            # -- merely reading the property is expected to raise --
            file_type.partitioner_module_qname

    # -- .partitioner_shortname ---------------------------------

    @pytest.mark.parametrize(
        ("file_type", "expected_value"),
        [
            (FileType.BMP, "image"),
            (FileType.CSV, "csv"),
            (FileType.DOC, "doc"),
            (FileType.DOCX, "docx"),
            (FileType.JPG, "image"),
            (FileType.PNG, "image"),
            (FileType.TIFF, "image"),
            (FileType.XLS, "xlsx"),
            (FileType.XLSX, "xlsx"),
        ],
    )
    def it_provides_access_to_the_partitioner_shortname(
        self, file_type: FileType, expected_value: str
    ):
        assert file_type.partitioner_shortname == expected_value
|
|
|
|
|
|
def test_create_file_type():
    """A file type made with `create_file_type()` is found by extension and by MIME type."""
    foo_type = create_file_type(
        "FOO",
        canonical_mime_type="application/foo",
        extensions=[".foo"],
    )

    # -- each lookup must hand back the very object `create_file_type()` returned --
    by_extension = FileType.from_extension(".foo")
    assert by_extension is foo_type

    by_mime_type = FileType.from_mime_type("application/foo")
    assert by_mime_type is foo_type
|
|
|
|
|
|
def test_register_partitioner():
    """A function decorated with `register_partitioner()` becomes the file type's partitioner.

    After decoration the file type must report the decorated function's name and the expected
    partitioner module qualified name.
    """
    foo_type = create_file_type(
        "FOO",
        canonical_mime_type="application/foo",
        extensions=[".foo"],
    )

    # -- the function name matters: it is what `.partitioner_function_name` is compared to --
    @register_partitioner(foo_type)
    def partition_foo():
        pass

    registered_name = foo_type.partitioner_function_name
    assert registered_name == "partition_foo"

    # -- presumably the import path of this test module -- confirm if the test layout changes --
    registered_module = foo_type.partitioner_module_qname
    assert registered_module == "test_unstructured.file_utils.test_model"
|