mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
71 lines
2.5 KiB
Python
71 lines
2.5 KiB
Python
![]() |
"""Test suite for `unstructured.file_utils.model`."""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from unstructured.file_utils.model import FileType
|
||
|
|
||
|
|
||
|
class DescribeFileType:
    """Unit-test suite for `unstructured.file_utils.model.FileType`.

    Covers the three lookups exercised below: `FileType.from_extension()`,
    `FileType.from_mime_type()` and the `.mime_type` property. Each successful lookup is paired
    with a `but_not_...` test asserting that the lookup returns `None` for an unrecognized value.
    """

    # -- .from_extension() -----------------------------------------------------------------------

    @pytest.mark.parametrize(
        ("ext", "file_type"),
        [
            (".bmp", FileType.BMP),
            (".html", FileType.HTML),
            # -- more than one extension can resolve to the same file type --
            (".eml", FileType.EML),
            (".p7s", FileType.EML),
            # -- a source-code extension is expected to resolve to plain text --
            (".java", FileType.TXT),
        ],
    )
    def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType):
        # -- identity comparison: the lookup must return the enum member itself --
        assert FileType.from_extension(ext) is file_type

    @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."])
    def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str):
        # -- includes the degenerate inputs: the empty string and a bare dot --
        assert FileType.from_extension(ext) is None

    # -- .from_mime_type() -----------------------------------------------------------------------

    @pytest.mark.parametrize(
        ("mime_type", "file_type"),
        [
            ("image/bmp", FileType.BMP),
            # -- "text/x-csv" is accepted here although CSV reports "text/csv" as its MIME type --
            ("text/x-csv", FileType.CSV),
            ("application/msword", FileType.DOC),
            ("message/rfc822", FileType.EML),
            ("text/plain", FileType.TXT),
            ("text/yaml", FileType.TXT),
            # -- both XML MIME types resolve to the same file type --
            ("application/xml", FileType.XML),
            ("text/xml", FileType.XML),
            ("inode/x-empty", FileType.EMPTY),
        ],
    )
    def it_can_recognize_a_file_type_from_a_mime_type(self, mime_type: str, file_type: FileType):
        assert FileType.from_mime_type(mime_type) is file_type

    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"])
    def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
        assert FileType.from_mime_type(mime_type) is None

    # -- .mime_type ------------------------------------------------------------------------------

    @pytest.mark.parametrize(
        ("file_type", "mime_type"),
        [
            (FileType.BMP, "image/bmp"),
            (FileType.CSV, "text/csv"),
            (FileType.DOC, "application/msword"),
            (FileType.EML, "message/rfc822"),
            (FileType.HTML, "text/html"),
            (FileType.JPG, "image/jpeg"),
            (FileType.PDF, "application/pdf"),
            (FileType.TXT, "text/plain"),
            (FileType.XML, "application/xml"),
            (FileType.EMPTY, "inode/x-empty"),
            (FileType.UNK, "application/octet-stream"),
        ],
    )
    def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
        # -- value comparison: `.mime_type` is compared to a plain `str` --
        assert file_type.mime_type == mime_type