mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 07:27:34 +00:00

**Summary**

Elaborate the `FileType` enum to be a complete descriptor of file-types. Add methods to allow `STR_TO_FILETYPE`, `EXT_TO_FILETYPE` and `FILETYPE_TO_MIMETYPE` mappings to be replaced, removing those redundant and noisy declarations. In the process, fix some lingering file-type identification and `.metadata.filetype` errors that had been skipped in the tests.

**Additional Context**

Gathering the various attributes of a file-type into the `FileType` enum eliminates the duplication inherent in the separate `STR_TO_FILETYPE` etc. mappings and makes access to those values convenient for callers. These attributes include what MIME-type a file-type should record in metadata and what MIME-types and extensions map to that file-type. These values and others are made available as methods and properties directly on the `FileType` class and members. Because all attributes are defined in the `FileType` enum there is no risk of inconsistency across multiple locations and any changes happen in one and only one place. Further attributes and methods will be added in later commits to support other file-type related operations like mapping to a partitioner and verifying its dependencies are installed.
98 lines
3.2 KiB
Python
98 lines
3.2 KiB
Python
import os
|
|
import pathlib
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from unstructured.file_utils import exploration
|
|
from unstructured.file_utils.model import FileType
|
|
|
|
# Absolute path of the directory holding this test module; the tests below use
# it to locate the `test-file-contents.txt` fixture.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# True when `/.dockerenv` exists. The tests below treat that as the signal that
# they are running inside a Docker container and skip themselves accordingly.
is_in_docker = os.path.exists("/.dockerenv")
|
|
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_directory_file_info(tmpdir):
    """Check `get_directory_file_info()` on a small two-level directory tree.

    Builds `file_info_test/directory{1,2}/filename{1,2}.txt` under `tmpdir`,
    then verifies that the call returns a `pandas.DataFrame` listing both
    files and that the only numeric column left after grouping by `filetype`
    is `filesize`.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")
    os.makedirs(file_info_test, exist_ok=True)

    # One text file per sub-directory, created in the same order each run.
    layout = (("directory1", "filename1.txt"), ("directory2", "filename2.txt"))
    for directory_name, file_name in layout:
        directory = os.path.join(file_info_test, directory_name)
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, file_name), "w") as f:
            f.write("hello there!")

    file_info = exploration.get_directory_file_info(file_info_test)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    # `numeric_only=True` drops non-numeric columns, so only `filesize` should survive.
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
|
|
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_file_info(tmpdir):
    """Check `get_file_info()` when given an explicit list of file paths.

    Creates two text files in separate sub-directories of `tmpdir`, passes
    their paths to `get_file_info()`, then verifies that the result is a
    `pandas.DataFrame` listing both files and that the only numeric column
    left after grouping by `filetype` is `filesize`.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")
    os.makedirs(file_info_test, exist_ok=True)

    # One text file per sub-directory; remember each path for the call under test.
    layout = (("directory1", "filename1.txt"), ("directory2", "filename2.txt"))
    filenames = []
    for directory_name, file_name in layout:
        directory = os.path.join(file_info_test, directory_name)
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, file_name)
        with open(filename, "w") as f:
            f.write("hello there!")
        filenames.append(filename)

    file_info = exploration.get_file_info(filenames)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    # `numeric_only=True` drops non-numeric columns, so only `filesize` should survive.
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
|
|
|
|
|
|
def test_get_file_info_from_file_contents():
    """Check that file contents paired with an `.eml` filename report `FileType.EML`.

    Reads the `test-file-contents.txt` fixture that sits next to this module
    and passes its text, as a one-element list, together with the single
    filename `test.eml`.
    """
    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(file_contents_filename) as f:
        file_contents = [f.read()]

    file_info = exploration.get_file_info_from_file_contents(
        file_contents=file_contents,
        filenames=["test.eml"],
    )

    assert file_info.filetype[0] == FileType.EML
|
|
|
|
|
|
def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
    """Check that mismatched `file_contents` / `filenames` lengths raise `ValueError`.

    One file-contents entry is supplied against two filenames.
    """
    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(file_contents_filename) as f:
        file_contents = [f.read()]

    # Keep only the call under test inside the `raises` context.
    with pytest.raises(ValueError):
        exploration.get_file_info_from_file_contents(
            file_contents=file_contents,
            filenames=["test.eml", "test2.eml"],
        )
|