mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

* Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
93 lines
2.9 KiB
Python
93 lines
2.9 KiB
Python
import os
|
|
import pathlib
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from unstructured.file_utils import exploration
|
|
from unstructured.file_utils.filetype import FileType
|
|
|
|
# Resolved absolute path of the directory containing this test module; the
# tests below join it with "test-file-contents.txt" to locate that fixture.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
|
|
|
|
def test_get_directory_file_info(tmpdir):
    """Check ``exploration.get_directory_file_info`` on a small directory tree.

    Creates ``file_info_test/directory1/filename1.txt`` and
    ``file_info_test/directory2/filename2.txt`` under the pytest-provided
    ``tmpdir``, then asserts that the result is a ``pandas.DataFrame`` whose
    ``filename`` column holds exactly those two names and that ``filesize`` is
    the only column left after averaging numeric columns per ``filetype``.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")

    for index in (1, 2):
        directory = os.path.join(file_info_test, f"directory{index}")
        # makedirs(exist_ok=True) replaces the check-then-mkdir pattern and
        # also creates the parent ``file_info_test`` directory when missing.
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, f"filename{index}.txt"), "w") as f:
            f.write("hello there!")

    file_info = exploration.get_directory_file_info(file_info_test)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    # numeric_only=True is required on pandas >= 2.0, where mean() no longer
    # silently drops non-numeric columns (such as the string ``filename``
    # column) and raises TypeError instead.
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
|
|
|
|
|
|
def test_get_file_info(tmpdir):
    """Check ``exploration.get_file_info`` on an explicit list of file paths.

    Creates ``file_info_test/directory1/filename1.txt`` and
    ``file_info_test/directory2/filename2.txt`` under the pytest-provided
    ``tmpdir``, passes both paths to ``get_file_info``, and asserts that the
    result is a ``pandas.DataFrame`` whose ``filename`` column holds exactly
    those two names and that ``filesize`` is the only column left after
    averaging numeric columns per ``filetype``.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")

    filenames = []
    for index in (1, 2):
        directory = os.path.join(file_info_test, f"directory{index}")
        # makedirs(exist_ok=True) replaces the check-then-mkdir pattern and
        # also creates the parent ``file_info_test`` directory when missing.
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, f"filename{index}.txt")
        with open(filename, "w") as f:
            f.write("hello there!")
        filenames.append(filename)

    file_info = exploration.get_file_info(filenames)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    # numeric_only=True is required on pandas >= 2.0, where mean() no longer
    # silently drops non-numeric columns (such as the string ``filename``
    # column) and raises TypeError instead.
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
|
|
|
|
|
|
def test_get_file_info_from_file_contents():
    """Check that file info built from in-memory contents reports ``FileType.EML``.

    Reads ``test-file-contents.txt`` from this module's directory, passes its
    text as a one-element ``file_contents`` list paired with the filename
    ``"test.eml"``, and asserts that the first ``filetype`` entry of the
    result equals ``FileType.EML``.
    """
    sample_path = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(sample_path) as handle:
        raw_text = handle.read()

    info = exploration.get_file_info_from_file_contents(
        file_contents=[raw_text],
        filenames=["test.eml"],
    )

    detected_type = info.filetype[0]
    assert detected_type == FileType.EML
|
|
|
|
|
|
def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
    """Check that mismatched ``file_contents``/``filenames`` lengths raise ``ValueError``.

    Reads ``test-file-contents.txt`` from this module's directory and passes
    one content string alongside two filenames; the call is expected to raise
    ``ValueError``.
    """
    sample_path = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(sample_path) as handle:
        single_content = [handle.read()]

    # Two names for one content entry: the lists deliberately differ in length.
    two_names = ["test.eml", "test2.eml"]

    with pytest.raises(ValueError):
        exploration.get_file_info_from_file_contents(
            file_contents=single_content,
            filenames=two_names,
        )
|