llama-hub/tests/test_file.py
2023-02-15 20:19:37 -08:00

209 lines
8.1 KiB
Python

"""Test file reader."""
import sys
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict
sys.path.append(Path(__file__).parent.parent)
from loader_hub.file.base import SimpleDirectoryReader
def test_recursive() -> None:
"""Test simple directory reader in recursive mode."""
# test recursive
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.txt", "w") as f:
f.write("test1")
with TemporaryDirectory(dir=tmp_dir) as tmp_sub_dir:
with open(f"{tmp_sub_dir}/test2.txt", "w") as f:
f.write("test2")
with TemporaryDirectory(dir=tmp_sub_dir) as tmp_sub_sub_dir:
with open(f"{tmp_sub_sub_dir}/test3.txt", "w") as f:
f.write("test3")
with open(f"{tmp_sub_sub_dir}/test4.txt", "w") as f:
f.write("test4")
reader = SimpleDirectoryReader(tmp_dir, recursive=True)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 4
assert set(input_file_names) == {
"test1.txt",
"test2.txt",
"test3.txt",
"test4.txt",
}
# test that recursive=False works
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.txt", "w") as f:
f.write("test1")
with TemporaryDirectory(dir=tmp_dir) as tmp_sub_dir:
with open(f"{tmp_sub_dir}/test2.txt", "w") as f:
f.write("test2")
with TemporaryDirectory(dir=tmp_sub_dir) as tmp_sub_sub_dir:
with open(f"{tmp_sub_sub_dir}/test3.txt", "w") as f:
f.write("test3")
with open(f"{tmp_sub_sub_dir}/test4.txt", "w") as f:
f.write("test4")
reader = SimpleDirectoryReader(tmp_dir, recursive=False)
input_file_names = [f.name for f in reader.input_files]
print(reader.input_files)
assert len(reader.input_files) == 1
assert set(input_file_names) == {
"test1.txt",
}
# test recursive with .md files
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.md", "w") as f:
f.write("test1")
with TemporaryDirectory(dir=tmp_dir) as tmp_sub_dir:
with open(f"{tmp_sub_dir}/test2.txt", "w") as f:
f.write("test2")
with TemporaryDirectory(dir=tmp_sub_dir) as tmp_sub_sub_dir:
with open(f"{tmp_sub_sub_dir}/test3.md", "w") as f:
f.write("test3")
with open(f"{tmp_sub_sub_dir}/test4.txt", "w") as f:
f.write("test4")
reader = SimpleDirectoryReader(
tmp_dir, recursive=True, required_exts=[".md"]
)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 2
assert set(input_file_names) == {
"test1.md",
"test3.md",
}
def test_nonrecursive() -> None:
"""Test simple non-recursive directory reader."""
# test nonrecursive
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.txt", "w") as f:
f.write("test1")
with open(f"{tmp_dir}/test2.txt", "w") as f:
f.write("test2")
with open(f"{tmp_dir}/test3.txt", "w") as f:
f.write("test3")
with open(f"{tmp_dir}/test4.txt", "w") as f:
f.write("test4")
with open(f"{tmp_dir}/.test5.txt", "w") as f:
f.write("test5")
# test exclude hidden
reader = SimpleDirectoryReader(tmp_dir, recursive=False)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 4
assert input_file_names == ["test1.txt", "test2.txt", "test3.txt", "test4.txt"]
# test include hidden
reader = SimpleDirectoryReader(tmp_dir, recursive=False, exclude_hidden=False)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 5
assert input_file_names == [
".test5.txt",
"test1.txt",
"test2.txt",
"test3.txt",
"test4.txt",
]
def test_required_exts() -> None:
"""Test extension filter."""
# test nonrecursive
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.txt", "w") as f:
f.write("test1")
with open(f"{tmp_dir}/test2.md", "w") as f:
f.write("test2")
with open(f"{tmp_dir}/test3.tmp", "w") as f:
f.write("test3")
with open(f"{tmp_dir}/test4.json", "w") as f:
f.write("test4")
with open(f"{tmp_dir}/test5.json", "w") as f:
f.write("test5")
# test exclude hidden
reader = SimpleDirectoryReader(tmp_dir, required_exts=[".json"])
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 2
assert input_file_names == ["test4.json", "test5.json"]
def test_num_files_limit() -> None:
"""Test num files limit."""
# test num_files_limit (with recursion)
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.txt", "w") as f:
f.write("test1")
with TemporaryDirectory(dir=tmp_dir) as tmp_sub_dir:
with open(f"{tmp_sub_dir}/test2.txt", "w") as f:
f.write("test2")
with open(f"{tmp_sub_dir}/test3.txt", "w") as f:
f.write("test3")
with TemporaryDirectory(dir=tmp_sub_dir) as tmp_sub_sub_dir:
with open(f"{tmp_sub_sub_dir}/test4.txt", "w") as f:
f.write("test4")
reader = SimpleDirectoryReader(
tmp_dir, recursive=True, num_files_limit=2
)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 2
assert set(input_file_names) == {
"test1.txt",
"test2.txt",
}
reader = SimpleDirectoryReader(
tmp_dir, recursive=True, num_files_limit=3
)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 3
assert set(input_file_names) == {
"test1.txt",
"test2.txt",
"test3.txt",
}
reader = SimpleDirectoryReader(
tmp_dir, recursive=True, num_files_limit=4
)
input_file_names = [f.name for f in reader.input_files]
assert len(reader.input_files) == 4
assert set(input_file_names) == {
"test1.txt",
"test2.txt",
"test3.txt",
"test4.txt",
}
def test_file_metadata() -> None:
"""Test if file metadata is added to Document."""
# test file_metadata
with TemporaryDirectory() as tmp_dir:
with open(f"{tmp_dir}/test1.txt", "w") as f:
f.write("test1")
with open(f"{tmp_dir}/test2.txt", "w") as f:
f.write("test2")
with open(f"{tmp_dir}/test3.txt", "w") as f:
f.write("test3")
test_author = "Bruce Wayne"
def filename_to_metadata(filename: str) -> Dict[str, Any]:
return {"filename": filename, "author": test_author}
reader = SimpleDirectoryReader(tmp_dir, file_metadata=filename_to_metadata)
documents = reader.load_data()
for d in documents:
assert d.extra_info is not None and d.extra_info["author"] == test_author