mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-07-31 12:52:23 +00:00
209 lines
8.1 KiB
Python
209 lines
8.1 KiB
Python
"""Test file reader."""
|
|
import sys
|
|
from pathlib import Path
|
|
from tempfile import TemporaryDirectory
|
|
from typing import Any, Dict
|
|
|
|
# Make the repository root importable so `loader_hub` resolves when the tests
# are run from a checkout. sys.path entries must be plain strings: the import
# machinery skips any other type, so a bare Path object would have no effect.
sys.path.append(str(Path(__file__).parent.parent))
|
|
|
|
from loader_hub.file.base import SimpleDirectoryReader
|
|
|
|
|
|
def _build_nested_tree(root: str, top_ext: str, deep_ext: str) -> None:
    """Create the three-level fixture tree shared by the scenarios below.

    Resulting layout (sub-directory names are random, chosen by ``mkdtemp``)::

        root/test1<top_ext>
        root/<sub>/test2.txt
        root/<sub>/<sub_sub>/test3<deep_ext>
        root/<sub>/<sub_sub>/test4.txt

    Each file contains its own stem (``"test1"`` ... ``"test4"``). Nothing is
    deleted here; cleanup is left to whoever owns ``root``.

    Args:
        root: Existing directory to populate.
        top_ext: Extension (with leading dot) for ``test1``.
        deep_ext: Extension (with leading dot) for ``test3``.
    """
    # Local import: the module header only brings in TemporaryDirectory.
    from tempfile import mkdtemp

    sub_dir = Path(mkdtemp(dir=root))
    sub_sub_dir = Path(mkdtemp(dir=str(sub_dir)))

    (Path(root) / f"test1{top_ext}").write_text("test1")
    (sub_dir / "test2.txt").write_text("test2")
    (sub_sub_dir / f"test3{deep_ext}").write_text("test3")
    (sub_sub_dir / "test4.txt").write_text("test4")


def test_recursive() -> None:
    """Test simple directory reader in recursive mode.

    Three scenarios run against the same three-level tree:
    recursive listing, non-recursive listing, and recursive listing
    restricted to ``.md`` files.
    """
    # recursive=True: files from every level are listed
    with TemporaryDirectory() as tmp_dir:
        _build_nested_tree(tmp_dir, top_ext=".txt", deep_ext=".txt")

        reader = SimpleDirectoryReader(tmp_dir, recursive=True)
        input_file_names = [f.name for f in reader.input_files]
        assert len(reader.input_files) == 4
        assert set(input_file_names) == {
            "test1.txt",
            "test2.txt",
            "test3.txt",
            "test4.txt",
        }

    # recursive=False: only the top-level file is listed
    with TemporaryDirectory() as tmp_dir:
        _build_nested_tree(tmp_dir, top_ext=".txt", deep_ext=".txt")

        reader = SimpleDirectoryReader(tmp_dir, recursive=False)
        input_file_names = [f.name for f in reader.input_files]
        assert len(reader.input_files) == 1
        assert set(input_file_names) == {
            "test1.txt",
        }

    # recursive=True combined with an extension filter: only .md files remain
    with TemporaryDirectory() as tmp_dir:
        _build_nested_tree(tmp_dir, top_ext=".md", deep_ext=".md")

        reader = SimpleDirectoryReader(
            tmp_dir, recursive=True, required_exts=[".md"]
        )
        input_file_names = [f.name for f in reader.input_files]
        assert len(reader.input_files) == 2
        assert set(input_file_names) == {
            "test1.md",
            "test3.md",
        }
|
|
|
|
|
|
def test_nonrecursive() -> None:
    """Check flat (non-recursive) listing with and without hidden files."""
    visible_names = ["test1.txt", "test2.txt", "test3.txt", "test4.txt"]
    hidden_name = ".test5.txt"

    with TemporaryDirectory() as tmp_dir:
        # Four regular files plus one dot-prefixed file, all in the same
        # directory; file N holds the text "testN".
        for index, file_name in enumerate([*visible_names, hidden_name], start=1):
            with open(f"{tmp_dir}/{file_name}", "w") as handle:
                handle.write(f"test{index}")

        # Default settings: the dot-prefixed file must not be listed.
        reader = SimpleDirectoryReader(tmp_dir, recursive=False)
        listed_names = [path.name for path in reader.input_files]
        assert len(reader.input_files) == 4
        assert listed_names == visible_names

        # exclude_hidden=False: the dot-prefixed file is listed, and first.
        reader = SimpleDirectoryReader(tmp_dir, recursive=False, exclude_hidden=False)
        listed_names = [path.name for path in reader.input_files]
        assert len(reader.input_files) == 5
        assert listed_names == [hidden_name, *visible_names]
|
|
|
|
|
|
def test_required_exts() -> None:
    """Check that ``required_exts`` limits the listing to matching files."""
    # file name -> content; two of the five files carry the .json suffix
    fixture_files = {
        "test1.txt": "test1",
        "test2.md": "test2",
        "test3.tmp": "test3",
        "test4.json": "test4",
        "test5.json": "test5",
    }

    with TemporaryDirectory() as tmp_dir:
        for file_name, content in fixture_files.items():
            with open(f"{tmp_dir}/{file_name}", "w") as handle:
                handle.write(content)

        # Only the .json files should be listed, in this order.
        reader = SimpleDirectoryReader(tmp_dir, required_exts=[".json"])
        kept_names = [path.name for path in reader.input_files]
        assert len(reader.input_files) == 2
        assert kept_names == ["test4.json", "test5.json"]
|
|
|
|
|
|
def test_num_files_limit() -> None:
    """Check that ``num_files_limit`` caps a recursive listing."""
    # limit -> file names the reader is expected to list under that limit
    expected_by_limit = {
        2: {"test1.txt", "test2.txt"},
        3: {"test1.txt", "test2.txt", "test3.txt"},
        4: {"test1.txt", "test2.txt", "test3.txt", "test4.txt"},
    }

    with TemporaryDirectory() as tmp_dir:
        with open(f"{tmp_dir}/test1.txt", "w") as handle:
            handle.write("test1")

        with TemporaryDirectory(dir=tmp_dir) as tmp_sub_dir:
            for stem in ("test2", "test3"):
                with open(f"{tmp_sub_dir}/{stem}.txt", "w") as handle:
                    handle.write(stem)

            with TemporaryDirectory(dir=tmp_sub_dir) as tmp_sub_sub_dir:
                with open(f"{tmp_sub_sub_dir}/test4.txt", "w") as handle:
                    handle.write("test4")

                # The readers must be created while all three directory
                # levels still exist, hence inside the innermost block.
                for limit, expected_names in expected_by_limit.items():
                    reader = SimpleDirectoryReader(
                        tmp_dir, recursive=True, num_files_limit=limit
                    )
                    listed_names = [path.name for path in reader.input_files]
                    assert len(reader.input_files) == limit
                    assert set(listed_names) == expected_names
|
|
|
|
|
|
def test_file_metadata() -> None:
    """Check that the ``file_metadata`` callback output reaches each document."""
    expected_author = "Bruce Wayne"

    def build_metadata(filename: str) -> Dict[str, Any]:
        # Same author for every file; the filename is passed through as given.
        return {"filename": filename, "author": expected_author}

    with TemporaryDirectory() as tmp_dir:
        # Three text files, each containing its own stem.
        for stem in ("test1", "test2", "test3"):
            with open(f"{tmp_dir}/{stem}.txt", "w") as handle:
                handle.write(stem)

        reader = SimpleDirectoryReader(tmp_dir, file_metadata=build_metadata)

        # Every loaded document must carry the author set by the callback.
        for document in reader.load_data():
            assert document.extra_info is not None
            assert document.extra_info["author"] == expected_author
|