mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-01 05:12:36 +00:00
393 lines
14 KiB
Python
393 lines
14 KiB
Python
from llama_index import Document
|
|
import httpx
|
|
import pytest
|
|
import asyncio
|
|
import base64
|
|
import os
|
|
from unittest.mock import MagicMock, AsyncMock, call
|
|
import unittest
|
|
from typing import List, Tuple
|
|
|
|
# Uncomment the pytest.skip(...) call below to skip this module by default (it depends on network requests and a GitHub API token).
|
|
# pytest.skip(
|
|
# "Skip by default due to dependence on network request and github api token.",
|
|
# allow_module_level=True,
|
|
# )
|
|
|
|
from loader_hub.github_repo.utils import (
|
|
BufferedAsyncIterator,
|
|
BufferedGitBlobDataIterator,
|
|
)
|
|
|
|
from loader_hub.github_repo.github_client import (
|
|
GithubClient,
|
|
GitBlobResponseModel,
|
|
GitTreeResponseModel,
|
|
)
|
|
|
|
from loader_hub.github_repo.base import GithubRepositoryReader
|
|
|
|
|
|
@pytest.fixture
def github_client():
    """Return a verbose ``GithubClient`` built with the ``GITHUB_API_TOKEN`` env value."""
    token = os.getenv("GITHUB_API_TOKEN")
    return GithubClient(github_token=token, verbose=True)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_github_client(github_client):
    """Exercise ``GithubClient`` end to end: branch -> commit -> tree -> blob.

    Uses the ``github_client`` fixture and a pinned commit of
    ``emptycrown/llama-hub`` and checks the fields returned by ``get_branch``,
    ``get_commit``, ``get_tree`` and ``get_blob``.
    """
    owner = "emptycrown"
    repo = "llama-hub"
    branch = "main"
    # Points to "Add spotify reader":
    # https://github.com/emptycrown/llama-hub/commit/0cd691322e5244b48b68e3588d1343eb53f3a112
    commit_sha = "0cd691322e5244b48b68e3588d1343eb53f3a112"

    # test get_branch
    branch_data = await github_client.get_branch(owner, repo, branch)
    assert branch_data.name == branch
    assert (
        branch_data._links.self
        == f"https://api.github.com/repos/{owner}/{repo}/branches/{branch}"
    ), "Branch self link is incorrect"
    assert (
        branch_data._links.html
        == f"https://github.com/{owner}/{repo}/tree/{branch}"
    ), "Branch html link is incorrect"

    # test get_commit
    commit_data = await github_client.get_commit(owner, repo, commit_sha)
    assert commit_data.sha == commit_sha, "Commit sha is incorrect"
    assert (
        commit_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}"
    ), "Commit url is incorrect"

    # test get_tree
    tree_data = await github_client.get_tree(
        owner, repo, commit_data.commit.tree.sha
    )
    assert (
        tree_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/git/trees/{commit_data.commit.tree.sha}"
    ), "Tree url is incorrect"
    assert (
        tree_data.sha == commit_data.commit.tree.sha
    ), "Tree sha is incorrect"

    # test the content of the first depth of the tree
    expected_files_in_first_depth_of_the_tree: List[Tuple[str, str]] = [
        ("test_requirements.txt", "blob"),
        ("README.md", "blob"),
        ("Makefile", "blob"),
        (".gitignore", "blob"),
        ("tests", "tree"),
        ("loader_hub", "tree"),
        (".github", "tree"),
    ]
    # Check if the first depth of the tree has the expected files. All the
    # expected files should be in the first depth of the tree and vice versa.
    assert len(tree_data.tree) == len(
        expected_files_in_first_depth_of_the_tree
    ), "The number of files in the first depth of the tree is incorrect"
    # Built once instead of on every loop iteration.
    actual_files_in_first_depth = [
        (tree_file.path, tree_file.type) for tree_file in tree_data.tree
    ]
    for file in expected_files_in_first_depth_of_the_tree:
        assert (
            file in actual_files_in_first_depth
        ), f"{file} is not in the first depth of the tree"
    # checking the opposite
    for tree_obj in tree_data.tree:
        assert (
            tree_obj.path,
            tree_obj.type,
        ) in expected_files_in_first_depth_of_the_tree, (
            f"{tree_obj.path} is not in the expected files"
        )

    # find test_requirements.txt in the tree (presence was asserted above)
    test_requirements_txt = next(
        tree_obj
        for tree_obj in tree_data.tree
        if tree_obj.path == "test_requirements.txt"
    )

    # test get_blob
    blob_data = await github_client.get_blob(
        owner, repo, test_requirements_txt.sha
    )
    assert blob_data.encoding == "base64", "Blob encoding is incorrect"
    assert (
        blob_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/git/blobs/{test_requirements_txt.sha}"
    ), "Blob url is incorrect"
    assert blob_data.sha == test_requirements_txt.sha, "Blob sha is incorrect"

    # decode blob content base64-decoded string to utf-8
    decoded_blob_content = base64.b64decode(blob_data.content).decode("utf-8")

    expected_decoded_blob_content = """

# For testing
pytest==7.2.1
pytest-dotenv==0.5.2
# TODO: remove gpt_index after migration
https://github.com/jerryjliu/gpt_index/archive/master.zip

llama-index

# For linting
# linting stubs
types-requests==2.28.11.8
# formatting
black==22.12.0
isort==5.11.4
"""
    # Check that the decoded blob content is correct, ignoring empty lines.
    # The line counts are compared first because zip() stops at the shorter
    # input, which would otherwise hide missing or extra lines.
    actual_lines = [line for line in decoded_blob_content.splitlines() if line != ""]
    expected_lines = [
        line for line in expected_decoded_blob_content.splitlines() if line != ""
    ]
    assert len(actual_lines) == len(expected_lines), (
        f"Blob has {len(actual_lines)} non-empty lines, "
        f"expected {len(expected_lines)}"
    )
    for actual_line, expected_line in zip(actual_lines, expected_lines):
        assert (
            actual_line == expected_line
        ), f"{actual_line} is not equal to {expected_line}"
|
|
|
|
|
|
class TestGithubRepositoryReader(unittest.TestCase):
    """Tests for the directory / file-extension filters of ``GithubRepositoryReader``."""

    def setUp(self):
        """Create a reader backed by a ``MagicMock`` client for every test."""
        self.maxDiff = None
        self.github_client = MagicMock()
        self.owner = "owner"
        self.repo = "repo"
        self.reader = GithubRepositoryReader(
            self.github_client,
            self.owner,
            self.repo,
            verbose=True,
            use_parser=False,
        )

    def _allowed_paths(self, tree_objs):
        """Return the paths from ``(path, type)`` pairs accepted by ``_allow_tree_obj``."""
        return [
            tree_obj_path
            for tree_obj_path, tree_obj_type in tree_objs
            if self.reader._allow_tree_obj(tree_obj_path, tree_obj_type)
        ]

    def test__check_filter_directories(self):
        """A path under ``path/to`` passes an INCLUDE filter and fails an EXCLUDE one."""
        tree_obj_path = "path/to/some/file.py"
        self.reader._filter_directories = (
            ["path/to"],
            GithubRepositoryReader.FilterType.INCLUDE,
        )
        self.assertTrue(self.reader._check_filter_directories(tree_obj_path))

        self.reader._filter_directories = (
            ["path/to"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )
        self.assertFalse(self.reader._check_filter_directories(tree_obj_path))

    def test__check_filter_file_extensions(self):
        """A ``.py`` file passes both ``.py`` INCLUDE and ``.txt`` EXCLUDE filters."""
        tree_obj_path = "path/to/some/file.py"
        self.reader._filter_file_extensions = (
            [".py"],
            GithubRepositoryReader.FilterType.INCLUDE,
        )
        self.assertTrue(
            self.reader._check_filter_file_extensions(tree_obj_path)
        )

        self.reader._filter_file_extensions = (
            [".txt"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )
        self.assertTrue(
            self.reader._check_filter_file_extensions(tree_obj_path)
        )

    def test__allow_tree_obj_with_files_only(self):
        """Check ``_allow_tree_obj`` with combined directory and extension filters.

        Scenario 1 excludes two directories and several extensions.
        Scenario 2 includes a set of directories (their parent trees are
        expected to be allowed too) while still excluding some extensions.
        """
        tree_obj_paths = [
            ("src", "tree"),
            ("src/file.py", "blob"),
            ("src/file.txt", "blob"),
            ("src/file.md", "blob"),
            ("src/Path.To.Folder", "tree"),
            ("src/Path.To.Folder/file1.js", "blob"),
            ("src/Path.To.Folder/file2.cpp", "blob"),
            ("src/Path.To.Folder/file4.rs", "blob"),
            ("src/Path.To.Folder/file5.ts", "blob"),
            ("src/Path.To.Folder/file6.h", "blob"),
            ("src/Path.To.Folder/file7.c", "blob"),
            ("src/Path.To.Folder/file8.java", "blob"),
            ("src/assets/file.png", "blob"),
            ("src/assets/file.jpg", "blob"),
            ("src/assets/file.jpeg", "blob"),
            ("src/assets/file.gif", "blob"),
            ("src/assets/file.svg", "blob"),
            ("src/assets/file.ico", "blob"),
            ("src/documents", "tree"),
            ("src/documents/file.pdf", "blob"),
            ("src/documents/file.doc", "blob"),
            ("src/documents/file.docx", "blob"),
            ("src/documents/file.xls", "blob"),
            ("src/documents/file.xlsx", "blob"),
            ("src/documents/file.ppt", "blob"),
            ("src/documents/file.pptx", "blob"),
            ("src/documents/file.odt", "blob"),
            ("src/documents/file.ods", "blob"),
            ("src/dir1", "tree"),
            ("src/dir1/file.js", "blob"),
            ("src/dir2", "tree"),
            ("src/dir2/file.py", "blob"),
            ("src/dir2/foo.cc", "blob"),
            ("src/dir2/foo.svg", "blob"),
            ("src/dir2/subdir", "tree"),
            ("src/dir2/subdir/file.cpp", "blob"),
            ("src/dir2/subdir/file.c", "blob"),
            ("src/dir2/subdir/file.h", "blob"),
            ("src/dir2/subdir/file.hpp", "blob"),
            ("src/dir2/subdir/file.java", "blob"),
            ("src/dir2/subdir/file.go", "blob"),
            ("src/sub", "tree"),
            ("src/sub/folder", "tree"),
            ("src/sub/folder/loading.svg", "blob"),
            ("src/sub/folder/loading.ico", "blob"),
            ("out", "tree"),
            ("out/file.py", "blob"),
            ("out/assets", "tree"),
            ("out/assets/file.png", "blob"),
            ("out/Path.To.Folder", "tree"),
            ("out/Path.To.Folder/file1.js", "blob"),
            ("out/sub", "tree"),
            ("out/sub/folder", "tree"),
            ("out/sub/folder/loading.svg", "blob"),
        ]

        # Scenario 1: exclude two directories and a set of extensions.
        self.reader._filter_directories = (
            ["src/assets", "src/documents"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )
        self.reader._filter_file_extensions = (
            [".svg", ".ico", ".cpp", ".c", ".h"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )

        expected_tree_obj_paths = [
            "src",
            "src/file.py",
            "src/file.txt",
            "src/file.md",
            "src/Path.To.Folder",
            "src/Path.To.Folder/file1.js",
            # "src/Path.To.Folder/file2.cpp" is excluded by the extension filter
            "src/Path.To.Folder/file4.rs",
            "src/Path.To.Folder/file5.ts",
            # "src/Path.To.Folder/file6.h" is excluded by the extension filter
            # "src/Path.To.Folder/file7.c" is excluded by the extension filter
            "src/Path.To.Folder/file8.java",
            # everything under "src/assets" is excluded by the directory filter
            # "src/documents" and everything under it is excluded by the directory filter
            "src/dir1",
            "src/dir1/file.js",
            "src/dir2",
            "src/dir2/file.py",
            "src/dir2/foo.cc",
            # "src/dir2/foo.svg" is excluded by the extension filter
            "src/dir2/subdir",
            # "src/dir2/subdir/file.cpp" is excluded by the extension filter
            # "src/dir2/subdir/file.c" is excluded by the extension filter
            # "src/dir2/subdir/file.h" is excluded by the extension filter
            "src/dir2/subdir/file.hpp",
            "src/dir2/subdir/file.java",
            "src/dir2/subdir/file.go",
            "src/sub",
            "src/sub/folder",
            # "src/sub/folder/loading.svg" is excluded by the extension filter
            # "src/sub/folder/loading.ico" is excluded by the extension filter
            "out",
            "out/file.py",
            "out/assets",
            "out/assets/file.png",
            "out/Path.To.Folder",
            "out/Path.To.Folder/file1.js",
            "out/sub",
            "out/sub/folder",
            # "out/sub/folder/loading.svg" is excluded by the extension filter
        ]

        actual_tree_obj_paths = self._allowed_paths(tree_obj_paths)

        # The message must be passed as ``msg``; a trailing ``, "..."`` after
        # the call would only build an unused tuple.
        self.assertCountEqual(
            expected_tree_obj_paths,
            actual_tree_obj_paths,
            msg="Tree object paths are incorrect",
        )

        # Scenario 2: include selected directories, still exclude extensions.
        self.reader._filter_directories = (
            [
                "src/dir2/subdir",
                "src/documents",
                "src/Path.To.Folder",
                "out/assets",
                "out/sub/folder",
            ],
            GithubRepositoryReader.FilterType.INCLUDE,
        )
        # NOTE(review): "jpg" has no leading dot, unlike the other entries --
        # confirm whether that is intentional. No expectation below depends on it.
        self.reader._filter_file_extensions = (
            [".png", ".svg", ".ico", "jpg", ".java", ".doc", ".pptx"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )

        expected_tree_obj_paths = [
            "out",
            "out/assets",
            # "out/assets/file.png" is excluded by extension
            "out/sub",
            "out/sub/folder",
            "src",
            # "out/sub/folder/loading.svg" is excluded by extension
            "src/Path.To.Folder",
            "src/Path.To.Folder/file1.js",
            "src/Path.To.Folder/file2.cpp",
            "src/Path.To.Folder/file4.rs",
            "src/Path.To.Folder/file5.ts",
            "src/Path.To.Folder/file6.h",
            "src/Path.To.Folder/file7.c",
            # "src/Path.To.Folder/file8.java" is excluded by extension
            "src/dir2",
            "src/dir2/subdir",
            "src/dir2/subdir/file.cpp",
            "src/dir2/subdir/file.c",
            "src/dir2/subdir/file.h",
            "src/dir2/subdir/file.hpp",
            # "src/dir2/subdir/file.java" is excluded by extension
            "src/dir2/subdir/file.go",
            "src/documents",
            "src/documents/file.pdf",
            # "src/documents/file.doc" is excluded by extension
            "src/documents/file.docx",
            "src/documents/file.xls",
            "src/documents/file.xlsx",
            "src/documents/file.ppt",
            # "src/documents/file.pptx" is excluded by extension
            "src/documents/file.odt",
            "src/documents/file.ods",
        ]

        actual_tree_obj_paths = self._allowed_paths(tree_obj_paths)

        self.assertCountEqual(
            expected_tree_obj_paths,
            actual_tree_obj_paths,
            msg="Tree object paths are incorrect",
        )
|