# llama-hub/tests/tests_github_repo/test_github_client.py

from llama_index import Document
import httpx
import pytest
import asyncio
import base64
import os
from unittest.mock import MagicMock, AsyncMock, call
import unittest
from typing import List, Tuple

# Uncomment the skip below to disable this module when network access or a
# GitHub API token is unavailable.
# pytest.skip(
#     "Skip by default due to dependence on network request and github api token.",
#     allow_module_level=True,
# )

from loader_hub.github_repo.utils import (
    BufferedAsyncIterator,
    BufferedGitBlobDataIterator,
)

from loader_hub.github_repo.github_client import (
    GithubClient,
    GitBlobResponseModel,
    GitTreeResponseModel,
)

from loader_hub.github_repo.base import GithubRepositoryReader


@pytest.fixture
def github_client():
    """Provide a verbose ``GithubClient`` built from ``GITHUB_API_TOKEN``.

    The token is read from the environment; ``os.getenv`` yields ``None``
    when the variable is not set, and that value is passed through as-is.
    """
    token = os.getenv("GITHUB_API_TOKEN")
    client = GithubClient(github_token=token, verbose=True)
    return client


@pytest.mark.asyncio
async def test_github_client(github_client):
    """Check ``GithubClient`` branch, commit, tree and blob lookups.

    Uses the ``github_client`` fixture against the ``emptycrown/llama-hub``
    repository and a pinned commit, asserting on the URLs, SHAs, first-level
    tree entries and the decoded content of ``test_requirements.txt``.

    NOTE(review): presumably this issues live GitHub API requests and needs a
    valid token in the environment -- confirm before enabling in CI.
    """
    owner = "emptycrown"
    repo = "llama-hub"
    branch = "main"
    # Points to "Add spotify reader":
    # https://github.com/emptycrown/llama-hub/commit/0cd691322e5244b48b68e3588d1343eb53f3a112
    commit_sha = "0cd691322e5244b48b68e3588d1343eb53f3a112"

    # test get_branch
    branch_data = await github_client.get_branch(owner, repo, branch)
    assert branch_data.name == branch
    assert (
        branch_data._links.self
        == f"https://api.github.com/repos/{owner}/{repo}/branches/{branch}"
    ), "Branch self link is incorrect"
    assert (
        branch_data._links.html
        == f"https://github.com/{owner}/{repo}/tree/{branch}"
    ), "Branch html link is incorrect"

    # test get_commit
    commit_data = await github_client.get_commit(owner, repo, commit_sha)
    assert commit_data.sha == commit_sha, "Commit sha is incorrect"
    assert (
        commit_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}"
    ), "Commit url is incorrect"

    # test get_tree
    tree_sha = commit_data.commit.tree.sha
    tree_data = await github_client.get_tree(owner, repo, tree_sha)
    assert (
        tree_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree_sha}"
    ), "Tree url is incorrect"
    assert tree_data.sha == tree_sha, "Tree sha is incorrect"

    # test the entries in the first depth of the tree
    expected_files_in_first_depth_of_the_tree: List[Tuple[str, str]] = [
        ("test_requirements.txt", "blob"),
        ("README.md", "blob"),
        ("Makefile", "blob"),
        (".gitignore", "blob"),
        ("tests", "tree"),
        ("loader_hub", "tree"),
        (".github", "tree"),
    ]
    # Build the (path, type) pairs once instead of once per expected entry.
    actual_files_in_first_depth_of_the_tree: List[Tuple[str, str]] = [
        (tree_file.path, tree_file.type) for tree_file in tree_data.tree
    ]

    # All the expected files should be in the first depth of the tree and
    # vice versa.
    assert len(actual_files_in_first_depth_of_the_tree) == len(
        expected_files_in_first_depth_of_the_tree
    ), "The number of files in the first depth of the tree is incorrect"
    for file in expected_files_in_first_depth_of_the_tree:
        assert (
            file in actual_files_in_first_depth_of_the_tree
        ), f"{file} is not in the first depth of the tree"
    # checking the opposite
    for tree_obj_path, tree_obj_type in actual_files_in_first_depth_of_the_tree:
        assert (
            tree_obj_path,
            tree_obj_type,
        ) in expected_files_in_first_depth_of_the_tree, (
            f"{tree_obj_path} is not in the expected files"
        )

    # find test_requirements.txt in the tree; the checks above already
    # established that an entry with this path exists
    test_requirements_txt = [
        tree_obj
        for tree_obj in tree_data.tree
        if tree_obj.path == "test_requirements.txt"
    ][0]

    # test get_blob
    blob_data = await github_client.get_blob(
        owner, repo, test_requirements_txt.sha
    )
    assert blob_data.encoding == "base64", "Blob encoding is incorrect"
    assert (
        blob_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/git/blobs/{test_requirements_txt.sha}"
    ), "Blob url is incorrect"
    assert blob_data.sha == test_requirements_txt.sha, "Blob sha is incorrect"

    # decode blob content base64-decoded string to utf-8
    decoded_blob_content = base64.b64decode(blob_data.content).decode("utf-8")

    expected_decoded_blob_content = """

# For testing
pytest==7.2.1
pytest-dotenv==0.5.2
# TODO: remove gpt_index after migration
https://github.com/jerryjliu/gpt_index/archive/master.zip

llama-index

# For linting
# linting stubs
types-requests==2.28.11.8
# formatting
black==22.12.0
isort==5.11.4
"""
    # Compare the non-empty lines of both texts. The length is asserted first
    # because ``zip`` stops at the shorter input and would otherwise hide
    # missing or extra lines.
    actual_lines = [
        line for line in decoded_blob_content.splitlines() if line != ""
    ]
    expected_lines = [
        line
        for line in expected_decoded_blob_content.splitlines()
        if line != ""
    ]
    assert len(actual_lines) == len(
        expected_lines
    ), "The number of non-empty lines in the decoded blob is incorrect"
    for actual_line, expected_line in zip(actual_lines, expected_lines):
        assert (
            actual_line == expected_line
        ), f"{actual_line} is not equal to {expected_line}"


class TestGithubRepositoryReader(unittest.TestCase):
    def setUp(self):
        """Create a reader wired to a ``MagicMock`` client before each test."""
        # No truncation of assertion diffs.
        self.maxDiff = None
        self.owner, self.repo = "owner", "repo"
        self.github_client = MagicMock()
        reader_options = {"verbose": True, "use_parser": False}
        self.reader = GithubRepositoryReader(
            self.github_client, self.owner, self.repo, **reader_options
        )

    def test__check_filter_directories(self):
        """A path under ``path/to`` passes an INCLUDE filter and fails EXCLUDE."""
        filter_type = GithubRepositoryReader.FilterType
        path = "path/to/some/file.py"

        # Each case pairs a filter mode with the assertion its result must meet.
        cases = (
            (filter_type.INCLUDE, self.assertTrue),
            (filter_type.EXCLUDE, self.assertFalse),
        )
        for mode, check in cases:
            self.reader._filter_directories = (["path/to"], mode)
            check(self.reader._check_filter_directories(path))

    def test__check_filter_file_extensions(self):
        """A ``.py`` path passes both ``.py`` INCLUDE and ``.txt`` EXCLUDE filters."""
        filter_type = GithubRepositoryReader.FilterType
        path = "path/to/some/file.py"

        # (extensions, mode) pairs; the path is expected to be accepted by both.
        cases = (
            ([".py"], filter_type.INCLUDE),
            ([".txt"], filter_type.EXCLUDE),
        )
        for extensions, mode in cases:
            self.reader._filter_file_extensions = (extensions, mode)
            self.assertTrue(self.reader._check_filter_file_extensions(path))

    def test__allow_tree_obj_with_files_only(self):
        """Check ``_allow_tree_obj`` over a fixed tree with two filter setups.

        Scenario 1 excludes two directories and several extensions.
        Scenario 2 includes a set of directories while still excluding some
        extensions. In both cases the paths accepted by the reader must match
        the expected list, ignoring order.
        """
        tree_obj_paths = [
            ("src", "tree"),
            ("src/file.py", "blob"),
            ("src/file.txt", "blob"),
            ("src/file.md", "blob"),
            ("src/Path.To.Folder", "tree"),
            ("src/Path.To.Folder/file1.js", "blob"),
            ("src/Path.To.Folder/file2.cpp", "blob"),
            ("src/Path.To.Folder/file4.rs", "blob"),
            ("src/Path.To.Folder/file5.ts", "blob"),
            ("src/Path.To.Folder/file6.h", "blob"),
            ("src/Path.To.Folder/file7.c", "blob"),
            ("src/Path.To.Folder/file8.java", "blob"),
            ("src/assets/file.png", "blob"),
            ("src/assets/file.jpg", "blob"),
            ("src/assets/file.jpeg", "blob"),
            ("src/assets/file.gif", "blob"),
            ("src/assets/file.svg", "blob"),
            ("src/assets/file.ico", "blob"),
            ("src/documents", "tree"),
            ("src/documents/file.pdf", "blob"),
            ("src/documents/file.doc", "blob"),
            ("src/documents/file.docx", "blob"),
            ("src/documents/file.xls", "blob"),
            ("src/documents/file.xlsx", "blob"),
            ("src/documents/file.ppt", "blob"),
            ("src/documents/file.pptx", "blob"),
            ("src/documents/file.odt", "blob"),
            ("src/documents/file.ods", "blob"),
            ("src/dir1", "tree"),
            ("src/dir1/file.js", "blob"),
            ("src/dir2", "tree"),
            ("src/dir2/file.py", "blob"),
            ("src/dir2/foo.cc", "blob"),
            ("src/dir2/foo.svg", "blob"),
            ("src/dir2/subdir", "tree"),
            ("src/dir2/subdir/file.cpp", "blob"),
            ("src/dir2/subdir/file.c", "blob"),
            ("src/dir2/subdir/file.h", "blob"),
            ("src/dir2/subdir/file.hpp", "blob"),
            ("src/dir2/subdir/file.java", "blob"),
            ("src/dir2/subdir/file.go", "blob"),
            ("src/sub", "tree"),
            ("src/sub/folder", "tree"),
            ("src/sub/folder/loading.svg", "blob"),
            ("src/sub/folder/loading.ico", "blob"),
            ("out", "tree"),
            ("out/file.py", "blob"),
            ("out/assets", "tree"),
            ("out/assets/file.png", "blob"),
            ("out/Path.To.Folder", "tree"),
            ("out/Path.To.Folder/file1.js", "blob"),
            ("out/sub", "tree"),
            ("out/sub/folder", "tree"),
            ("out/sub/folder/loading.svg", "blob"),
        ]

        def allowed_paths():
            # Paths the reader accepts under the currently configured filters.
            return [
                tree_obj_path
                for tree_obj_path, tree_obj_type in tree_obj_paths
                if self.reader._allow_tree_obj(tree_obj_path, tree_obj_type)
            ]

        # Scenario 1: exclude two directories and a set of extensions.
        self.reader._filter_directories = (
            ["src/assets", "src/documents"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )
        self.reader._filter_file_extensions = (
            [".svg", ".ico", ".cpp", ".c", ".h"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )

        expected_tree_obj_paths = [
            "src",
            "src/file.py",
            "src/file.txt",
            "src/file.md",
            "src/Path.To.Folder",
            "src/Path.To.Folder/file1.js",
            # "src/Path.To.Folder/file2.cpp",   # It should be excluded because of the extension in the filter
            "src/Path.To.Folder/file4.rs",
            "src/Path.To.Folder/file5.ts",
            # "src/Path.To.Folder/file6.h",
            # "src/Path.To.Folder/file7.c",
            "src/Path.To.Folder/file8.java",
            # "src/assets",                     # The whole directory should be excluded because of the filter
            # "src/assets/file.png",
            # "src/assets/file.jpg",
            # "src/assets/file.jpeg",
            # "src/assets/file.gif",
            # "src/assets/file.svg",
            # "src/assets/file.ico",
            # "src/documents",                  # The whole directory should be excluded because of the filter
            # "src/documents/file.pdf",
            # "src/documents/file.doc",
            # "src/documents/file.docx",
            # "src/documents/file.xls",
            # "src/documents/file.xlsx",
            # "src/documents/file.ppt",
            # "src/documents/file.pptx",
            # "src/documents/file.odt",
            # "src/documents/file.ods",
            "src/dir1",
            "src/dir1/file.js",
            "src/dir2",
            "src/dir2/file.py",
            "src/dir2/foo.cc",
            # "src/dir2/foo.svg",               # It should be excluded because of the extension in the filter
            "src/dir2/subdir",
            # "src/dir2/subdir/file.cpp",       # It should be excluded because of the extension in the filter
            # "src/dir2/subdir/file.c",         # It should be excluded because of the extension in the filter
            # "src/dir2/subdir/file.h",         # It should be excluded because of the extension in the filter
            "src/dir2/subdir/file.hpp",
            "src/dir2/subdir/file.java",
            "src/dir2/subdir/file.go",
            "src/sub",
            "src/sub/folder",
            # "src/sub/folder/loading.svg",     # It should be excluded because of the extension in the filter
            # "src/sub/folder/loading.ico",     # It should be excluded because of the extension in the filter
            "out",
            "out/file.py",
            "out/assets",
            "out/assets/file.png",
            "out/Path.To.Folder",
            "out/Path.To.Folder/file1.js",
            "out/sub",
            "out/sub/folder",
            # "out/sub/folder/loading.svg",     # It should be excluded because of the extension in the filter
        ]

        # The message must be passed via ``msg``; a trailing ", 'text'" after
        # the call would only build a discarded tuple.
        self.assertCountEqual(
            expected_tree_obj_paths,
            allowed_paths(),
            msg="Tree object paths are incorrect",
        )

        # Scenario 2: include only selected directories, still excluding some
        # extensions.
        self.reader._filter_directories = (
            [
                "src/dir2/subdir",
                "src/documents",
                "src/Path.To.Folder",
                "out/assets",
                "out/sub/folder",
            ],
            GithubRepositoryReader.FilterType.INCLUDE,
        )
        self.reader._filter_file_extensions = (
            [".png", ".svg", ".ico", ".jpg", ".java", ".doc", ".pptx"],
            GithubRepositoryReader.FilterType.EXCLUDE,
        )

        # The ancestor trees of the included directories ("out", "out/sub",
        # "src", "src/dir2") are expected to be allowed as well.
        expected_tree_obj_paths = [
            "out",
            "out/assets",
            # "out/assets/file.png",  # It should be excluded by extension
            "out/sub",
            "out/sub/folder",
            # "out/sub/folder/loading.svg", # It should be excluded by extension
            "src",
            "src/Path.To.Folder",
            "src/Path.To.Folder/file1.js",
            "src/Path.To.Folder/file2.cpp",
            "src/Path.To.Folder/file4.rs",
            "src/Path.To.Folder/file5.ts",
            "src/Path.To.Folder/file6.h",
            "src/Path.To.Folder/file7.c",
            # "src/Path.To.Folder/file8.java", # It should be excluded by extension
            "src/dir2",
            "src/dir2/subdir",
            "src/dir2/subdir/file.cpp",
            "src/dir2/subdir/file.c",
            "src/dir2/subdir/file.h",
            "src/dir2/subdir/file.hpp",
            # "src/dir2/subdir/file.java", # It should be excluded by extension
            "src/dir2/subdir/file.go",
            "src/documents",
            "src/documents/file.pdf",
            # "src/documents/file.doc", # It should be excluded by extension
            "src/documents/file.docx",
            "src/documents/file.xls",
            "src/documents/file.xlsx",
            "src/documents/file.ppt",
            # "src/documents/file.pptx", # It should be excluded by extension
            "src/documents/file.odt",
            "src/documents/file.ods",
        ]

        self.assertCountEqual(
            expected_tree_obj_paths,
            allowed_paths(),
            msg="Tree object paths are incorrect",
        )