llama-hub/tests/test_github_reader.py
ahmetkca 5a27264db1
Add GitHub Repository Reader (#34)
* add GitHub repository reader; test a new way to download the loader

* test imports when downloaded from gpt_index

* Refactor(Github Repo): Move github_client and utils to modules

* Moved github_client.py and utils.py from loader_hub/github_repo to modules/github_repo
* Updated import statements in base.py to reflect the new location

* temp

* Refactor(GithubRepositoryReader): Add github_client argument

- Add github_client argument to GithubRepositoryReader constructor
- Set default value for github_client argument
- Update docstring to reflect changes

* Refactor(Github Repo): Update init file

- Remove imports of base, github_client and utils
- Add imports of GithubRepositoryReader and GithubClient
- Update __all__ to include the new imports

* Fix(library): Update library.json

- Updated library.json to include __init__.py file

* Refactor(GithubRepositoryReader): Add filter for directories and files

- Add filter for directories and files in GithubRepositoryReader
- Ignore directories and files that do not pass the filter
- Print out if directory or file is ignored due to filter

* Refactor(BaseReader): Check filter files

- Refactor `_check_filter_files` to `_check_filter_file_extensions` in `BaseReader`
- Ignoring files due to filter

* Docs(FilterType): Add documentation for FilterType enum

- Add documentation for FilterType enum
- Explain what the enum is used for
- Describe the attributes of the enum

* Add(GPT Index): Add GPT Index example

Add GPT Index example to README
- Set OPENAI_API_KEY environment variable
- Download GithubRepositoryReader module
- Create GithubClient and GithubRepositoryReader
- Load data from Github Repository
- Create GPTSimpleVectorIndex
- Query the index

* Add(GPT Index): Add GPT Index example

Add GPT Index example to README
- Set OPENAI_API_KEY environment variable
- Download GithubRepositoryReader module
- Create GithubClient and GithubRepositoryReader
- Load data from Github Repository
- Create GPTSimpleVectorIndex
- Query the index

* Add(GPT Index): Add GPT Index example

Add GPT Index example to README
- Set OPENAI_API_KEY environment variable
- Download GithubRepositoryReader module
- Create GithubClient and GithubRepositoryReader
- Load data from Github Repository
- Create GPTSimpleVectorIndex
- Query the index

* change the import path for extras

* change import path for extra files to absolute

* Add test for GithubClient; it currently does not use mocks, which is not ideal

* Update test_github_reader.py

* Update test_github_reader.py

---------

Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
2023-02-24 23:41:48 -08:00

105 lines
4.2 KiB
Python

# Standard library
import base64
import os
import unittest
from typing import List, Tuple
from unittest.mock import AsyncMock, MagicMock

# Third party
import pytest

# Skip the whole module by default: these tests make live network requests
# to the GitHub API. Remove this to test changes to GithubRepositoryReader.
# NOTE: allow_module_level=True is required — plain pytest.skip() outside a
# test function raises an error instead of skipping.
pytest.skip(
    "Skipped by default due to network requests to the GitHub API.",
    allow_module_level=True,
)

# Project imports (placed after the skip so collection never needs them
# when the module is skipped, matching the original ordering).
from gpt_index import Document
from loader_hub.github_repo import GithubClient, GithubRepositoryReader
@pytest.fixture
def github_client():
    """Fixture: a verbose GithubClient authenticated from GITHUB_API_TOKEN.

    Reads the token from the ``GITHUB_API_TOKEN`` environment variable
    (``None`` when unset) and returns a client with verbose output enabled.
    """
    token = os.getenv("GITHUB_API_TOKEN")
    client = GithubClient(github_token=token, verbose=True)
    return client
@pytest.mark.asyncio
async def test_github_client(github_client):
    """End-to-end smoke test of GithubClient against the live llama-hub repo.

    Exercises ``get_branch``, ``get_commit``, ``get_tree`` and ``get_blob``
    in sequence, asserting that each response's name/sha/url fields are
    internally consistent, that the repo root contains exactly the expected
    entries, and that ``test_requirements.txt`` decodes to the expected text.

    Args:
        github_client: the ``github_client`` fixture defined in this module.
    """
    owner = "emptycrown"
    repo = "llama-hub"
    branch = "main"
    # Points to "Add spotify reader",
    # https://github.com/emptycrown/llama-hub/commit/0cd691322e5244b48b68e3588d1343eb53f3a112
    commit_sha = "0cd691322e5244b48b68e3588d1343eb53f3a112"

    # test get_branch
    branch_data = await github_client.get_branch(owner, repo, branch)
    assert branch_data.name == branch
    assert (
        branch_data._links.self
        == f"https://api.github.com/repos/{owner}/{repo}/branches/{branch}"
    ), "Branch self link is incorrect"
    assert (
        branch_data._links.html == f"https://github.com/{owner}/{repo}/tree/{branch}"
    ), "Branch html link is incorrect"

    # test get_commit
    commit_data = await github_client.get_commit(owner, repo, commit_sha)
    assert commit_data.sha == commit_sha, "Commit sha is incorrect"
    assert (
        commit_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}"
    ), "Commit url is incorrect"

    # test get_tree
    tree_data = await github_client.get_tree(owner, repo, commit_data.commit.tree.sha)
    assert (
        tree_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/git/trees/{commit_data.commit.tree.sha}"
    ), "Tree url is incorrect"
    assert tree_data.sha == commit_data.commit.tree.sha, "Tree sha is incorrect"

    expected_files_in_first_depth_of_the_tree: List[Tuple[str, str]] = [
        ("test_requirements.txt", "blob"),
        ("README.md", "blob"),
        ("Makefile", "blob"),
        (".gitignore", "blob"),
        ("tests", "tree"),
        ("loader_hub", "tree"),
        (".github", "tree"),
    ]
    # Check that the first depth of the tree has exactly the expected files:
    # same count, every expected entry present, and no unexpected entry.
    assert len(tree_data.tree) == len(
        expected_files_in_first_depth_of_the_tree
    ), "The number of files in the first depth of the tree is incorrect"
    # Build the (path, type) list once instead of on every loop iteration.
    actual_entries = [(tree_file.path, tree_file.type) for tree_file in tree_data.tree]
    for file in expected_files_in_first_depth_of_the_tree:
        assert file in actual_entries, f"{file} is not in the first depth of the tree"
    # checking the opposite direction
    for tree_obj in tree_data.tree:
        assert (
            tree_obj.path,
            tree_obj.type,
        ) in expected_files_in_first_depth_of_the_tree, (
            f"{tree_obj.path} is not in the expected files"
        )

    # find test_requirements.txt in the tree
    test_requirements_txt = next(
        tree_obj
        for tree_obj in tree_data.tree
        if tree_obj.path == "test_requirements.txt"
    )

    # test get_blob
    blob_data = await github_client.get_blob(owner, repo, test_requirements_txt.sha)
    assert blob_data.encoding == "base64", "Blob encoding is incorrect"
    assert (
        blob_data.url
        == f"https://api.github.com/repos/{owner}/{repo}/git/blobs/{test_requirements_txt.sha}"
    ), "Blob url is incorrect"
    assert blob_data.sha == test_requirements_txt.sha, "Blob sha is incorrect"

    # decode blob content base64-decoded string to utf-8
    decoded_blob_content = base64.b64decode(blob_data.content).decode("utf-8")

    expected_decoded_blob_content = """
# For testing
pytest==7.2.1
pytest-dotenv==0.5.2
# TODO: remove gpt_index after migration
https://github.com/jerryjliu/gpt_index/archive/master.zip
llama-index
# For linting
# linting stubs
types-requests==2.28.11.8
# formatting
black==22.12.0
isort==5.11.4
"""
    # Compare the non-empty lines of actual vs expected content pairwise.
    actual_lines = [line for line in decoded_blob_content.splitlines() if line != ""]
    expected_lines = [
        line for line in expected_decoded_blob_content.splitlines() if line != ""
    ]
    for actual, expected in zip(actual_lines, expected_lines):
        assert actual == expected, f"{actual} is not equal to {expected}"