mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-11-14 17:13:32 +00:00
test imports when downloaded from gpt_index
This commit is contained in:
parent
fbffd7e4fa
commit
f36f2e795e
@ -23,24 +23,19 @@ from typing import Any, Callable, List, Optional, Tuple
|
|||||||
from gpt_index.readers.base import BaseReader
|
from gpt_index.readers.base import BaseReader
|
||||||
from gpt_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
|
from gpt_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
|
||||||
|
|
||||||
import importlib
|
from github_client import (
|
||||||
|
BaseGithubClient,
|
||||||
|
GitBranchResponseModel,
|
||||||
|
GitCommitResponseModel,
|
||||||
|
GithubClient,
|
||||||
|
GitTreeResponseModel,
|
||||||
|
)
|
||||||
|
|
||||||
importlib.import_module
|
from utils import (
|
||||||
|
BufferedGitBlobDataIterator,
|
||||||
# from github_client import (
|
print_if_verbose,
|
||||||
# BaseGithubClient,
|
get_file_extension,
|
||||||
# GitBranchResponseModel,
|
)
|
||||||
# GitCommitResponseModel,
|
|
||||||
# GithubClient,
|
|
||||||
# GitTreeResponseModel,
|
|
||||||
# )
|
|
||||||
|
|
||||||
import utils
|
|
||||||
# from utils import (
|
|
||||||
# BufferedGitBlobDataIterator,
|
|
||||||
# print_if_verbose,
|
|
||||||
# get_file_extension,
|
|
||||||
# )
|
|
||||||
from gpt_index.readers.schema.base import Document
|
from gpt_index.readers.schema.base import Document
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@ -70,7 +65,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
self,
|
self,
|
||||||
owner: str,
|
owner: str,
|
||||||
repo: str,
|
repo: str,
|
||||||
github_client: github_client.BaseGithubClient = github_client.GithubClient(),
|
github_client: BaseGithubClient = GithubClient(),
|
||||||
use_parser: bool = True,
|
use_parser: bool = True,
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
concurrent_requests: int = 5,
|
concurrent_requests: int = 5,
|
||||||
@ -133,7 +128,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
:return: True if the tree object should be allowed, False otherwise
|
:return: True if the tree object should be allowed, False otherwise
|
||||||
"""
|
"""
|
||||||
filter_directories, filter_type = self._filter_directories
|
filter_directories, filter_type = self._filter_directories
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
f"Checking {tree_obj_path} whether to {filter_type} it"
|
f"Checking {tree_obj_path} whether to {filter_type} it"
|
||||||
+ f" based on the filter directories: {filter_directories}",
|
+ f" based on the filter directories: {filter_directories}",
|
||||||
@ -166,16 +161,16 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
:return: True if the tree object should be allowed, False otherwise
|
:return: True if the tree object should be allowed, False otherwise
|
||||||
"""
|
"""
|
||||||
filter_file_extensions, filter_type = self._filter_file_extensions
|
filter_file_extensions, filter_type = self._filter_file_extensions
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
f"Checking {tree_obj_path} whether to {filter_type} it"
|
f"Checking {tree_obj_path} whether to {filter_type} it"
|
||||||
+ f" based on the filter file extensions: {filter_file_extensions}",
|
+ f" based on the filter file extensions: {filter_file_extensions}",
|
||||||
)
|
)
|
||||||
|
|
||||||
if filter_type == self.FilterType.EXCLUDE:
|
if filter_type == self.FilterType.EXCLUDE:
|
||||||
return utils.get_file_extension(tree_obj_path) not in filter_file_extensions
|
return get_file_extension(tree_obj_path) not in filter_file_extensions
|
||||||
elif filter_type == self.FilterType.INCLUDE:
|
elif filter_type == self.FilterType.INCLUDE:
|
||||||
return utils.get_file_extension(tree_obj_path) in filter_file_extensions
|
return get_file_extension(tree_obj_path) in filter_file_extensions
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Unknown filter type: {filter_type}. "
|
f"Unknown filter type: {filter_type}. "
|
||||||
@ -209,14 +204,14 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
|
|
||||||
:return: list of documents
|
:return: list of documents
|
||||||
"""
|
"""
|
||||||
commit_response: github_client.GitCommitResponseModel = self._loop.run_until_complete(
|
commit_response: GitCommitResponseModel = self._loop.run_until_complete(
|
||||||
self._github_client.get_commit(self._owner, self._repo, commit_sha)
|
self._github_client.get_commit(self._owner, self._repo, commit_sha)
|
||||||
)
|
)
|
||||||
|
|
||||||
tree_sha = commit_response.commit.tree.sha
|
tree_sha = commit_response.commit.tree.sha
|
||||||
blobs_and_paths = self._loop.run_until_complete(self._recurse_tree(tree_sha))
|
blobs_and_paths = self._loop.run_until_complete(self._recurse_tree(tree_sha))
|
||||||
|
|
||||||
utils.print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs")
|
print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs")
|
||||||
|
|
||||||
return self._loop.run_until_complete(
|
return self._loop.run_until_complete(
|
||||||
self._generate_documents(blobs_and_paths=blobs_and_paths)
|
self._generate_documents(blobs_and_paths=blobs_and_paths)
|
||||||
@ -232,14 +227,14 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
|
|
||||||
:return: list of documents
|
:return: list of documents
|
||||||
"""
|
"""
|
||||||
branch_data: github_client.GitBranchResponseModel = self._loop.run_until_complete(
|
branch_data: GitBranchResponseModel = self._loop.run_until_complete(
|
||||||
self._github_client.get_branch(self._owner, self._repo, branch)
|
self._github_client.get_branch(self._owner, self._repo, branch)
|
||||||
)
|
)
|
||||||
|
|
||||||
tree_sha = branch_data.commit.commit.tree.sha
|
tree_sha = branch_data.commit.commit.tree.sha
|
||||||
blobs_and_paths = self._loop.run_until_complete(self._recurse_tree(tree_sha))
|
blobs_and_paths = self._loop.run_until_complete(self._recurse_tree(tree_sha))
|
||||||
|
|
||||||
utils.print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs")
|
print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs")
|
||||||
|
|
||||||
return self._loop.run_until_complete(
|
return self._loop.run_until_complete(
|
||||||
self._generate_documents(blobs_and_paths=blobs_and_paths)
|
self._generate_documents(blobs_and_paths=blobs_and_paths)
|
||||||
@ -290,15 +285,15 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
:return: list of tuples of
|
:return: list of tuples of
|
||||||
(tree object, file's full path realtive to the root of the repo)
|
(tree object, file's full path realtive to the root of the repo)
|
||||||
"""
|
"""
|
||||||
blobs_and_full_paths: List[Tuple[github_client.GitTreeResponseModel.GitTreeObject, str]] = []
|
blobs_and_full_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]] = []
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose, "\t" * current_depth + f"current path: {current_path}"
|
self._verbose, "\t" * current_depth + f"current path: {current_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
tree_data: github_client.GitTreeResponseModel = await self._github_client.get_tree(
|
tree_data: GitTreeResponseModel = await self._github_client.get_tree(
|
||||||
self._owner, self._repo, tree_sha
|
self._owner, self._repo, tree_sha
|
||||||
)
|
)
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose, "\t" * current_depth + f"processing tree {tree_sha}"
|
self._verbose, "\t" * current_depth + f"processing tree {tree_sha}"
|
||||||
)
|
)
|
||||||
for tree_obj in tree_data.tree:
|
for tree_obj in tree_data.tree:
|
||||||
@ -306,14 +301,14 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
|
|
||||||
# filter tree object here
|
# filter tree object here
|
||||||
if not self._allow_tree_obj(file_path):
|
if not self._allow_tree_obj(file_path):
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
"\t" * current_depth + f"ignoring {tree_obj.path} due to filter",
|
"\t" * current_depth + f"ignoring {tree_obj.path} due to filter",
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if tree_obj.type == "tree":
|
if tree_obj.type == "tree":
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
"\t" * current_depth + f"recursing into {tree_obj.path}",
|
"\t" * current_depth + f"recursing into {tree_obj.path}",
|
||||||
)
|
)
|
||||||
@ -322,14 +317,14 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
await self._recurse_tree(tree_obj.sha, file_path, current_depth + 1)
|
await self._recurse_tree(tree_obj.sha, file_path, current_depth + 1)
|
||||||
)
|
)
|
||||||
elif tree_obj.type == "blob":
|
elif tree_obj.type == "blob":
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose, "\t" * current_depth + f"found blob {tree_obj.path}"
|
self._verbose, "\t" * current_depth + f"found blob {tree_obj.path}"
|
||||||
)
|
)
|
||||||
blobs_and_full_paths.append((tree_obj, file_path))
|
blobs_and_full_paths.append((tree_obj, file_path))
|
||||||
return blobs_and_full_paths
|
return blobs_and_full_paths
|
||||||
|
|
||||||
async def _generate_documents(
|
async def _generate_documents(
|
||||||
self, blobs_and_paths: List[Tuple[github_client.GitTreeResponseModel.GitTreeObject, str]]
|
self, blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]]
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""
|
"""
|
||||||
Generate documents from a list of blobs and their full paths.
|
Generate documents from a list of blobs and their full paths.
|
||||||
@ -338,7 +333,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
(tree object, file's full path in the repo realtive to the root of the repo)
|
(tree object, file's full path in the repo realtive to the root of the repo)
|
||||||
:return: list of documents
|
:return: list of documents
|
||||||
"""
|
"""
|
||||||
buffered_iterator = utils.BufferedGitBlobDataIterator(
|
buffered_iterator = BufferedGitBlobDataIterator(
|
||||||
blobs_and_paths=blobs_and_paths,
|
blobs_and_paths=blobs_and_paths,
|
||||||
github_client=self._github_client,
|
github_client=self._github_client,
|
||||||
owner=self._owner,
|
owner=self._owner,
|
||||||
@ -350,7 +345,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
|
|
||||||
documents = []
|
documents = []
|
||||||
async for blob_data, full_path in buffered_iterator:
|
async for blob_data, full_path in buffered_iterator:
|
||||||
utils.print_if_verbose(self._verbose, f"generating document for {full_path}")
|
print_if_verbose(self._verbose, f"generating document for {full_path}")
|
||||||
assert (
|
assert (
|
||||||
blob_data.encoding == "base64"
|
blob_data.encoding == "base64"
|
||||||
), f"blob encoding {blob_data.encoding} not supported"
|
), f"blob encoding {blob_data.encoding} not supported"
|
||||||
@ -359,7 +354,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
decoded_bytes = base64.b64decode(blob_data.content)
|
decoded_bytes = base64.b64decode(blob_data.content)
|
||||||
del blob_data.content
|
del blob_data.content
|
||||||
except binascii.Error:
|
except binascii.Error:
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose, f"could not decode {full_path} as base64"
|
self._verbose, f"could not decode {full_path} as base64"
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
@ -374,7 +369,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
if document is not None:
|
if document is not None:
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
continue
|
continue
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
f"could not parse {full_path} as a supported file type"
|
f"could not parse {full_path} as a supported file type"
|
||||||
+ " - falling back to decoding as utf-8 raw text",
|
+ " - falling back to decoding as utf-8 raw text",
|
||||||
@ -385,11 +380,11 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
raise ValueError("decoded_bytes is None")
|
raise ValueError("decoded_bytes is None")
|
||||||
decoded_text = decoded_bytes.decode("utf-8")
|
decoded_text = decoded_bytes.decode("utf-8")
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose, f"could not decode {full_path} as utf-8"
|
self._verbose, f"could not decode {full_path} as utf-8"
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
f"got {len(decoded_text)} characters"
|
f"got {len(decoded_text)} characters"
|
||||||
+ f"- adding to documents - {full_path}",
|
+ f"- adding to documents - {full_path}",
|
||||||
@ -415,10 +410,10 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
:param `file_content`: content of the file
|
:param `file_content`: content of the file
|
||||||
:return: Document if the file is supported by a parser, None otherwise
|
:return: Document if the file is supported by a parser, None otherwise
|
||||||
"""
|
"""
|
||||||
file_extension = utils.get_file_extension(file_path)
|
file_extension = get_file_extension(file_path)
|
||||||
if (parser := DEFAULT_FILE_EXTRACTOR.get(file_extension)) is not None:
|
if (parser := DEFAULT_FILE_EXTRACTOR.get(file_extension)) is not None:
|
||||||
parser.init_parser()
|
parser.init_parser()
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
f"parsing {file_path}"
|
f"parsing {file_path}"
|
||||||
+ f"as {file_extension} with "
|
+ f"as {file_extension} with "
|
||||||
@ -431,7 +426,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
mode="w+b",
|
mode="w+b",
|
||||||
delete=False,
|
delete=False,
|
||||||
) as tmpfile:
|
) as tmpfile:
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose,
|
self._verbose,
|
||||||
"created a temporary file"
|
"created a temporary file"
|
||||||
+ f"{tmpfile.name} for parsing {file_path}",
|
+ f"{tmpfile.name} for parsing {file_path}",
|
||||||
@ -443,7 +438,7 @@ class GithubRepositoryReader(BaseReader):
|
|||||||
parsed_file = parser.parse_file(pathlib.Path(tmpfile.name))
|
parsed_file = parser.parse_file(pathlib.Path(tmpfile.name))
|
||||||
parsed_file = "\n\n".join(parsed_file)
|
parsed_file = "\n\n".join(parsed_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.print_if_verbose(
|
print_if_verbose(
|
||||||
self._verbose, f"error while parsing {file_path}"
|
self._verbose, f"error while parsing {file_path}"
|
||||||
)
|
)
|
||||||
logger.error(
|
logger.error(
|
||||||
@ -482,7 +477,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
github_client = github_client.GithubClient(github_token=os.environ["GITHUB_TOKEN"], verbose=True)
|
github_client = GithubClient(github_token=os.environ["GITHUB_TOKEN"], verbose=True)
|
||||||
|
|
||||||
reader1 = GithubRepositoryReader(
|
reader1 = GithubRepositoryReader(
|
||||||
github_client=github_client,
|
github_client=github_client,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user