Added all other files

This commit is contained in:
EmptyCrown 2023-02-03 00:05:28 -08:00
parent 5cfa2a5098
commit 8889cc77c0
45 changed files with 1262 additions and 3 deletions

5
loader_hub/add_loader.sh Executable file
View File

@ -0,0 +1,5 @@
mkdir $1;
touch $1/base.py;
touch $1/README.md;
touch $1/__init__.py;
echo "\"\"\"Init file.\"\"\"" > $1/__init__.py;

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,95 @@
"""Database Reader."""
from typing import Any, List, Optional
from sqlalchemy import text
from sqlalchemy.engine import Engine
from gpt_index.langchain_helpers.sql_wrapper import SQLDatabase
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class DatabaseReader(BaseReader):
"""Simple Database reader.
Concatenates each row into Document used by GPT Index.
Args:
sql_database (Optional[SQLDatabase]): SQL database to use,
including table names to specify.
See :ref:`Ref-Struct-Store` for more details.
OR
engine (Optional[Engine]): SQLAlchemy Engine object of the database connection.
OR
uri (Optional[str]): uri of the database connection.
OR
scheme (Optional[str]): scheme of the database connection.
host (Optional[str]): host of the database connection.
port (Optional[int]): port of the database connection.
user (Optional[str]): user of the database connection.
password (Optional[str]): password of the database connection.
dbname (Optional[str]): dbname of the database connection.
Returns:
DatabaseReader: A DatabaseReader object.
"""
def __init__(
self,
sql_database: Optional[SQLDatabase] = None,
engine: Optional[Engine] = None,
uri: Optional[str] = None,
scheme: Optional[str] = None,
host: Optional[str] = None,
port: Optional[str] = None,
user: Optional[str] = None,
password: Optional[str] = None,
dbname: Optional[str] = None,
*args: Optional[Any],
**kwargs: Optional[Any],
) -> None:
"""Initialize with parameters."""
if sql_database:
self.sql_database = sql_database
elif engine:
self.sql_database = SQLDatabase(engine, *args, **kwargs)
elif uri:
self.uri = uri
self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
elif scheme and host and port and user and password and dbname:
uri = f"{scheme}://{user}:{password}@{host}:{port}/{dbname}"
self.uri = uri
self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
else:
raise ValueError(
"You must provide either a SQLDatabase, "
"a SQL Alchemy Engine, a valid connection URI, or a valid "
"set of credentials."
)
def load_data(self, query: str) -> List[Document]:
"""Query and load data from the Database, returning a list of Documents.
Args:
query (str): Query parameter to filter tables and rows.
Returns:
List[Document]: A list of Document objects.
"""
documents = []
with self.sql_database.engine.connect() as connection:
if query is None:
raise ValueError("A query parameter is necessary to filter the data")
else:
result = connection.execute(text(query))
for item in result.fetchall():
documents.append(Document(item[0]))
return documents

View File

View File

@ -0,0 +1 @@
"""Init file."""

147
loader_hub/discord/base.py Normal file
View File

@ -0,0 +1,147 @@
"""Discord reader."""
import asyncio
import logging
import os
from typing import List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
logger = logging.getLogger(__name__)
async def read_channel(
discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool
) -> str:
"""Async read channel.
Note: This is our hack to create a synchronous interface to the
async discord.py API. We use the `asyncio` module to run
this function with `asyncio.get_event_loop().run_until_complete`.
"""
import discord # noqa: F401
messages: List[discord.Message] = []
class CustomClient(discord.Client):
async def on_ready(self) -> None:
try:
print(f"{self.user} has connected to Discord!")
channel = client.get_channel(channel_id)
# only work for text channels for now
if not isinstance(channel, discord.TextChannel):
raise ValueError(
f"Channel {channel_id} is not a text channel. "
"Only text channels are supported for now."
)
# thread_dict maps thread_id to thread
thread_dict = {}
for thread in channel.threads:
thread_dict[thread.id] = thread
async for msg in channel.history(
limit=limit, oldest_first=oldest_first
):
messages.append(msg)
if msg.id in thread_dict:
thread = thread_dict[msg.id]
async for thread_msg in thread.history(
limit=limit, oldest_first=oldest_first
):
messages.append(thread_msg)
except Exception as e:
print("Encountered error: " + str(e))
finally:
await self.close()
intents = discord.Intents.default()
intents.message_content = True
client = CustomClient(intents=intents)
await client.start(discord_token)
msg_txt_list = [m.content for m in messages]
return "\n\n".join(msg_txt_list)
class DiscordReader(BaseReader):
"""Discord reader.
Reads conversations from channels.
Args:
discord_token (Optional[str]): Discord token. If not provided, we
assume the environment variable `DISCORD_TOKEN` is set.
"""
def __init__(self, discord_token: Optional[str] = None) -> None:
"""Initialize with parameters."""
try:
import discord # noqa: F401
except ImportError:
raise ValueError(
"`discord.py` package not found, please run `pip install discord.py`"
)
if discord_token is None:
discord_token = os.environ["DISCORD_TOKEN"]
if discord_token is None:
raise ValueError(
"Must specify `discord_token` or set environment "
"variable `DISCORD_TOKEN`."
)
self.discord_token = discord_token
def _read_channel(
self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True
) -> str:
"""Read channel."""
result = asyncio.get_event_loop().run_until_complete(
read_channel(
self.discord_token, channel_id, limit=limit, oldest_first=oldest_first
)
)
return result
def load_data(
self,
channel_ids: List[int],
limit: Optional[int] = None,
oldest_first: bool = True,
) -> List[Document]:
"""Load data from the input directory.
Args:
channel_ids (List[int]): List of channel ids to read.
limit (Optional[int]): Maximum number of messages to read.
oldest_first (bool): Whether to read oldest messages first.
Defaults to `True`.
Returns:
List[Document]: List of documents.
"""
results: List[Document] = []
for channel_id in channel_ids:
if not isinstance(channel_id, int):
raise ValueError(
f"Channel id {channel_id} must be an integer, "
f"not {type(channel_id)}."
)
channel_content = self._read_channel(
channel_id, limit=limit, oldest_first=oldest_first
)
results.append(
Document(channel_content, extra_info={"channel": channel_id})
)
return results
if __name__ == "__main__":
reader = DiscordReader()
print("initialized reader")
output = reader.load_data(channel_ids=[1057178784895348746], limit=10)
print(output)

View File

View File

@ -0,0 +1 @@
"""Init file."""

75
loader_hub/faiss/base.py Normal file
View File

@ -0,0 +1,75 @@
"""Faiss reader."""
from typing import Any, Dict, List
import numpy as np
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class FaissReader(BaseReader):
"""Faiss reader.
Retrieves documents through an existing in-memory Faiss index.
These documents can then be used in a downstream GPT Index data structure.
If you wish use Faiss itself as an index to to organize documents,
insert documents, and perform queries on them, please use GPTFaissIndex.
Args:
faiss_index (faiss.Index): A Faiss Index object (required)
"""
def __init__(self, index: Any):
"""Initialize with parameters."""
import_err_msg = """
`faiss` package not found. For instructions on
how to install `faiss` please visit
https://github.com/facebookresearch/faiss/wiki/Installing-Faiss
"""
try:
import faiss # noqa: F401
except ImportError:
raise ValueError(import_err_msg)
self._index = index
def load_data(
self,
query: np.ndarray,
id_to_text_map: Dict[str, str],
k: int = 4,
separate_documents: bool = True,
) -> List[Document]:
"""Load data from Faiss.
Args:
query (np.ndarray): A 2D numpy array of query vectors.
id_to_text_map (Dict[str, str]): A map from ID's to text.
k (int): Number of nearest neighbors to retrieve. Defaults to 4.
separate_documents (Optional[bool]): Whether to return separate
documents. Defaults to True.
Returns:
List[Document]: A list of documents.
"""
dists, indices = self._index.search(query, k)
documents = []
for qidx in range(indices.shape[0]):
for didx in range(indices.shape[1]):
doc_id = indices[qidx, didx]
if doc_id not in id_to_text_map:
raise ValueError(
f"Document ID {doc_id} not found in id_to_text_map."
)
text = id_to_text_map[doc_id]
documents.append(Document(text=text))
if not separate_documents:
# join all documents into one
text_list = [doc.get_text() for doc in documents]
text = "\n\n".join(text_list)
documents = [Document(text=text)]
return documents

View File

@ -1 +1 @@
"""Init params."""
"""Init file."""

View File

@ -12,7 +12,46 @@
"id": "web/beautiful_soup_web",
"author": "thejessezhang"
},
"TrafilaturaWebReader": {
"id": "web/trafilatura_web"
"DatabaseReader": {
"id": "database"
},
"DiscordReader": {
"id": "discord"
},
"FaissReader": {
"id": "faiss"
},
"SimpleMongoReader": {
"id": "mongo"
},
"NotionPageReader": {
"id": "notion"
},
"ObsidianReader": {
"id": "obsidian"
},
"PineconeReader": {
"id": "pinecone"
},
"QdrantReader": {
"id": "qdrant"
},
"SlackReader": {
"id": "slack"
},
"StringIterableReader": {
"id": "string_iterable"
},
"TwitterTweetReader": {
"id": "twitter"
},
"WeaviateReader": {
"id": "weaviate"
},
"WikipediaReader": {
"id": "wikipedia"
},
"YoutubeTranscriptReader": {
"id": "youtube_transcript"
}
}

View File

View File

@ -0,0 +1 @@
"""Init file."""

59
loader_hub/mongo/base.py Normal file
View File

@ -0,0 +1,59 @@
"""Mongo client."""
from typing import Dict, List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class SimpleMongoReader(BaseReader):
"""Simple mongo reader.
Concatenates each Mongo doc into Document used by GPT Index.
Args:
host (str): Mongo host.
port (int): Mongo port.
max_docs (int): Maximum number of documents to load.
"""
def __init__(self, host: str, port: int, max_docs: int = 1000) -> None:
"""Initialize with parameters."""
try:
import pymongo # noqa: F401
from pymongo import MongoClient # noqa: F401
except ImportError:
raise ValueError(
"`pymongo` package not found, please run `pip install pymongo`"
)
self.client: MongoClient = MongoClient(host, port)
self.max_docs = max_docs
def load_data(
self, db_name: str, collection_name: str, query_dict: Optional[Dict] = None
) -> List[Document]:
"""Load data from the input directory.
Args:
db_name (str): name of the database.
collection_name (str): name of the collection.
query_dict (Optional[Dict]): query to filter documents.
Defaults to None
Returns:
List[Document]: A list of documents.
"""
documents = []
db = self.client[db_name]
if query_dict is None:
cursor = db[collection_name].find()
else:
cursor = db[collection_name].find(query_dict)
for item in cursor:
if "text" not in item:
raise ValueError("`text` field not found in Mongo document.")
documents.append(Document(item["text"]))
return documents

View File

View File

@ -0,0 +1 @@
"""Init file."""

166
loader_hub/notion/base.py Normal file
View File

@ -0,0 +1,166 @@
"""Notion reader."""
import os
from typing import Any, Dict, List, Optional
import requests # type: ignore
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN"
BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
SEARCH_URL = "https://api.notion.com/v1/search"
# TODO: Notion DB reader coming soon!
class NotionPageReader(BaseReader):
"""Notion Page reader.
Reads a set of Notion pages.
Args:
integration_token (str): Notion integration token.
"""
def __init__(self, integration_token: Optional[str] = None) -> None:
"""Initialize with parameters."""
if integration_token is None:
integration_token = os.getenv(INTEGRATION_TOKEN_NAME)
if integration_token is None:
raise ValueError(
"Must specify `integration_token` or set environment "
"variable `NOTION_INTEGRATION_TOKEN`."
)
self.token = integration_token
self.headers = {
"Authorization": "Bearer " + self.token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
}
def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"""Read a block."""
done = False
result_lines_arr = []
cur_block_id = block_id
while not done:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: Dict[str, Any] = {}
res = requests.request(
"GET", block_url, headers=self.headers, json=query_dict
)
data = res.json()
for result in data["results"]:
result_type = result["type"]
result_obj = result[result_type]
cur_result_text_arr = []
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
if "text" in rich_text:
text = rich_text["text"]["content"]
prefix = "\t" * num_tabs
cur_result_text_arr.append(prefix + text)
result_block_id = result["id"]
has_children = result["has_children"]
if has_children:
children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1
)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
result_lines_arr.append(cur_result_text)
if data["next_cursor"] is None:
done = True
break
else:
cur_block_id = data["next_cursor"]
result_lines = "\n".join(result_lines_arr)
return result_lines
def read_page(self, page_id: str) -> str:
"""Read a page."""
return self._read_block(page_id)
def query_database(
self, database_id: str, query_dict: Dict[str, Any] = {}
) -> List[str]:
"""Get all the pages from a Notion database."""
res = requests.post(
DATABASE_URL_TMPL.format(database_id=database_id),
headers=self.headers,
json=query_dict,
)
data = res.json()
page_ids = []
for result in data["results"]:
page_id = result["id"]
page_ids.append(page_id)
return page_ids
def search(self, query: str) -> List[str]:
"""Search Notion page given a text query."""
done = False
next_cursor: Optional[str] = None
page_ids = []
while not done:
query_dict = {
"query": query,
}
if next_cursor is not None:
query_dict["start_cursor"] = next_cursor
res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict)
data = res.json()
for result in data["results"]:
page_id = result["id"]
page_ids.append(page_id)
if data["next_cursor"] is None:
done = True
break
else:
next_cursor = data["next_cursor"]
return page_ids
def load_data(
self, page_ids: List[str] = [], database_id: Optional[str] = None
) -> List[Document]:
"""Load data from the input directory.
Args:
page_ids (List[str]): List of page ids to load.
Returns:
List[Document]: List of documents.
"""
if not page_ids and not database_id:
raise ValueError("Must specify either `page_ids` or `database_id`.")
docs = []
if database_id is not None:
# get all the pages in the database
page_ids = self.query_database(database_id)
for page_id in page_ids:
page_text = self.read_page(page_id)
docs.append(Document(page_text, extra_info={"page_id": page_id}))
else:
for page_id in page_ids:
page_text = self.read_page(page_id)
docs.append(Document(page_text, extra_info={"page_id": page_id}))
return docs
if __name__ == "__main__":
reader = NotionPageReader()
print(reader.search("What I"))

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,47 @@
"""Obsidian reader class.
Pass in the path to an Obsidian vault and it will parse all markdown
files into a List of Documents,
with each Document containing text from under an Obsidian header.
"""
import os
from pathlib import Path
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from gpt_index.readers.base import BaseReader
from gpt_index.readers.file.markdown_parser import MarkdownParser
from gpt_index.readers.schema.base import Document
class ObsidianReader(BaseReader):
"""Utilities for loading data from an Obsidian Vault.
Args:
input_dir (str): Path to the vault.
"""
def __init__(self, input_dir: str, verbose: bool = False):
"""Init params."""
self.verbose = verbose
self.input_dir = Path(input_dir)
def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory."""
docs: List[str] = []
for (dirpath, dirnames, filenames) in os.walk(self.input_dir):
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
for filename in filenames:
if filename.endswith(".md"):
filepath = os.path.join(dirpath, filename)
content = MarkdownParser().parse_file(Path(filepath))
docs.extend(content)
return [Document(d) for d in docs]
def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
"""Load data in LangChain document format."""
docs = self.load_data(**load_kwargs)
return [d.to_langchain_format() for d in docs]

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,81 @@
"""Pinecone reader."""
from typing import Any, Dict, List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class PineconeReader(BaseReader):
"""Pinecone reader.
Args:
api_key (str): Pinecone API key.
environment (str): Pinecone environment.
"""
def __init__(self, api_key: str, environment: str):
"""Initialize with parameters."""
try:
import pinecone # noqa: F401
except ImportError:
raise ValueError(
"`pinecone` package not found, please run `pip install pinecone-client`"
)
self._api_key = api_key
self._environment = environment
pinecone.init(api_key=api_key, environment=environment)
def load_data(
self,
index_name: str,
id_to_text_map: Dict[str, str],
vector: Optional[List[float]],
top_k: int,
separate_documents: bool = True,
include_values: bool = True,
**query_kwargs: Any
) -> List[Document]:
"""Load data from Pinecone.
Args:
index_name (str): Name of the index.
id_to_text_map (Dict[str, str]): A map from ID's to text.
separate_documents (Optional[bool]): Whether to return separate
documents per retrieved entry. Defaults to True.
vector (List[float]): Query vector.
top_k (int): Number of results to return.
include_values (bool): Whether to include the embedding in the response.
Defaults to True.
**query_kwargs: Keyword arguments to pass to the query.
Arguments are the exact same as those found in
Pinecone's reference documentation for the
query method.
Returns:
List[Document]: A list of documents.
"""
import pinecone
index = pinecone.Index(index_name)
if "include_values" not in query_kwargs:
query_kwargs["include_values"] = True
response = index.query(top_k=top_k, vector=vector, **query_kwargs)
documents = []
for match in response.matches:
if match.id not in id_to_text_map:
raise ValueError("ID not found in id_to_text_map.")
text = id_to_text_map[match.id]
embedding = match.values
if len(embedding) == 0:
embedding = None
documents.append(Document(text=text, embedding=embedding))
if not separate_documents:
text_list = [doc.get_text() for doc in documents]
text = "\n\n".join(text_list)
documents = [Document(text=text)]
return documents

View File

View File

@ -0,0 +1 @@
"""Init file."""

105
loader_hub/qdrant/base.py Normal file
View File

@ -0,0 +1,105 @@
"""Qdrant reader."""
from typing import List, Optional, cast
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class QdrantReader(BaseReader):
"""Qdrant reader.
Retrieve documents from existing Qdrant collections.
Args:
host: Host name of Qdrant service.
port: Port of the REST API interface. Default: 6333
grpc_port: Port of the gRPC interface. Default: 6334
prefer_grpc: If `true` - use gPRC interface whenever possible in custom methods.
https: If `true` - use HTTPS(SSL) protocol. Default: `false`
api_key: API key for authentication in Qdrant Cloud. Default: `None`
prefix:
If not `None` - add `prefix` to the REST URL path.
Example: `service/v1` will result in
`http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.
Default: `None`
timeout:
Timeout for REST and gRPC API requests.
Default: 5.0 seconds for REST and unlimited for gRPC
"""
def __init__(
self,
host: str,
port: int = 6333,
grpc_port: int = 6334,
prefer_grpc: bool = False,
https: Optional[bool] = None,
api_key: Optional[str] = None,
prefix: Optional[str] = None,
timeout: Optional[float] = None,
verbose: bool = False,
):
"""Initialize with parameters."""
super().__init__(verbose)
import_err_msg = (
"`qdrant-client` package not found, please run `pip install qdrant-client`"
)
try:
import qdrant_client # noqa: F401
except ImportError:
raise ValueError(import_err_msg)
self._client = qdrant_client.QdrantClient(
host=host,
port=port,
grpc_port=grpc_port,
prefer_grpc=prefer_grpc,
https=https,
api_key=api_key,
prefix=prefix,
timeout=timeout,
)
def load_data(
self,
collection_name: str,
query_vector: List[float],
limit: int = 10,
) -> List[Document]:
"""Load data from Qdrant.
Args:
collection_name (str): Name of the Qdrant collection.
query_vector (List[float]): Query vector.
limit (int): Number of results to return.
Returns:
List[Document]: A list of documents.
"""
from qdrant_client.http.models.models import Payload
response = self._client.search(
collection_name=collection_name,
query_vector=query_vector,
with_vectors=True,
with_payload=True,
limit=limit,
)
documents = []
for point in response:
payload = cast(Payload, point)
try:
vector = cast(List[float], point.vector)
except ValueError as e:
raise ValueError("Could not cast vector to List[float].") from e
document = Document(
doc_id=payload.get("doc_id"),
text=payload.get("text"),
embedding=vector,
)
documents.append(document)
return documents

View File

View File

@ -0,0 +1 @@
"""Init file."""

143
loader_hub/slack/base.py Normal file
View File

@ -0,0 +1,143 @@
"""Slack reader."""
import logging
import os
import time
from typing import List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
logger = logging.getLogger(__name__)
class SlackReader(BaseReader):
"""Slack reader.
Reads conversations from channels.
Args:
slack_token (Optional[str]): Slack token. If not provided, we
assume the environment variable `SLACK_BOT_TOKEN` is set.
"""
def __init__(self, slack_token: Optional[str] = None) -> None:
"""Initialize with parameters."""
try:
from slack_sdk import WebClient
except ImportError:
raise ValueError(
"`slack_sdk` package not found, please run `pip install slack_sdk`"
)
if slack_token is None:
slack_token = os.environ["SLACK_BOT_TOKEN"]
if slack_token is None:
raise ValueError(
"Must specify `slack_token` or set environment "
"variable `SLACK_BOT_TOKEN`."
)
self.client = WebClient(token=slack_token)
res = self.client.api_test()
if not res["ok"]:
raise ValueError(f"Error initializing Slack API: {res['error']}")
def _read_message(self, channel_id: str, message_ts: str) -> str:
from slack_sdk.errors import SlackApiError
"""Read a message."""
messages_text = []
next_cursor = None
while True:
try:
# https://slack.com/api/conversations.replies
# List all replies to a message, including the message itself.
result = self.client.conversations_replies(
channel=channel_id, ts=message_ts, cursor=next_cursor
)
messages = result["messages"]
for message in messages:
messages_text.append(message["text"])
if not result["has_more"]:
break
next_cursor = result["response_metadata"]["next_cursor"]
except SlackApiError as e:
if e.response["error"] == "ratelimited":
logger.error(
"Rate limit error reached, sleeping for: {} seconds".format(
e.response.headers["retry-after"]
)
)
time.sleep(int(e.response.headers["retry-after"]))
else:
logger.error("Error parsing conversation replies: {}".format(e))
return "\n\n".join(messages_text)
def _read_channel(self, channel_id: str) -> str:
from slack_sdk.errors import SlackApiError
"""Read a channel."""
result_messages = []
next_cursor = None
while True:
try:
# Call the conversations.history method using the WebClient
# conversations.history returns the first 100 messages by default
# These results are paginated,
# see: https://api.slack.com/methods/conversations.history$pagination
result = self.client.conversations_history(
channel=channel_id, cursor=next_cursor
)
conversation_history = result["messages"]
# Print results
logger.info(
"{} messages found in {}".format(len(conversation_history), id)
)
for message in conversation_history:
result_messages.append(
self._read_message(channel_id, message["ts"])
)
if not result["has_more"]:
break
next_cursor = result["response_metadata"]["next_cursor"]
except SlackApiError as e:
if e.response["error"] == "ratelimited":
logger.error(
"Rate limit error reached, sleeping for: {} seconds".format(
e.response.headers["retry-after"]
)
)
time.sleep(int(e.response.headers["retry-after"]))
else:
logger.error("Error parsing conversation replies: {}".format(e))
return "\n\n".join(result_messages)
def load_data(self, channel_ids: List[str]) -> List[Document]:
"""Load data from the input directory.
Args:
channel_ids (List[str]): List of channel ids to read.
Returns:
List[Document]: List of documents.
"""
results = []
for channel_id in channel_ids:
channel_content = self._read_channel(channel_id)
results.append(
Document(channel_content, extra_info={"channel": channel_id})
)
return results
if __name__ == "__main__":
reader = SlackReader()
print(reader.load_data(channel_ids=["C04DC2VUY3F"]))

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,32 @@
"""Simple reader that turns an iterable of strings into a list of Documents."""
from typing import List
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class StringIterableReader(BaseReader):
"""String Iterable Reader.
Gets a list of documents, given an iterable (e.g. list) of strings.
Example:
.. code-block:: python
from gpt_index import StringIterableReader, GPTTreeIndex
documents = StringIterableReader().load_data(
texts=["I went to the store", "I bought an apple"])
index = GPTTreeIndex(documents)
index.query("what did I buy?")
# response should be something like "You bought an apple."
"""
def load_data(self, texts: List[str]) -> List[Document]:
"""Load the data."""
results = []
for text in texts:
results.append(Document(text))
return results

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,60 @@
"""Simple reader that reads tweets of a twitter handle."""
from typing import Any, List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class TwitterTweetReader(BaseReader):
"""Twitter tweets reader.
Read tweets of user twitter handle.
Check 'https://developer.twitter.com/en/docs/twitter-api/\
getting-started/getting-access-to-the-twitter-api' \
on how to get access to twitter API.
Args:
bearer_token (str): bearer_token that you get from twitter API.
num_tweets (Optional[int]): Number of tweets for each user twitter handle.\
Default is 100 tweets.
"""
def __init__(
self,
bearer_token: str,
num_tweets: Optional[int] = 100,
verbose: bool = False,
) -> None:
"""Initialize with parameters."""
super().__init__(verbose=verbose)
self.bearer_token = bearer_token
self.num_tweets = num_tweets
def load_data(
self, twitterhandles: List[str], **load_kwargs: Any
) -> List[Document]:
"""Load tweets of twitter handles.
Args:
twitterhandles (List[str]): List of user twitter handles to read tweets.
"""
try:
import tweepy
except ImportError:
raise ValueError(
"`tweepy` package not found, please run `pip install tweepy`"
)
client = tweepy.Client(bearer_token=self.bearer_token)
results = []
for username in twitterhandles:
# tweets = api.user_timeline(screen_name=user, count=self.num_tweets)
user = client.get_user(username=username)
tweets = client.get_users_tweets(user.data.id, max_results=self.num_tweets)
response = " "
for tweet in tweets.data:
response = response + tweet.text + "\n"
results.append(Document(response))
return results

View File

View File

@ -0,0 +1 @@
"""Init file."""

116
loader_hub/weaviate/base.py Normal file
View File

@ -0,0 +1,116 @@
"""Weaviate reader."""
from typing import Any, List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class WeaviateReader(BaseReader):
"""Weaviate reader.
Retrieves documents from Weaviate through vector lookup. Allows option
to concatenate retrieved documents into one Document, or to return
separate Document objects per document.
Args:
host (str): host.
auth_client_secret (Optional[weaviate.auth.AuthCredentials]):
auth_client_secret.
"""
def __init__(
self,
host: str,
auth_client_secret: Optional[Any] = None,
) -> None:
"""Initialize with parameters."""
try:
import weaviate # noqa: F401
from weaviate import Client # noqa: F401
from weaviate.auth import AuthCredentials # noqa: F401
except ImportError:
raise ValueError(
"`weaviate` package not found, please run `pip install weaviate-client`"
)
self.client: Client = Client(host, auth_client_secret=auth_client_secret)
def load_data(
self,
class_name: Optional[str] = None,
properties: Optional[List[str]] = None,
graphql_query: Optional[str] = None,
separate_documents: Optional[bool] = True,
) -> List[Document]:
"""Load data from Weaviate.
If `graphql_query` is not found in load_kwargs, we assume that
`class_name` and `properties` are provided.
Args:
class_name (Optional[str]): class_name to retrieve documents from.
properties (Optional[List[str]]): properties to retrieve from documents.
graphql_query (Optional[str]): Raw GraphQL Query.
We assume that the query is a Get query.
separate_documents (Optional[bool]): Whether to return separate
documents. Defaults to True.
Returns:
List[Document]: A list of documents.
"""
if class_name is not None and properties is not None:
props_txt = "\n".join(properties)
graphql_query = f"""
{{
Get {{
{class_name} {{
{props_txt}
}}
}}
}}
"""
elif graphql_query is not None:
pass
else:
raise ValueError(
"Either `class_name` and `properties` must be specified, "
"or `graphql_query` must be specified."
)
response = self.client.query.raw(graphql_query)
if "errors" in response:
raise ValueError("Invalid query, got errors: {}".format(response["errors"]))
data_response = response["data"]
if "Get" not in data_response:
raise ValueError("Invalid query response, must be a Get query.")
if class_name is None:
# infer class_name if only graphql_query was provided
class_name = list(data_response["Get"].keys())[0]
entries = data_response["Get"][class_name]
documents = []
for entry in entries:
embedding = None
# for each entry, join properties into <property>:<value>
# separated by newlines
text_list = []
for k, v in entry.items():
if k == "_additional":
if "vector" in v:
embedding = v["vector"]
continue
text_list.append(f"{k}: {v}")
text = "\n".join(text_list)
documents.append(Document(text=text, embedding=embedding))
if not separate_documents:
# join all documents into one
text_list = [doc.get_text() for doc in documents]
text = "\n\n".join(text_list)
documents = [Document(text=text)]
return documents

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,37 @@
"""Simple reader that reads wikipedia."""
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class WikipediaReader(BaseReader):
"""Wikipedia reader.
Reads a page.
"""
def __init__(self) -> None:
"""Initialize with parameters."""
try:
import wikipedia # noqa: F401
except ImportError:
raise ValueError(
"`wikipedia` package not found, please run `pip install wikipedia`"
)
def load_data(self, pages: List[str], **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory.
Args:
pages (List[str]): List of pages to read.
"""
import wikipedia
results = []
for page in pages:
page_content = wikipedia.page(page, **load_kwargs).content
results.append(Document(page_content))
return results

View File

View File

@ -0,0 +1 @@
"""Init file."""

View File

@ -0,0 +1,38 @@
"""Simple Reader that reads transcript of youtube video."""
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class YoutubeTranscriptReader(BaseReader):
"""Youtube Transcript reader."""
def __init__(self) -> None:
"""Initialize with parameters."""
def load_data(self, ytlinks: List[str], **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory.
Args:
pages (List[str]): List of youtube links \
for which transcripts are to be read.
"""
try:
from youtube_transcript_api import YouTubeTranscriptApi
except ImportError:
raise ValueError(
"`youtube_transcript_api` package not found, \
please run `pip install youtube-transcript-api`"
)
results = []
for link in ytlinks:
video_id = link.split("?v=")[-1]
srt = YouTubeTranscriptApi.get_transcript(video_id)
transcript = ""
for chunk in srt:
transcript = transcript + chunk["text"] + "\n"
results.append(Document(transcript))
return results