From 8889cc77c0ef0cefd77aa2c431af38e1a750deff Mon Sep 17 00:00:00 2001
From: EmptyCrown
Date: Fri, 3 Feb 2023 00:05:28 -0800
Subject: [PATCH] Added all other files

---
 loader_hub/add_loader.sh                   |   5 +
 loader_hub/database/README.md              |   0
 loader_hub/database/__init__.py            |   1 +
 loader_hub/database/base.py                |  95 +++++++++++
 loader_hub/discord/README.md               |   0
 loader_hub/discord/__init__.py             |   1 +
 loader_hub/discord/base.py                 | 147 +++++++++++++++
 loader_hub/faiss/README.md                 |   0
 loader_hub/faiss/__init__.py               |   1 +
 loader_hub/faiss/base.py                   |  75 ++++++++++
 loader_hub/file/__init__.py                |   2 +-
 loader_hub/library.json                    |  43 +++++-
 loader_hub/mongo/README.md                 |   0
 loader_hub/mongo/__init__.py               |   1 +
 loader_hub/mongo/base.py                   |  59 ++++++++
 loader_hub/notion/README.md                |   0
 loader_hub/notion/__init__.py              |   1 +
 loader_hub/notion/base.py                  | 166 ++++++++++++++++
 loader_hub/obsidian/README.md              |   0
 loader_hub/obsidian/__init__.py            |   1 +
 loader_hub/obsidian/base.py                |  47 ++++++
 loader_hub/pinecone/README.md              |   0
 loader_hub/pinecone/__init__.py            |   1 +
 loader_hub/pinecone/base.py                |  81 +++++++++++
 loader_hub/qdrant/README.md                |   0
 loader_hub/qdrant/__init__.py              |   1 +
 loader_hub/qdrant/base.py                  | 105 ++++++++++++
 loader_hub/slack/README.md                 |   0
 loader_hub/slack/__init__.py               |   1 +
 loader_hub/slack/base.py                   | 143 +++++++++++++++
 loader_hub/string_iterable/README.md       |   0
 loader_hub/string_iterable/__init__.py     |   1 +
 loader_hub/string_iterable/base.py         |  32 +++++
 loader_hub/twitter/README.md               |   0
 loader_hub/twitter/__init__.py             |   1 +
 loader_hub/twitter/base.py                 |  60 ++++++
 loader_hub/weaviate/README.md              |   0
 loader_hub/weaviate/__init__.py            |   1 +
 loader_hub/weaviate/base.py                | 116 +++++++++++
 loader_hub/wikipedia/README.md             |   0
 loader_hub/wikipedia/__init__.py           |   1 +
 loader_hub/wikipedia/base.py               |  37 +++++
 loader_hub/youtube_transcript/README.md    |   0
 loader_hub/youtube_transcript/__init__.py  |   1 +
 loader_hub/youtube_transcript/base.py      |  38 +++++
 45 files changed, 1262 insertions(+), 3 deletions(-)
 create mode 100755 loader_hub/add_loader.sh
 create mode 100644 loader_hub/database/README.md
 create mode 100644 loader_hub/database/__init__.py
 create mode 100644 loader_hub/database/base.py
 create mode 100644 loader_hub/discord/README.md
 create mode 100644 loader_hub/discord/__init__.py
 create mode 100644 loader_hub/discord/base.py
 create mode 100644 loader_hub/faiss/README.md
 create mode 100644 loader_hub/faiss/__init__.py
 create mode 100644 loader_hub/faiss/base.py
 create mode 100644 loader_hub/mongo/README.md
 create mode 100644 loader_hub/mongo/__init__.py
 create mode 100644 loader_hub/mongo/base.py
 create mode 100644 loader_hub/notion/README.md
 create mode 100644 loader_hub/notion/__init__.py
 create mode 100644 loader_hub/notion/base.py
 create mode 100644 loader_hub/obsidian/README.md
 create mode 100644 loader_hub/obsidian/__init__.py
 create mode 100644 loader_hub/obsidian/base.py
 create mode 100644 loader_hub/pinecone/README.md
 create mode 100644 loader_hub/pinecone/__init__.py
 create mode 100644 loader_hub/pinecone/base.py
 create mode 100644 loader_hub/qdrant/README.md
 create mode 100644 loader_hub/qdrant/__init__.py
 create mode 100644 loader_hub/qdrant/base.py
 create mode 100644 loader_hub/slack/README.md
 create mode 100644 loader_hub/slack/__init__.py
 create mode 100644 loader_hub/slack/base.py
 create mode 100644 loader_hub/string_iterable/README.md
 create mode 100644 loader_hub/string_iterable/__init__.py
 create mode 100644 loader_hub/string_iterable/base.py
 create mode 100644 loader_hub/twitter/README.md
 create mode 100644 loader_hub/twitter/__init__.py
 create mode 100644 loader_hub/twitter/base.py
 create mode 100644 loader_hub/weaviate/README.md
 create mode 100644 loader_hub/weaviate/__init__.py
 create mode 100644 loader_hub/weaviate/base.py
 create mode 100644 loader_hub/wikipedia/README.md
 create mode 100644 loader_hub/wikipedia/__init__.py
 create mode 100644 loader_hub/wikipedia/base.py
 create mode 100644 loader_hub/youtube_transcript/README.md
 create mode 100644 loader_hub/youtube_transcript/__init__.py
 create mode 100644 loader_hub/youtube_transcript/base.py
diff --git a/loader_hub/add_loader.sh b/loader_hub/add_loader.sh
new file mode 100755
index 00000000..ddecec9e
--- /dev/null
+++ b/loader_hub/add_loader.sh
@@ -0,0 +1,5 @@
+mkdir $1;
+touch $1/base.py;
+touch $1/README.md;
+touch $1/__init__.py;
+echo "\"\"\"Init file.\"\"\"" > $1/__init__.py;
diff --git a/loader_hub/database/README.md b/loader_hub/database/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/database/__init__.py b/loader_hub/database/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/database/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/database/base.py b/loader_hub/database/base.py
new file mode 100644
index 00000000..4098c3e4
--- /dev/null
+++ b/loader_hub/database/base.py
@@ -0,0 +1,95 @@
+"""Database Reader."""
+
+from typing import Any, List, Optional
+
+from sqlalchemy import text
+from sqlalchemy.engine import Engine
+
+from gpt_index.langchain_helpers.sql_wrapper import SQLDatabase
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class DatabaseReader(BaseReader):
+    """Simple Database reader.
+
+    Concatenates each row into a Document used by GPT Index.
+
+    Args:
+        sql_database (Optional[SQLDatabase]): SQL database to use,
+            including table names to specify.
+            See :ref:`Ref-Struct-Store` for more details.
+
+        OR
+
+        engine (Optional[Engine]): SQLAlchemy Engine object of the database connection.
+
+        OR
+
+        uri (Optional[str]): uri of the database connection.
+
+        OR
+
+        scheme (Optional[str]): scheme of the database connection.
+        host (Optional[str]): host of the database connection.
+        port (Optional[str]): port of the database connection.
+        user (Optional[str]): user of the database connection.
+        password (Optional[str]): password of the database connection.
+        dbname (Optional[str]): dbname of the database connection.
+
+    Returns:
+        DatabaseReader: A DatabaseReader object.
+    """
+
+    def __init__(
+        self,
+        sql_database: Optional[SQLDatabase] = None,
+        engine: Optional[Engine] = None,
+        uri: Optional[str] = None,
+        scheme: Optional[str] = None,
+        host: Optional[str] = None,
+        port: Optional[str] = None,
+        user: Optional[str] = None,
+        password: Optional[str] = None,
+        dbname: Optional[str] = None,
+        *args: Optional[Any],
+        **kwargs: Optional[Any],
+    ) -> None:
+        """Initialize with parameters."""
+        if sql_database:
+            self.sql_database = sql_database
+        elif engine:
+            self.sql_database = SQLDatabase(engine, *args, **kwargs)
+        elif uri:
+            self.uri = uri
+            self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
+        elif scheme and host and port and user and password and dbname:
+            uri = f"{scheme}://{user}:{password}@{host}:{port}/{dbname}"
+            self.uri = uri
+            self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
+        else:
+            raise ValueError(
+                "You must provide either a SQLDatabase, "
+                "a SQLAlchemy Engine, a valid connection URI, or a valid "
+                "set of credentials."
+            )
+
+    def load_data(self, query: str) -> List[Document]:
+        """Query and load data from the Database, returning a list of Documents.
+
+        Args:
+            query (str): Query parameter to filter tables and rows.
+
+        Returns:
+            List[Document]: A list of Document objects.
+        """
+        if query is None:
+            raise ValueError("A query parameter is necessary to filter the data")
+        documents = []
+        with self.sql_database.engine.connect() as connection:
+            result = connection.execute(text(query))
+
+            for item in result.fetchall():
+                # only the first column of each row becomes the document text
+                documents.append(Document(item[0]))
+        return documents
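A minimal usage sketch for DatabaseReader; the SQLite URI, table name, and query below are illustrative assumptions, and the import path assumes the repo root is on PYTHONPATH:

    from loader_hub.database.base import DatabaseReader

    # hypothetical SQLAlchemy connection URI and query
    reader = DatabaseReader(uri="sqlite:///example.db")
    documents = reader.load_data(query="SELECT body FROM posts LIMIT 10")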
diff --git a/loader_hub/discord/README.md b/loader_hub/discord/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/discord/__init__.py b/loader_hub/discord/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/discord/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/discord/base.py b/loader_hub/discord/base.py
new file mode 100644
index 00000000..a701db8f
--- /dev/null
+++ b/loader_hub/discord/base.py
@@ -0,0 +1,147 @@
+"""Discord reader."""
+
+import asyncio
+import logging
+import os
+from typing import List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+
+async def read_channel(
+    discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool
+) -> str:
+    """Async read channel.
+
+    Note: This is our hack to create a synchronous interface to the
+    async discord.py API. We use the `asyncio` module to run
+    this function with `asyncio.get_event_loop().run_until_complete`.
+
+    """
+    import discord  # noqa: F401
+
+    messages: List[discord.Message] = []
+
+    class CustomClient(discord.Client):
+        async def on_ready(self) -> None:
+            try:
+                logger.info(f"{self.user} has connected to Discord!")
+                channel = client.get_channel(channel_id)
+                # only work for text channels for now
+                if not isinstance(channel, discord.TextChannel):
+                    raise ValueError(
+                        f"Channel {channel_id} is not a text channel. "
+                        "Only text channels are supported for now."
+                    )
+                # thread_dict maps thread_id to thread
+                thread_dict = {}
+                for thread in channel.threads:
+                    thread_dict[thread.id] = thread
+
+                async for msg in channel.history(
+                    limit=limit, oldest_first=oldest_first
+                ):
+                    messages.append(msg)
+                    if msg.id in thread_dict:
+                        thread = thread_dict[msg.id]
+                        async for thread_msg in thread.history(
+                            limit=limit, oldest_first=oldest_first
+                        ):
+                            messages.append(thread_msg)
+            except Exception as e:
+                logger.error("Encountered error: " + str(e))
+            finally:
+                await self.close()
+
+    intents = discord.Intents.default()
+    intents.message_content = True
+    client = CustomClient(intents=intents)
+    await client.start(discord_token)
+
+    msg_txt_list = [m.content for m in messages]
+
+    return "\n\n".join(msg_txt_list)
+
+
+class DiscordReader(BaseReader):
+    """Discord reader.
+
+    Reads conversations from channels.
+
+    Args:
+        discord_token (Optional[str]): Discord token. If not provided, we
+            assume the environment variable `DISCORD_TOKEN` is set.
+
+    """
+
+    def __init__(self, discord_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        try:
+            import discord  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`discord.py` package not found, please run `pip install discord.py`"
+            )
+        if discord_token is None:
+            discord_token = os.environ.get("DISCORD_TOKEN")
+        if discord_token is None:
+            raise ValueError(
+                "Must specify `discord_token` or set environment "
+                "variable `DISCORD_TOKEN`."
+            )
+
+        self.discord_token = discord_token
+
+    def _read_channel(
+        self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True
+    ) -> str:
+        """Read channel."""
+        result = asyncio.get_event_loop().run_until_complete(
+            read_channel(
+                self.discord_token, channel_id, limit=limit, oldest_first=oldest_first
+            )
+        )
+        return result
+
+    def load_data(
+        self,
+        channel_ids: List[int],
+        limit: Optional[int] = None,
+        oldest_first: bool = True,
+    ) -> List[Document]:
+        """Load data from the specified Discord channels.
+
+        Args:
+            channel_ids (List[int]): List of channel ids to read.
+            limit (Optional[int]): Maximum number of messages to read.
+            oldest_first (bool): Whether to read oldest messages first.
+                Defaults to `True`.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        results: List[Document] = []
+        for channel_id in channel_ids:
+            if not isinstance(channel_id, int):
+                raise ValueError(
+                    f"Channel id {channel_id} must be an integer, "
+                    f"not {type(channel_id)}."
+                )
+            channel_content = self._read_channel(
+                channel_id, limit=limit, oldest_first=oldest_first
+            )
+            results.append(
+                Document(channel_content, extra_info={"channel": channel_id})
+            )
+        return results
+
+
+if __name__ == "__main__":
+    reader = DiscordReader()
+    print("initialized reader")
+    output = reader.load_data(channel_ids=[1057178784895348746], limit=10)
+    print(output)
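A brief usage sketch for DiscordReader; the channel id is a placeholder and `DISCORD_TOKEN` must be exported in the environment:

    from loader_hub.discord.base import DiscordReader

    reader = DiscordReader()  # reads DISCORD_TOKEN from the environment
    documents = reader.load_data(channel_ids=[1234567890], limit=100)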
+ + """ + + def __init__(self, discord_token: Optional[str] = None) -> None: + """Initialize with parameters.""" + try: + import discord # noqa: F401 + except ImportError: + raise ValueError( + "`discord.py` package not found, please run `pip install discord.py`" + ) + if discord_token is None: + discord_token = os.environ["DISCORD_TOKEN"] + if discord_token is None: + raise ValueError( + "Must specify `discord_token` or set environment " + "variable `DISCORD_TOKEN`." + ) + + self.discord_token = discord_token + + def _read_channel( + self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True + ) -> str: + """Read channel.""" + result = asyncio.get_event_loop().run_until_complete( + read_channel( + self.discord_token, channel_id, limit=limit, oldest_first=oldest_first + ) + ) + return result + + def load_data( + self, + channel_ids: List[int], + limit: Optional[int] = None, + oldest_first: bool = True, + ) -> List[Document]: + """Load data from the input directory. + + Args: + channel_ids (List[int]): List of channel ids to read. + limit (Optional[int]): Maximum number of messages to read. + oldest_first (bool): Whether to read oldest messages first. + Defaults to `True`. + + Returns: + List[Document]: List of documents. + + """ + results: List[Document] = [] + for channel_id in channel_ids: + if not isinstance(channel_id, int): + raise ValueError( + f"Channel id {channel_id} must be an integer, " + f"not {type(channel_id)}." + ) + channel_content = self._read_channel( + channel_id, limit=limit, oldest_first=oldest_first + ) + results.append( + Document(channel_content, extra_info={"channel": channel_id}) + ) + return results + + +if __name__ == "__main__": + reader = DiscordReader() + print("initialized reader") + output = reader.load_data(channel_ids=[1057178784895348746], limit=10) + print(output) diff --git a/loader_hub/faiss/README.md b/loader_hub/faiss/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/faiss/__init__.py b/loader_hub/faiss/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/faiss/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/faiss/base.py b/loader_hub/faiss/base.py new file mode 100644 index 00000000..82f076eb --- /dev/null +++ b/loader_hub/faiss/base.py @@ -0,0 +1,75 @@ +"""Faiss reader.""" + +from typing import Any, Dict, List + +import numpy as np + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + + +class FaissReader(BaseReader): + """Faiss reader. + + Retrieves documents through an existing in-memory Faiss index. + These documents can then be used in a downstream GPT Index data structure. + If you wish use Faiss itself as an index to to organize documents, + insert documents, and perform queries on them, please use GPTFaissIndex. + + Args: + faiss_index (faiss.Index): A Faiss Index object (required) + + """ + + def __init__(self, index: Any): + """Initialize with parameters.""" + import_err_msg = """ + `faiss` package not found. For instructions on + how to install `faiss` please visit + https://github.com/facebookresearch/faiss/wiki/Installing-Faiss + """ + try: + import faiss # noqa: F401 + except ImportError: + raise ValueError(import_err_msg) + + self._index = index + + def load_data( + self, + query: np.ndarray, + id_to_text_map: Dict[str, str], + k: int = 4, + separate_documents: bool = True, + ) -> List[Document]: + """Load data from Faiss. 
diff --git a/loader_hub/file/__init__.py b/loader_hub/file/__init__.py
index c6373350..1d464056 100644
--- a/loader_hub/file/__init__.py
+++ b/loader_hub/file/__init__.py
@@ -1 +1 @@
-"""Init params."""
+"""Init file."""
diff --git a/loader_hub/library.json b/loader_hub/library.json
index 64291136..e33f2afb 100644
--- a/loader_hub/library.json
+++ b/loader_hub/library.json
@@ -12,7 +12,46 @@
         "id": "web/beautiful_soup_web",
         "author": "thejessezhang"
     },
-    "TrafilaturaWebReader": {
-        "id": "web/trafilatura_web"
+    "DatabaseReader": {
+        "id": "database"
+    },
+    "DiscordReader": {
+        "id": "discord"
+    },
+    "FaissReader": {
+        "id": "faiss"
+    },
+    "SimpleMongoReader": {
+        "id": "mongo"
+    },
+    "NotionPageReader": {
+        "id": "notion"
+    },
+    "ObsidianReader": {
+        "id": "obsidian"
+    },
+    "PineconeReader": {
+        "id": "pinecone"
+    },
+    "QdrantReader": {
+        "id": "qdrant"
+    },
+    "SlackReader": {
+        "id": "slack"
+    },
+    "StringIterableReader": {
+        "id": "string_iterable"
+    },
+    "TwitterTweetReader": {
+        "id": "twitter"
+    },
+    "WeaviateReader": {
+        "id": "weaviate"
+    },
+    "WikipediaReader": {
+        "id": "wikipedia"
+    },
+    "YoutubeTranscriptReader": {
+        "id": "youtube_transcript"
     }
 }
\ No newline at end of file
diff --git a/loader_hub/mongo/README.md b/loader_hub/mongo/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/mongo/__init__.py b/loader_hub/mongo/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/mongo/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/mongo/base.py b/loader_hub/mongo/base.py
new file mode 100644
index 00000000..2bc13de0
--- /dev/null
+++ b/loader_hub/mongo/base.py
@@ -0,0 +1,59 @@
+"""Mongo client."""
+
+from typing import Dict, List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class SimpleMongoReader(BaseReader):
+    """Simple mongo reader.
+
+    Concatenates each Mongo doc into a Document used by GPT Index.
+
+    Args:
+        host (str): Mongo host.
+        port (int): Mongo port.
+        max_docs (int): Maximum number of documents to load.
+
+    """
+
+    def __init__(self, host: str, port: int, max_docs: int = 1000) -> None:
+        """Initialize with parameters."""
+        try:
+            import pymongo  # noqa: F401
+            from pymongo import MongoClient  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`pymongo` package not found, please run `pip install pymongo`"
+            )
+        self.client: MongoClient = MongoClient(host, port)
+        self.max_docs = max_docs
+
+    def load_data(
+        self, db_name: str, collection_name: str, query_dict: Optional[Dict] = None
+    ) -> List[Document]:
+        """Load data from a MongoDB collection.
+
+        Args:
+            db_name (str): name of the database.
+            collection_name (str): name of the collection.
+            query_dict (Optional[Dict]): query to filter documents.
+                Defaults to None.
+
+        Returns:
+            List[Document]: A list of documents.
+
+        """
+        documents = []
+        db = self.client[db_name]
+        if query_dict is None:
+            cursor = db[collection_name].find().limit(self.max_docs)
+        else:
+            cursor = db[collection_name].find(query_dict).limit(self.max_docs)
+
+        for item in cursor:
+            if "text" not in item:
+                raise ValueError("`text` field not found in Mongo document.")
+            documents.append(Document(item["text"]))
+        return documents
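A short usage sketch; host, port, and the database and collection names are placeholders, and each fetched document is expected to carry a `text` field:

    from loader_hub.mongo.base import SimpleMongoReader

    reader = SimpleMongoReader("localhost", 27017, max_docs=100)
    documents = reader.load_data(
        "my_db", "my_collection", query_dict={"author": "alice"}
    )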
+ + """ + + def __init__(self, host: str, port: int, max_docs: int = 1000) -> None: + """Initialize with parameters.""" + try: + import pymongo # noqa: F401 + from pymongo import MongoClient # noqa: F401 + except ImportError: + raise ValueError( + "`pymongo` package not found, please run `pip install pymongo`" + ) + self.client: MongoClient = MongoClient(host, port) + self.max_docs = max_docs + + def load_data( + self, db_name: str, collection_name: str, query_dict: Optional[Dict] = None + ) -> List[Document]: + """Load data from the input directory. + + Args: + db_name (str): name of the database. + collection_name (str): name of the collection. + query_dict (Optional[Dict]): query to filter documents. + Defaults to None + + Returns: + List[Document]: A list of documents. + + """ + documents = [] + db = self.client[db_name] + if query_dict is None: + cursor = db[collection_name].find() + else: + cursor = db[collection_name].find(query_dict) + + for item in cursor: + if "text" not in item: + raise ValueError("`text` field not found in Mongo document.") + documents.append(Document(item["text"])) + return documents diff --git a/loader_hub/notion/README.md b/loader_hub/notion/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/notion/__init__.py b/loader_hub/notion/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/notion/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/notion/base.py b/loader_hub/notion/base.py new file mode 100644 index 00000000..0db830ef --- /dev/null +++ b/loader_hub/notion/base.py @@ -0,0 +1,166 @@ +"""Notion reader.""" +import os +from typing import Any, Dict, List, Optional + +import requests # type: ignore + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + +INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN" +BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children" +DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query" +SEARCH_URL = "https://api.notion.com/v1/search" + + +# TODO: Notion DB reader coming soon! +class NotionPageReader(BaseReader): + """Notion Page reader. + + Reads a set of Notion pages. + + Args: + integration_token (str): Notion integration token. + + """ + + def __init__(self, integration_token: Optional[str] = None) -> None: + """Initialize with parameters.""" + if integration_token is None: + integration_token = os.getenv(INTEGRATION_TOKEN_NAME) + if integration_token is None: + raise ValueError( + "Must specify `integration_token` or set environment " + "variable `NOTION_INTEGRATION_TOKEN`." 
+            )
+        self.token = integration_token
+        self.headers = {
+            "Authorization": "Bearer " + self.token,
+            "Content-Type": "application/json",
+            "Notion-Version": "2022-06-28",
+        }
+
+    def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
+        """Read a block."""
+        done = False
+        result_lines_arr = []
+        cur_block_id = block_id
+        while not done:
+            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
+            query_dict: Dict[str, Any] = {}
+
+            res = requests.request(
+                "GET", block_url, headers=self.headers, json=query_dict
+            )
+            data = res.json()
+
+            for result in data["results"]:
+                result_type = result["type"]
+                result_obj = result[result_type]
+
+                cur_result_text_arr = []
+                if "rich_text" in result_obj:
+                    for rich_text in result_obj["rich_text"]:
+                        # skip if doesn't have text object
+                        if "text" in rich_text:
+                            text = rich_text["text"]["content"]
+                            prefix = "\t" * num_tabs
+                            cur_result_text_arr.append(prefix + text)
+
+                result_block_id = result["id"]
+                has_children = result["has_children"]
+                if has_children:
+                    children_text = self._read_block(
+                        result_block_id, num_tabs=num_tabs + 1
+                    )
+                    cur_result_text_arr.append(children_text)
+
+                cur_result_text = "\n".join(cur_result_text_arr)
+                result_lines_arr.append(cur_result_text)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                cur_block_id = data["next_cursor"]
+
+        result_lines = "\n".join(result_lines_arr)
+        return result_lines
+
+    def read_page(self, page_id: str) -> str:
+        """Read a page."""
+        return self._read_block(page_id)
+
+    def query_database(
+        self, database_id: str, query_dict: Dict[str, Any] = {}
+    ) -> List[str]:
+        """Get all the pages from a Notion database."""
+        res = requests.post(
+            DATABASE_URL_TMPL.format(database_id=database_id),
+            headers=self.headers,
+            json=query_dict,
+        )
+        data = res.json()
+        page_ids = []
+        for result in data["results"]:
+            page_id = result["id"]
+            page_ids.append(page_id)
+
+        return page_ids
+
+    def search(self, query: str) -> List[str]:
+        """Search Notion page given a text query."""
+        done = False
+        next_cursor: Optional[str] = None
+        page_ids = []
+        while not done:
+            query_dict = {
+                "query": query,
+            }
+            if next_cursor is not None:
+                query_dict["start_cursor"] = next_cursor
+            res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict)
+            data = res.json()
+            for result in data["results"]:
+                page_id = result["id"]
+                page_ids.append(page_id)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                next_cursor = data["next_cursor"]
+        return page_ids
+
+    def load_data(
+        self, page_ids: List[str] = [], database_id: Optional[str] = None
+    ) -> List[Document]:
+        """Load data from Notion pages and/or a Notion database.
+
+        Args:
+            page_ids (List[str]): List of page ids to load.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        if not page_ids and not database_id:
+            raise ValueError("Must specify either `page_ids` or `database_id`.")
+        docs = []
+        if database_id is not None:
+            # get all the pages in the database
+            page_ids = self.query_database(database_id)
+            for page_id in page_ids:
+                page_text = self.read_page(page_id)
+                docs.append(Document(page_text, extra_info={"page_id": page_id}))
+        else:
+            for page_id in page_ids:
+                page_text = self.read_page(page_id)
+                docs.append(Document(page_text, extra_info={"page_id": page_id}))
+
+        return docs
+
+
+if __name__ == "__main__":
+    reader = NotionPageReader()
+    print(reader.search("What I"))
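A brief sketch for NotionPageReader; the page id is a placeholder and the integration token is read from `NOTION_INTEGRATION_TOKEN`:

    from loader_hub.notion.base import NotionPageReader

    reader = NotionPageReader()  # uses NOTION_INTEGRATION_TOKEN
    documents = reader.load_data(page_ids=["<page-id>"])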
+ + """ + if not page_ids and not database_id: + raise ValueError("Must specify either `page_ids` or `database_id`.") + docs = [] + if database_id is not None: + # get all the pages in the database + page_ids = self.query_database(database_id) + for page_id in page_ids: + page_text = self.read_page(page_id) + docs.append(Document(page_text, extra_info={"page_id": page_id})) + else: + for page_id in page_ids: + page_text = self.read_page(page_id) + docs.append(Document(page_text, extra_info={"page_id": page_id})) + + return docs + + +if __name__ == "__main__": + reader = NotionPageReader() + print(reader.search("What I")) diff --git a/loader_hub/obsidian/README.md b/loader_hub/obsidian/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/obsidian/__init__.py b/loader_hub/obsidian/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/obsidian/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/obsidian/base.py b/loader_hub/obsidian/base.py new file mode 100644 index 00000000..bee7c562 --- /dev/null +++ b/loader_hub/obsidian/base.py @@ -0,0 +1,47 @@ +"""Obsidian reader class. + +Pass in the path to an Obsidian vault and it will parse all markdown +files into a List of Documents, +with each Document containing text from under an Obsidian header. + +""" +import os +from pathlib import Path +from typing import Any, List + +from langchain.docstore.document import Document as LCDocument + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.file.markdown_parser import MarkdownParser +from gpt_index.readers.schema.base import Document + + +class ObsidianReader(BaseReader): + """Utilities for loading data from an Obsidian Vault. + + Args: + input_dir (str): Path to the vault. + + """ + + def __init__(self, input_dir: str, verbose: bool = False): + """Init params.""" + self.verbose = verbose + self.input_dir = Path(input_dir) + + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: + """Load data from the input directory.""" + docs: List[str] = [] + for (dirpath, dirnames, filenames) in os.walk(self.input_dir): + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + for filename in filenames: + if filename.endswith(".md"): + filepath = os.path.join(dirpath, filename) + content = MarkdownParser().parse_file(Path(filepath)) + docs.extend(content) + return [Document(d) for d in docs] + + def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: + """Load data in LangChain document format.""" + docs = self.load_data(**load_kwargs) + return [d.to_langchain_format() for d in docs] diff --git a/loader_hub/pinecone/README.md b/loader_hub/pinecone/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/pinecone/__init__.py b/loader_hub/pinecone/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/pinecone/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/pinecone/base.py b/loader_hub/pinecone/base.py new file mode 100644 index 00000000..c121f325 --- /dev/null +++ b/loader_hub/pinecone/base.py @@ -0,0 +1,81 @@ +"""Pinecone reader.""" + +from typing import Any, Dict, List, Optional + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + + +class PineconeReader(BaseReader): + """Pinecone reader. + + Args: + api_key (str): Pinecone API key. + environment (str): Pinecone environment. 
+ """ + + def __init__(self, api_key: str, environment: str): + """Initialize with parameters.""" + try: + import pinecone # noqa: F401 + except ImportError: + raise ValueError( + "`pinecone` package not found, please run `pip install pinecone-client`" + ) + + self._api_key = api_key + self._environment = environment + pinecone.init(api_key=api_key, environment=environment) + + def load_data( + self, + index_name: str, + id_to_text_map: Dict[str, str], + vector: Optional[List[float]], + top_k: int, + separate_documents: bool = True, + include_values: bool = True, + **query_kwargs: Any + ) -> List[Document]: + """Load data from Pinecone. + + Args: + index_name (str): Name of the index. + id_to_text_map (Dict[str, str]): A map from ID's to text. + separate_documents (Optional[bool]): Whether to return separate + documents per retrieved entry. Defaults to True. + vector (List[float]): Query vector. + top_k (int): Number of results to return. + include_values (bool): Whether to include the embedding in the response. + Defaults to True. + **query_kwargs: Keyword arguments to pass to the query. + Arguments are the exact same as those found in + Pinecone's reference documentation for the + query method. + + Returns: + List[Document]: A list of documents. + """ + import pinecone + + index = pinecone.Index(index_name) + if "include_values" not in query_kwargs: + query_kwargs["include_values"] = True + response = index.query(top_k=top_k, vector=vector, **query_kwargs) + + documents = [] + for match in response.matches: + if match.id not in id_to_text_map: + raise ValueError("ID not found in id_to_text_map.") + text = id_to_text_map[match.id] + embedding = match.values + if len(embedding) == 0: + embedding = None + documents.append(Document(text=text, embedding=embedding)) + + if not separate_documents: + text_list = [doc.get_text() for doc in documents] + text = "\n\n".join(text_list) + documents = [Document(text=text)] + + return documents diff --git a/loader_hub/qdrant/README.md b/loader_hub/qdrant/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/qdrant/__init__.py b/loader_hub/qdrant/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/qdrant/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/qdrant/base.py b/loader_hub/qdrant/base.py new file mode 100644 index 00000000..55fb057b --- /dev/null +++ b/loader_hub/qdrant/base.py @@ -0,0 +1,105 @@ +"""Qdrant reader.""" + +from typing import List, Optional, cast + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + + +class QdrantReader(BaseReader): + """Qdrant reader. + + Retrieve documents from existing Qdrant collections. + + Args: + host: Host name of Qdrant service. + port: Port of the REST API interface. Default: 6333 + grpc_port: Port of the gRPC interface. Default: 6334 + prefer_grpc: If `true` - use gPRC interface whenever possible in custom methods. + https: If `true` - use HTTPS(SSL) protocol. Default: `false` + api_key: API key for authentication in Qdrant Cloud. Default: `None` + prefix: + If not `None` - add `prefix` to the REST URL path. + Example: `service/v1` will result in + `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API. + Default: `None` + timeout: + Timeout for REST and gRPC API requests. 
diff --git a/loader_hub/qdrant/README.md b/loader_hub/qdrant/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/qdrant/__init__.py b/loader_hub/qdrant/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/qdrant/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/qdrant/base.py b/loader_hub/qdrant/base.py
new file mode 100644
index 00000000..55fb057b
--- /dev/null
+++ b/loader_hub/qdrant/base.py
@@ -0,0 +1,105 @@
+"""Qdrant reader."""
+
+from typing import List, Optional, cast
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class QdrantReader(BaseReader):
+    """Qdrant reader.
+
+    Retrieve documents from existing Qdrant collections.
+
+    Args:
+        host: Host name of Qdrant service.
+        port: Port of the REST API interface. Default: 6333
+        grpc_port: Port of the gRPC interface. Default: 6334
+        prefer_grpc: If `true` - use gRPC interface whenever possible in custom methods.
+        https: If `true` - use HTTPS(SSL) protocol. Default: `false`
+        api_key: API key for authentication in Qdrant Cloud. Default: `None`
+        prefix:
+            If not `None` - add `prefix` to the REST URL path.
+            Example: `service/v1` will result in
+            `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.
+            Default: `None`
+        timeout:
+            Timeout for REST and gRPC API requests.
+            Default: 5.0 seconds for REST and unlimited for gRPC
+    """
+
+    def __init__(
+        self,
+        host: str,
+        port: int = 6333,
+        grpc_port: int = 6334,
+        prefer_grpc: bool = False,
+        https: Optional[bool] = None,
+        api_key: Optional[str] = None,
+        prefix: Optional[str] = None,
+        timeout: Optional[float] = None,
+        verbose: bool = False,
+    ):
+        """Initialize with parameters."""
+        super().__init__(verbose)
+
+        import_err_msg = (
+            "`qdrant-client` package not found, please run `pip install qdrant-client`"
+        )
+        try:
+            import qdrant_client  # noqa: F401
+        except ImportError:
+            raise ValueError(import_err_msg)
+
+        self._client = qdrant_client.QdrantClient(
+            host=host,
+            port=port,
+            grpc_port=grpc_port,
+            prefer_grpc=prefer_grpc,
+            https=https,
+            api_key=api_key,
+            prefix=prefix,
+            timeout=timeout,
+        )
+
+    def load_data(
+        self,
+        collection_name: str,
+        query_vector: List[float],
+        limit: int = 10,
+    ) -> List[Document]:
+        """Load data from Qdrant.
+
+        Args:
+            collection_name (str): Name of the Qdrant collection.
+            query_vector (List[float]): Query vector.
+            limit (int): Number of results to return.
+
+        Returns:
+            List[Document]: A list of documents.
+        """
+        from qdrant_client.http.models.models import Payload
+
+        response = self._client.search(
+            collection_name=collection_name,
+            query_vector=query_vector,
+            with_vectors=True,
+            with_payload=True,
+            limit=limit,
+        )
+
+        documents = []
+        for point in response:
+            payload = cast(Payload, point.payload)
+            try:
+                vector = cast(List[float], point.vector)
+            except ValueError as e:
+                raise ValueError("Could not cast vector to List[float].") from e
+            document = Document(
+                doc_id=payload.get("doc_id"),
+                text=payload.get("text"),
+                embedding=vector,
+            )
+            documents.append(document)
+
+        return documents
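A sketch against a local Qdrant instance; the collection name and query vector are placeholders:

    from loader_hub.qdrant.base import QdrantReader

    reader = QdrantReader(host="localhost")
    documents = reader.load_data(
        collection_name="demo_collection",
        query_vector=[0.1, 0.2, 0.3],  # must match the collection's vector size
        limit=5,
    )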
diff --git a/loader_hub/slack/README.md b/loader_hub/slack/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/slack/__init__.py b/loader_hub/slack/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/slack/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/slack/base.py b/loader_hub/slack/base.py
new file mode 100644
index 00000000..d63d4455
--- /dev/null
+++ b/loader_hub/slack/base.py
@@ -0,0 +1,143 @@
+"""Slack reader."""
+import logging
+import os
+import time
+from typing import List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+
+class SlackReader(BaseReader):
+    """Slack reader.
+
+    Reads conversations from channels.
+
+    Args:
+        slack_token (Optional[str]): Slack token. If not provided, we
+            assume the environment variable `SLACK_BOT_TOKEN` is set.
+
+    """
+
+    def __init__(self, slack_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        try:
+            from slack_sdk import WebClient
+        except ImportError:
+            raise ValueError(
+                "`slack_sdk` package not found, please run `pip install slack_sdk`"
+            )
+        if slack_token is None:
+            slack_token = os.environ.get("SLACK_BOT_TOKEN")
+        if slack_token is None:
+            raise ValueError(
+                "Must specify `slack_token` or set environment "
+                "variable `SLACK_BOT_TOKEN`."
+            )
+        self.client = WebClient(token=slack_token)
+        res = self.client.api_test()
+        if not res["ok"]:
+            raise ValueError(f"Error initializing Slack API: {res['error']}")
+
+    def _read_message(self, channel_id: str, message_ts: str) -> str:
+        """Read a message."""
+
+        from slack_sdk.errors import SlackApiError
+
+        messages_text = []
+        next_cursor = None
+        while True:
+            try:
+                # https://slack.com/api/conversations.replies
+                # List all replies to a message, including the message itself.
+                result = self.client.conversations_replies(
+                    channel=channel_id, ts=message_ts, cursor=next_cursor
+                )
+                messages = result["messages"]
+                for message in messages:
+                    messages_text.append(message["text"])
+
+                if not result["has_more"]:
+                    break
+
+                next_cursor = result["response_metadata"]["next_cursor"]
+            except SlackApiError as e:
+                if e.response["error"] == "ratelimited":
+                    logger.error(
+                        "Rate limit error reached, sleeping for: {} seconds".format(
+                            e.response.headers["retry-after"]
+                        )
+                    )
+                    time.sleep(int(e.response.headers["retry-after"]))
+                else:
+                    logger.error("Error parsing conversation replies: {}".format(e))
+
+        return "\n\n".join(messages_text)
+
+    def _read_channel(self, channel_id: str) -> str:
+        """Read a channel."""
+
+        from slack_sdk.errors import SlackApiError
+
+        result_messages = []
+        next_cursor = None
+        while True:
+            try:
+                # Call the conversations.history method using the WebClient
+                # conversations.history returns the first 100 messages by default
+                # These results are paginated,
+                # see: https://api.slack.com/methods/conversations.history$pagination
+                result = self.client.conversations_history(
+                    channel=channel_id, cursor=next_cursor
+                )
+                conversation_history = result["messages"]
+                # Log results
+                logger.info(
+                    "{} messages found in {}".format(
+                        len(conversation_history), channel_id
+                    )
+                )
+                for message in conversation_history:
+                    result_messages.append(
+                        self._read_message(channel_id, message["ts"])
+                    )
+
+                if not result["has_more"]:
+                    break
+                next_cursor = result["response_metadata"]["next_cursor"]
+
+            except SlackApiError as e:
+                if e.response["error"] == "ratelimited":
+                    logger.error(
+                        "Rate limit error reached, sleeping for: {} seconds".format(
+                            e.response.headers["retry-after"]
+                        )
+                    )
+                    time.sleep(int(e.response.headers["retry-after"]))
+                else:
+                    logger.error("Error parsing conversation replies: {}".format(e))
+
+        return "\n\n".join(result_messages)
+
+    def load_data(self, channel_ids: List[str]) -> List[Document]:
+        """Load data from the specified Slack channels.
+
+        Args:
+            channel_ids (List[str]): List of channel ids to read.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        results = []
+        for channel_id in channel_ids:
+            channel_content = self._read_channel(channel_id)
+            results.append(
+                Document(channel_content, extra_info={"channel": channel_id})
+            )
+        return results
+
+
+if __name__ == "__main__":
+    reader = SlackReader()
+    print(reader.load_data(channel_ids=["C04DC2VUY3F"]))
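A usage sketch for SlackReader; the channel id is a placeholder and `SLACK_BOT_TOKEN` must be exported:

    from loader_hub.slack.base import SlackReader

    reader = SlackReader()  # reads SLACK_BOT_TOKEN from the environment
    documents = reader.load_data(channel_ids=["C0123456789"])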
+ + """ + results = [] + for channel_id in channel_ids: + channel_content = self._read_channel(channel_id) + results.append( + Document(channel_content, extra_info={"channel": channel_id}) + ) + return results + + +if __name__ == "__main__": + reader = SlackReader() + print(reader.load_data(channel_ids=["C04DC2VUY3F"])) diff --git a/loader_hub/string_iterable/README.md b/loader_hub/string_iterable/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/string_iterable/__init__.py b/loader_hub/string_iterable/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/string_iterable/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/string_iterable/base.py b/loader_hub/string_iterable/base.py new file mode 100644 index 00000000..62876b8e --- /dev/null +++ b/loader_hub/string_iterable/base.py @@ -0,0 +1,32 @@ +"""Simple reader that turns an iterable of strings into a list of Documents.""" +from typing import List + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + + +class StringIterableReader(BaseReader): + """String Iterable Reader. + + Gets a list of documents, given an iterable (e.g. list) of strings. + + Example: + .. code-block:: python + + from gpt_index import StringIterableReader, GPTTreeIndex + + documents = StringIterableReader().load_data( + texts=["I went to the store", "I bought an apple"]) + index = GPTTreeIndex(documents) + index.query("what did I buy?") + + # response should be something like "You bought an apple." + """ + + def load_data(self, texts: List[str]) -> List[Document]: + """Load the data.""" + results = [] + for text in texts: + results.append(Document(text)) + + return results diff --git a/loader_hub/twitter/README.md b/loader_hub/twitter/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/twitter/__init__.py b/loader_hub/twitter/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/twitter/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/twitter/base.py b/loader_hub/twitter/base.py new file mode 100644 index 00000000..a946acfd --- /dev/null +++ b/loader_hub/twitter/base.py @@ -0,0 +1,60 @@ +"""Simple reader that reads tweets of a twitter handle.""" +from typing import Any, List, Optional + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + + +class TwitterTweetReader(BaseReader): + """Twitter tweets reader. + + Read tweets of user twitter handle. + + Check 'https://developer.twitter.com/en/docs/twitter-api/\ + getting-started/getting-access-to-the-twitter-api' \ + on how to get access to twitter API. + + Args: + bearer_token (str): bearer_token that you get from twitter API. + num_tweets (Optional[int]): Number of tweets for each user twitter handle.\ + Default is 100 tweets. + """ + + def __init__( + self, + bearer_token: str, + num_tweets: Optional[int] = 100, + verbose: bool = False, + ) -> None: + """Initialize with parameters.""" + super().__init__(verbose=verbose) + self.bearer_token = bearer_token + self.num_tweets = num_tweets + + def load_data( + self, twitterhandles: List[str], **load_kwargs: Any + ) -> List[Document]: + """Load tweets of twitter handles. + + Args: + twitterhandles (List[str]): List of user twitter handles to read tweets. 
+ + """ + try: + import tweepy + except ImportError: + raise ValueError( + "`tweepy` package not found, please run `pip install tweepy`" + ) + + client = tweepy.Client(bearer_token=self.bearer_token) + results = [] + for username in twitterhandles: + # tweets = api.user_timeline(screen_name=user, count=self.num_tweets) + user = client.get_user(username=username) + tweets = client.get_users_tweets(user.data.id, max_results=self.num_tweets) + response = " " + for tweet in tweets.data: + response = response + tweet.text + "\n" + results.append(Document(response)) + return results diff --git a/loader_hub/weaviate/README.md b/loader_hub/weaviate/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/weaviate/__init__.py b/loader_hub/weaviate/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/weaviate/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/weaviate/base.py b/loader_hub/weaviate/base.py new file mode 100644 index 00000000..9fb7548d --- /dev/null +++ b/loader_hub/weaviate/base.py @@ -0,0 +1,116 @@ +"""Weaviate reader.""" + +from typing import Any, List, Optional + +from gpt_index.readers.base import BaseReader +from gpt_index.readers.schema.base import Document + + +class WeaviateReader(BaseReader): + """Weaviate reader. + + Retrieves documents from Weaviate through vector lookup. Allows option + to concatenate retrieved documents into one Document, or to return + separate Document objects per document. + + Args: + host (str): host. + auth_client_secret (Optional[weaviate.auth.AuthCredentials]): + auth_client_secret. + """ + + def __init__( + self, + host: str, + auth_client_secret: Optional[Any] = None, + ) -> None: + """Initialize with parameters.""" + try: + import weaviate # noqa: F401 + from weaviate import Client # noqa: F401 + from weaviate.auth import AuthCredentials # noqa: F401 + except ImportError: + raise ValueError( + "`weaviate` package not found, please run `pip install weaviate-client`" + ) + + self.client: Client = Client(host, auth_client_secret=auth_client_secret) + + def load_data( + self, + class_name: Optional[str] = None, + properties: Optional[List[str]] = None, + graphql_query: Optional[str] = None, + separate_documents: Optional[bool] = True, + ) -> List[Document]: + """Load data from Weaviate. + + If `graphql_query` is not found in load_kwargs, we assume that + `class_name` and `properties` are provided. + + Args: + class_name (Optional[str]): class_name to retrieve documents from. + properties (Optional[List[str]]): properties to retrieve from documents. + graphql_query (Optional[str]): Raw GraphQL Query. + We assume that the query is a Get query. + separate_documents (Optional[bool]): Whether to return separate + documents. Defaults to True. + + Returns: + List[Document]: A list of documents. + + """ + if class_name is not None and properties is not None: + props_txt = "\n".join(properties) + graphql_query = f""" + {{ + Get {{ + {class_name} {{ + {props_txt} + }} + }} + }} + """ + elif graphql_query is not None: + pass + else: + raise ValueError( + "Either `class_name` and `properties` must be specified, " + "or `graphql_query` must be specified." 
diff --git a/loader_hub/wikipedia/README.md b/loader_hub/wikipedia/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/wikipedia/__init__.py b/loader_hub/wikipedia/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/wikipedia/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/wikipedia/base.py b/loader_hub/wikipedia/base.py
new file mode 100644
index 00000000..55400c67
--- /dev/null
+++ b/loader_hub/wikipedia/base.py
@@ -0,0 +1,37 @@
+"""Simple reader that reads wikipedia."""
+from typing import Any, List
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class WikipediaReader(BaseReader):
+    """Wikipedia reader.
+
+    Reads one or more pages.
+
+    """
+
+    def __init__(self) -> None:
+        """Initialize with parameters."""
+        try:
+            import wikipedia  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`wikipedia` package not found, please run `pip install wikipedia`"
+            )
+
+    def load_data(self, pages: List[str], **load_kwargs: Any) -> List[Document]:
+        """Load data from the given Wikipedia pages.
+
+        Args:
+            pages (List[str]): List of pages to read.
+
+        """
+        import wikipedia
+
+        results = []
+        for page in pages:
+            page_content = wikipedia.page(page, **load_kwargs).content
+            results.append(Document(page_content))
+        return results
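A minimal sketch; the page titles are illustrative:

    from loader_hub.wikipedia.base import WikipediaReader

    reader = WikipediaReader()
    documents = reader.load_data(pages=["Berlin", "Rome"])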
diff --git a/loader_hub/youtube_transcript/README.md b/loader_hub/youtube_transcript/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/youtube_transcript/__init__.py b/loader_hub/youtube_transcript/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/youtube_transcript/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/youtube_transcript/base.py b/loader_hub/youtube_transcript/base.py
new file mode 100644
index 00000000..969ca38b
--- /dev/null
+++ b/loader_hub/youtube_transcript/base.py
@@ -0,0 +1,38 @@
+"""Simple Reader that reads transcript of youtube video."""
+from typing import Any, List
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class YoutubeTranscriptReader(BaseReader):
+    """Youtube Transcript reader."""
+
+    def __init__(self) -> None:
+        """Initialize with parameters."""
+
+    def load_data(self, ytlinks: List[str], **load_kwargs: Any) -> List[Document]:
+        """Load transcripts for the given YouTube links.
+
+        Args:
+            ytlinks (List[str]): List of youtube links \
+                for which transcripts are to be read.
+
+        """
+        try:
+            from youtube_transcript_api import YouTubeTranscriptApi
+        except ImportError:
+            raise ValueError(
+                "`youtube_transcript_api` package not found, \
+                please run `pip install youtube-transcript-api`"
+            )
+
+        results = []
+        for link in ytlinks:
+            video_id = link.split("?v=")[-1]  # naive id extraction; assumes a `?v=` URL
+            srt = YouTubeTranscriptApi.get_transcript(video_id)
+            transcript = ""
+            for chunk in srt:
+                transcript = transcript + chunk["text"] + "\n"
+            results.append(Document(transcript))
+        return results
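A final usage sketch; the video id in the link is a placeholder:

    from loader_hub.youtube_transcript.base import YoutubeTranscriptReader

    reader = YoutubeTranscriptReader()
    documents = reader.load_data(
        ytlinks=["https://www.youtube.com/watch?v=<video-id>"]
    )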