Added all other files

2025-11-30 08:59:55 +00:00 · 2023-02-03 00:05:28 -08:00 · 2023-02-03 00:05:28 -08:00 · 8889cc77c0
commit 8889cc77c0
parent 5cfa2a5098
45 changed files with 1262 additions and 3 deletions
--- a/loader_hub/add_loader.sh
+++ b/loader_hub/add_loader.sh
@ -0,0 +1,5 @@
+mkdir $1;
+touch $1/base.py;
+touch $1/README.md;
+touch $1/__init__.py;
+echo "\"\"\"Init file.\"\"\"" >  $1/__init__.py;
--- a/loader_hub/database/README.md
+++ b/loader_hub/database/README.md
--- a/loader_hub/database/init.py
+++ b/loader_hub/database/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/database/base.py
+++ b/loader_hub/database/base.py
@ -0,0 +1,95 @@
+"""Database Reader."""
+
+from typing import Any, List, Optional
+
+from sqlalchemy import text
+from sqlalchemy.engine import Engine
+
+from gpt_index.langchain_helpers.sql_wrapper import SQLDatabase
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class DatabaseReader(BaseReader):
+    """Simple Database reader.
+
+    Concatenates each row into Document used by GPT Index.
+
+    Args:
+        sql_database (Optional[SQLDatabase]): SQL database to use,
+            including table names to specify.
+            See :ref:`Ref-Struct-Store` for more details.
+
+        OR
+
+        engine (Optional[Engine]): SQLAlchemy Engine object of the database connection.
+
+        OR
+
+        uri (Optional[str]): uri of the database connection.
+
+        OR
+
+        scheme (Optional[str]): scheme of the database connection.
+        host (Optional[str]): host of the database connection.
+        port (Optional[int]): port of the database connection.
+        user (Optional[str]): user of the database connection.
+        password (Optional[str]): password of the database connection.
+        dbname (Optional[str]): dbname of the database connection.
+
+    Returns:
+        DatabaseReader: A DatabaseReader object.
+    """
+
+    def __init__(
+        self,
+        sql_database: Optional[SQLDatabase] = None,
+        engine: Optional[Engine] = None,
+        uri: Optional[str] = None,
+        scheme: Optional[str] = None,
+        host: Optional[str] = None,
+        port: Optional[str] = None,
+        user: Optional[str] = None,
+        password: Optional[str] = None,
+        dbname: Optional[str] = None,
+        *args: Optional[Any],
+        **kwargs: Optional[Any],
+    ) -> None:
+        """Initialize with parameters."""
+        if sql_database:
+            self.sql_database = sql_database
+        elif engine:
+            self.sql_database = SQLDatabase(engine, *args, **kwargs)
+        elif uri:
+            self.uri = uri
+            self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
+        elif scheme and host and port and user and password and dbname:
+            uri = f"{scheme}://{user}:{password}@{host}:{port}/{dbname}"
+            self.uri = uri
+            self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
+        else:
+            raise ValueError(
+                "You must provide either a SQLDatabase, "
+                "a SQL Alchemy Engine, a valid connection URI, or a valid "
+                "set of credentials."
+            )
+
+    def load_data(self, query: str) -> List[Document]:
+        """Query and load data from the Database, returning a list of Documents.
+
+        Args:
+            query (str): Query parameter to filter tables and rows.
+
+        Returns:
+            List[Document]: A list of Document objects.
+        """
+        documents = []
+        with self.sql_database.engine.connect() as connection:
+            if query is None:
+                raise ValueError("A query parameter is necessary to filter the data")
+            else:
+                result = connection.execute(text(query))
+
+            for item in result.fetchall():
+                documents.append(Document(item[0]))
+        return documents
--- a/loader_hub/discord/README.md
+++ b/loader_hub/discord/README.md
--- a/loader_hub/discord/init.py
+++ b/loader_hub/discord/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/discord/base.py
+++ b/loader_hub/discord/base.py
@ -0,0 +1,147 @@
+"""Discord reader."""
+
+import asyncio
+import logging
+import os
+from typing import List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+
+async def read_channel(
+    discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool
+) -> str:
+    """Async read channel.
+
+    Note: This is our hack to create a synchronous interface to the
+    async discord.py API. We use the `asyncio` module to run
+    this function with `asyncio.get_event_loop().run_until_complete`.
+
+    """
+    import discord  # noqa: F401
+
+    messages: List[discord.Message] = []
+
+    class CustomClient(discord.Client):
+        async def on_ready(self) -> None:
+            try:
+                print(f"{self.user} has connected to Discord!")
+                channel = client.get_channel(channel_id)
+                # only work for text channels for now
+                if not isinstance(channel, discord.TextChannel):
+                    raise ValueError(
+                        f"Channel {channel_id} is not a text channel. "
+                        "Only text channels are supported for now."
+                    )
+                # thread_dict maps thread_id to thread
+                thread_dict = {}
+                for thread in channel.threads:
+                    thread_dict[thread.id] = thread
+
+                async for msg in channel.history(
+                    limit=limit, oldest_first=oldest_first
+                ):
+                    messages.append(msg)
+                    if msg.id in thread_dict:
+                        thread = thread_dict[msg.id]
+                        async for thread_msg in thread.history(
+                            limit=limit, oldest_first=oldest_first
+                        ):
+                            messages.append(thread_msg)
+            except Exception as e:
+                print("Encountered error: " + str(e))
+            finally:
+                await self.close()
+
+    intents = discord.Intents.default()
+    intents.message_content = True
+    client = CustomClient(intents=intents)
+    await client.start(discord_token)
+
+    msg_txt_list = [m.content for m in messages]
+
+    return "\n\n".join(msg_txt_list)
+
+
+class DiscordReader(BaseReader):
+    """Discord reader.
+
+    Reads conversations from channels.
+
+    Args:
+        discord_token (Optional[str]): Discord token. If not provided, we
+            assume the environment variable `DISCORD_TOKEN` is set.
+
+    """
+
+    def __init__(self, discord_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        try:
+            import discord  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`discord.py` package not found, please run `pip install discord.py`"
+            )
+        if discord_token is None:
+            discord_token = os.environ["DISCORD_TOKEN"]
+            if discord_token is None:
+                raise ValueError(
+                    "Must specify `discord_token` or set environment "
+                    "variable `DISCORD_TOKEN`."
+                )
+
+        self.discord_token = discord_token
+
+    def _read_channel(
+        self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True
+    ) -> str:
+        """Read channel."""
+        result = asyncio.get_event_loop().run_until_complete(
+            read_channel(
+                self.discord_token, channel_id, limit=limit, oldest_first=oldest_first
+            )
+        )
+        return result
+
+    def load_data(
+        self,
+        channel_ids: List[int],
+        limit: Optional[int] = None,
+        oldest_first: bool = True,
+    ) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            channel_ids (List[int]): List of channel ids to read.
+            limit (Optional[int]): Maximum number of messages to read.
+            oldest_first (bool): Whether to read oldest messages first.
+                Defaults to `True`.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        results: List[Document] = []
+        for channel_id in channel_ids:
+            if not isinstance(channel_id, int):
+                raise ValueError(
+                    f"Channel id {channel_id} must be an integer, "
+                    f"not {type(channel_id)}."
+                )
+            channel_content = self._read_channel(
+                channel_id, limit=limit, oldest_first=oldest_first
+            )
+            results.append(
+                Document(channel_content, extra_info={"channel": channel_id})
+            )
+        return results
+
+
+if __name__ == "__main__":
+    reader = DiscordReader()
+    print("initialized reader")
+    output = reader.load_data(channel_ids=[1057178784895348746], limit=10)
+    print(output)
--- a/loader_hub/faiss/README.md
+++ b/loader_hub/faiss/README.md
--- a/loader_hub/faiss/init.py
+++ b/loader_hub/faiss/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/faiss/base.py
+++ b/loader_hub/faiss/base.py
@ -0,0 +1,75 @@
+"""Faiss reader."""
+
+from typing import Any, Dict, List
+
+import numpy as np
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class FaissReader(BaseReader):
+    """Faiss reader.
+
+    Retrieves documents through an existing in-memory Faiss index.
+    These documents can then be used in a downstream GPT Index data structure.
+    If you wish use Faiss itself as an index to to organize documents,
+    insert documents, and perform queries on them, please use GPTFaissIndex.
+
+    Args:
+        faiss_index (faiss.Index): A Faiss Index object (required)
+
+    """
+
+    def __init__(self, index: Any):
+        """Initialize with parameters."""
+        import_err_msg = """
+            `faiss` package not found. For instructions on
+            how to install `faiss` please visit
+            https://github.com/facebookresearch/faiss/wiki/Installing-Faiss
+        """
+        try:
+            import faiss  # noqa: F401
+        except ImportError:
+            raise ValueError(import_err_msg)
+
+        self._index = index
+
+    def load_data(
+        self,
+        query: np.ndarray,
+        id_to_text_map: Dict[str, str],
+        k: int = 4,
+        separate_documents: bool = True,
+    ) -> List[Document]:
+        """Load data from Faiss.
+
+        Args:
+            query (np.ndarray): A 2D numpy array of query vectors.
+            id_to_text_map (Dict[str, str]): A map from ID's to text.
+            k (int): Number of nearest neighbors to retrieve. Defaults to 4.
+            separate_documents (Optional[bool]): Whether to return separate
+                documents. Defaults to True.
+        Returns:
+            List[Document]: A list of documents.
+
+        """
+        dists, indices = self._index.search(query, k)
+        documents = []
+        for qidx in range(indices.shape[0]):
+            for didx in range(indices.shape[1]):
+                doc_id = indices[qidx, didx]
+                if doc_id not in id_to_text_map:
+                    raise ValueError(
+                        f"Document ID {doc_id} not found in id_to_text_map."
+                    )
+                text = id_to_text_map[doc_id]
+                documents.append(Document(text=text))
+
+        if not separate_documents:
+            # join all documents into one
+            text_list = [doc.get_text() for doc in documents]
+            text = "\n\n".join(text_list)
+            documents = [Document(text=text)]
+
+        return documents
--- a/loader_hub/file/init.py
+++ b/loader_hub/file/init.py
@ -1 +1 @@
-"""Init params."""
+"""Init file."""
--- a/loader_hub/library.json
+++ b/loader_hub/library.json
@ -12,7 +12,46 @@
    "id": "web/beautiful_soup_web",
    "author": "thejessezhang"
  },
-  "TrafilaturaWebReader": {
-    "id": "web/trafilatura_web"
+  "DatabaseReader": {
+    "id": "database"
+  },
+  "DiscordReader": {
+    "id": "discord"
+  },
+  "FaissReader": {
+    "id": "faiss"
+  },
+  "SimpleMongoReader": {
+    "id": "mongo"
+  },
+  "NotionPageReader": {
+    "id": "notion"
+  },
+  "ObsidianReader": {
+    "id": "obsidian"
+  },
+  "PineconeReader": {
+    "id": "pinecone"
+  },
+  "QdrantReader": {
+    "id": "qdrant"
+  },
+  "SlackReader": {
+    "id": "slack"
+  },
+  "StringIterableReader": {
+    "id": "string_iterable"
+  },
+  "TwitterTweetReader": {
+    "id": "twitter"
+  },
+  "WeaviateReader": {
+    "id": "weaviate"
+  },
+  "WikipediaReader": {
+    "id": "wikipedia"
+  },
+  "YoutubeTranscriptReader": {
+    "id": "youtube_transcript"
  }
 }
--- a/loader_hub/mongo/README.md
+++ b/loader_hub/mongo/README.md
--- a/loader_hub/mongo/init.py
+++ b/loader_hub/mongo/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/mongo/base.py
+++ b/loader_hub/mongo/base.py
@ -0,0 +1,59 @@
+"""Mongo client."""
+
+from typing import Dict, List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class SimpleMongoReader(BaseReader):
+    """Simple mongo reader.
+
+    Concatenates each Mongo doc into Document used by GPT Index.
+
+    Args:
+        host (str): Mongo host.
+        port (int): Mongo port.
+        max_docs (int): Maximum number of documents to load.
+
+    """
+
+    def __init__(self, host: str, port: int, max_docs: int = 1000) -> None:
+        """Initialize with parameters."""
+        try:
+            import pymongo  # noqa: F401
+            from pymongo import MongoClient  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`pymongo` package not found, please run `pip install pymongo`"
+            )
+        self.client: MongoClient = MongoClient(host, port)
+        self.max_docs = max_docs
+
+    def load_data(
+        self, db_name: str, collection_name: str, query_dict: Optional[Dict] = None
+    ) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            db_name (str): name of the database.
+            collection_name (str): name of the collection.
+            query_dict (Optional[Dict]): query to filter documents.
+                Defaults to None
+
+        Returns:
+            List[Document]: A list of documents.
+
+        """
+        documents = []
+        db = self.client[db_name]
+        if query_dict is None:
+            cursor = db[collection_name].find()
+        else:
+            cursor = db[collection_name].find(query_dict)
+
+        for item in cursor:
+            if "text" not in item:
+                raise ValueError("`text` field not found in Mongo document.")
+            documents.append(Document(item["text"]))
+        return documents
--- a/loader_hub/notion/README.md
+++ b/loader_hub/notion/README.md
--- a/loader_hub/notion/init.py
+++ b/loader_hub/notion/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/notion/base.py
+++ b/loader_hub/notion/base.py
@ -0,0 +1,166 @@
+"""Notion reader."""
+import os
+from typing import Any, Dict, List, Optional
+
+import requests  # type: ignore
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN"
+BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
+DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
+SEARCH_URL = "https://api.notion.com/v1/search"
+
+
+# TODO: Notion DB reader coming soon!
+class NotionPageReader(BaseReader):
+    """Notion Page reader.
+
+    Reads a set of Notion pages.
+
+    Args:
+        integration_token (str): Notion integration token.
+
+    """
+
+    def __init__(self, integration_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        if integration_token is None:
+            integration_token = os.getenv(INTEGRATION_TOKEN_NAME)
+            if integration_token is None:
+                raise ValueError(
+                    "Must specify `integration_token` or set environment "
+                    "variable `NOTION_INTEGRATION_TOKEN`."
+                )
+        self.token = integration_token
+        self.headers = {
+            "Authorization": "Bearer " + self.token,
+            "Content-Type": "application/json",
+            "Notion-Version": "2022-06-28",
+        }
+
+    def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
+        """Read a block."""
+        done = False
+        result_lines_arr = []
+        cur_block_id = block_id
+        while not done:
+            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
+            query_dict: Dict[str, Any] = {}
+
+            res = requests.request(
+                "GET", block_url, headers=self.headers, json=query_dict
+            )
+            data = res.json()
+
+            for result in data["results"]:
+                result_type = result["type"]
+                result_obj = result[result_type]
+
+                cur_result_text_arr = []
+                if "rich_text" in result_obj:
+                    for rich_text in result_obj["rich_text"]:
+                        # skip if doesn't have text object
+                        if "text" in rich_text:
+                            text = rich_text["text"]["content"]
+                            prefix = "\t" * num_tabs
+                            cur_result_text_arr.append(prefix + text)
+
+                result_block_id = result["id"]
+                has_children = result["has_children"]
+                if has_children:
+                    children_text = self._read_block(
+                        result_block_id, num_tabs=num_tabs + 1
+                    )
+                    cur_result_text_arr.append(children_text)
+
+                cur_result_text = "\n".join(cur_result_text_arr)
+                result_lines_arr.append(cur_result_text)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                cur_block_id = data["next_cursor"]
+
+        result_lines = "\n".join(result_lines_arr)
+        return result_lines
+
+    def read_page(self, page_id: str) -> str:
+        """Read a page."""
+        return self._read_block(page_id)
+
+    def query_database(
+        self, database_id: str, query_dict: Dict[str, Any] = {}
+    ) -> List[str]:
+        """Get all the pages from a Notion database."""
+        res = requests.post(
+            DATABASE_URL_TMPL.format(database_id=database_id),
+            headers=self.headers,
+            json=query_dict,
+        )
+        data = res.json()
+        page_ids = []
+        for result in data["results"]:
+            page_id = result["id"]
+            page_ids.append(page_id)
+
+        return page_ids
+
+    def search(self, query: str) -> List[str]:
+        """Search Notion page given a text query."""
+        done = False
+        next_cursor: Optional[str] = None
+        page_ids = []
+        while not done:
+            query_dict = {
+                "query": query,
+            }
+            if next_cursor is not None:
+                query_dict["start_cursor"] = next_cursor
+            res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict)
+            data = res.json()
+            for result in data["results"]:
+                page_id = result["id"]
+                page_ids.append(page_id)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                next_cursor = data["next_cursor"]
+        return page_ids
+
+    def load_data(
+        self, page_ids: List[str] = [], database_id: Optional[str] = None
+    ) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            page_ids (List[str]): List of page ids to load.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        if not page_ids and not database_id:
+            raise ValueError("Must specify either `page_ids` or `database_id`.")
+        docs = []
+        if database_id is not None:
+            # get all the pages in the database
+            page_ids = self.query_database(database_id)
+            for page_id in page_ids:
+                page_text = self.read_page(page_id)
+                docs.append(Document(page_text, extra_info={"page_id": page_id}))
+        else:
+            for page_id in page_ids:
+                page_text = self.read_page(page_id)
+                docs.append(Document(page_text, extra_info={"page_id": page_id}))
+
+        return docs
+
+
+if __name__ == "__main__":
+    reader = NotionPageReader()
+    print(reader.search("What I"))
--- a/loader_hub/obsidian/README.md
+++ b/loader_hub/obsidian/README.md
--- a/loader_hub/obsidian/init.py
+++ b/loader_hub/obsidian/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/obsidian/base.py
+++ b/loader_hub/obsidian/base.py
@ -0,0 +1,47 @@
+"""Obsidian reader class.
+
+Pass in the path to an Obsidian vault and it will parse all markdown
+files into a List of Documents,
+with each Document containing text from under an Obsidian header.
+
+"""
+import os
+from pathlib import Path
+from typing import Any, List
+
+from langchain.docstore.document import Document as LCDocument
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.file.markdown_parser import MarkdownParser
+from gpt_index.readers.schema.base import Document
+
+
+class ObsidianReader(BaseReader):
+    """Utilities for loading data from an Obsidian Vault.
+
+    Args:
+        input_dir (str): Path to the vault.
+
+    """
+
+    def __init__(self, input_dir: str, verbose: bool = False):
+        """Init params."""
+        self.verbose = verbose
+        self.input_dir = Path(input_dir)
+
+    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
+        """Load data from the input directory."""
+        docs: List[str] = []
+        for (dirpath, dirnames, filenames) in os.walk(self.input_dir):
+            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
+            for filename in filenames:
+                if filename.endswith(".md"):
+                    filepath = os.path.join(dirpath, filename)
+                    content = MarkdownParser().parse_file(Path(filepath))
+                    docs.extend(content)
+        return [Document(d) for d in docs]
+
+    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
+        """Load data in LangChain document format."""
+        docs = self.load_data(**load_kwargs)
+        return [d.to_langchain_format() for d in docs]
--- a/loader_hub/pinecone/README.md
+++ b/loader_hub/pinecone/README.md
--- a/loader_hub/pinecone/init.py
+++ b/loader_hub/pinecone/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/pinecone/base.py
+++ b/loader_hub/pinecone/base.py
@ -0,0 +1,81 @@
+"""Pinecone reader."""
+
+from typing import Any, Dict, List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class PineconeReader(BaseReader):
+    """Pinecone reader.
+
+    Args:
+        api_key (str): Pinecone API key.
+        environment (str): Pinecone environment.
+    """
+
+    def __init__(self, api_key: str, environment: str):
+        """Initialize with parameters."""
+        try:
+            import pinecone  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`pinecone` package not found, please run `pip install pinecone-client`"
+            )
+
+        self._api_key = api_key
+        self._environment = environment
+        pinecone.init(api_key=api_key, environment=environment)
+
+    def load_data(
+        self,
+        index_name: str,
+        id_to_text_map: Dict[str, str],
+        vector: Optional[List[float]],
+        top_k: int,
+        separate_documents: bool = True,
+        include_values: bool = True,
+        **query_kwargs: Any
+    ) -> List[Document]:
+        """Load data from Pinecone.
+
+        Args:
+            index_name (str): Name of the index.
+            id_to_text_map (Dict[str, str]): A map from ID's to text.
+            separate_documents (Optional[bool]): Whether to return separate
+                documents per retrieved entry. Defaults to True.
+            vector (List[float]): Query vector.
+            top_k (int): Number of results to return.
+            include_values (bool): Whether to include the embedding in the response.
+                Defaults to True.
+            **query_kwargs: Keyword arguments to pass to the query.
+                Arguments are the exact same as those found in
+                Pinecone's reference documentation for the
+                query method.
+
+        Returns:
+            List[Document]: A list of documents.
+        """
+        import pinecone
+
+        index = pinecone.Index(index_name)
+        if "include_values" not in query_kwargs:
+            query_kwargs["include_values"] = True
+        response = index.query(top_k=top_k, vector=vector, **query_kwargs)
+
+        documents = []
+        for match in response.matches:
+            if match.id not in id_to_text_map:
+                raise ValueError("ID not found in id_to_text_map.")
+            text = id_to_text_map[match.id]
+            embedding = match.values
+            if len(embedding) == 0:
+                embedding = None
+            documents.append(Document(text=text, embedding=embedding))
+
+        if not separate_documents:
+            text_list = [doc.get_text() for doc in documents]
+            text = "\n\n".join(text_list)
+            documents = [Document(text=text)]
+
+        return documents
--- a/loader_hub/qdrant/README.md
+++ b/loader_hub/qdrant/README.md
--- a/loader_hub/qdrant/init.py
+++ b/loader_hub/qdrant/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/qdrant/base.py
+++ b/loader_hub/qdrant/base.py
@ -0,0 +1,105 @@
+"""Qdrant reader."""
+
+from typing import List, Optional, cast
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class QdrantReader(BaseReader):
+    """Qdrant reader.
+
+    Retrieve documents from existing Qdrant collections.
+
+    Args:
+        host: Host name of Qdrant service.
+        port: Port of the REST API interface. Default: 6333
+        grpc_port: Port of the gRPC interface. Default: 6334
+        prefer_grpc: If `true` - use gPRC interface whenever possible in custom methods.
+        https: If `true` - use HTTPS(SSL) protocol. Default: `false`
+        api_key: API key for authentication in Qdrant Cloud. Default: `None`
+        prefix:
+            If not `None` - add `prefix` to the REST URL path.
+            Example: `service/v1` will result in
+            `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.
+            Default: `None`
+        timeout:
+            Timeout for REST and gRPC API requests.
+            Default: 5.0 seconds for REST and unlimited for gRPC
+    """
+
+    def __init__(
+        self,
+        host: str,
+        port: int = 6333,
+        grpc_port: int = 6334,
+        prefer_grpc: bool = False,
+        https: Optional[bool] = None,
+        api_key: Optional[str] = None,
+        prefix: Optional[str] = None,
+        timeout: Optional[float] = None,
+        verbose: bool = False,
+    ):
+        """Initialize with parameters."""
+        super().__init__(verbose)
+
+        import_err_msg = (
+            "`qdrant-client` package not found, please run `pip install qdrant-client`"
+        )
+        try:
+            import qdrant_client  # noqa: F401
+        except ImportError:
+            raise ValueError(import_err_msg)
+
+        self._client = qdrant_client.QdrantClient(
+            host=host,
+            port=port,
+            grpc_port=grpc_port,
+            prefer_grpc=prefer_grpc,
+            https=https,
+            api_key=api_key,
+            prefix=prefix,
+            timeout=timeout,
+        )
+
+    def load_data(
+        self,
+        collection_name: str,
+        query_vector: List[float],
+        limit: int = 10,
+    ) -> List[Document]:
+        """Load data from Qdrant.
+
+        Args:
+            collection_name (str): Name of the Qdrant collection.
+            query_vector (List[float]): Query vector.
+            limit (int): Number of results to return.
+
+        Returns:
+            List[Document]: A list of documents.
+        """
+        from qdrant_client.http.models.models import Payload
+
+        response = self._client.search(
+            collection_name=collection_name,
+            query_vector=query_vector,
+            with_vectors=True,
+            with_payload=True,
+            limit=limit,
+        )
+
+        documents = []
+        for point in response:
+            payload = cast(Payload, point)
+            try:
+                vector = cast(List[float], point.vector)
+            except ValueError as e:
+                raise ValueError("Could not cast vector to List[float].") from e
+            document = Document(
+                doc_id=payload.get("doc_id"),
+                text=payload.get("text"),
+                embedding=vector,
+            )
+            documents.append(document)
+
+        return documents
--- a/loader_hub/slack/README.md
+++ b/loader_hub/slack/README.md
--- a/loader_hub/slack/init.py
+++ b/loader_hub/slack/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/slack/base.py
+++ b/loader_hub/slack/base.py
@ -0,0 +1,143 @@
+"""Slack reader."""
+import logging
+import os
+import time
+from typing import List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+
+class SlackReader(BaseReader):
+    """Slack reader.
+
+    Reads conversations from channels.
+
+    Args:
+        slack_token (Optional[str]): Slack token. If not provided, we
+            assume the environment variable `SLACK_BOT_TOKEN` is set.
+
+    """
+
+    def __init__(self, slack_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        try:
+            from slack_sdk import WebClient
+        except ImportError:
+            raise ValueError(
+                "`slack_sdk` package not found, please run `pip install slack_sdk`"
+            )
+        if slack_token is None:
+            slack_token = os.environ["SLACK_BOT_TOKEN"]
+            if slack_token is None:
+                raise ValueError(
+                    "Must specify `slack_token` or set environment "
+                    "variable `SLACK_BOT_TOKEN`."
+                )
+        self.client = WebClient(token=slack_token)
+        res = self.client.api_test()
+        if not res["ok"]:
+            raise ValueError(f"Error initializing Slack API: {res['error']}")
+
+    def _read_message(self, channel_id: str, message_ts: str) -> str:
+        from slack_sdk.errors import SlackApiError
+
+        """Read a message."""
+
+        messages_text = []
+        next_cursor = None
+        while True:
+            try:
+                # https://slack.com/api/conversations.replies
+                # List all replies to a message, including the message itself.
+                result = self.client.conversations_replies(
+                    channel=channel_id, ts=message_ts, cursor=next_cursor
+                )
+                messages = result["messages"]
+                for message in messages:
+                    messages_text.append(message["text"])
+
+                if not result["has_more"]:
+                    break
+
+                next_cursor = result["response_metadata"]["next_cursor"]
+            except SlackApiError as e:
+                if e.response["error"] == "ratelimited":
+                    logger.error(
+                        "Rate limit error reached, sleeping for: {} seconds".format(
+                            e.response.headers["retry-after"]
+                        )
+                    )
+                    time.sleep(int(e.response.headers["retry-after"]))
+                else:
+                    logger.error("Error parsing conversation replies: {}".format(e))
+
+        return "\n\n".join(messages_text)
+
+    def _read_channel(self, channel_id: str) -> str:
+        from slack_sdk.errors import SlackApiError
+
+        """Read a channel."""
+
+        result_messages = []
+        next_cursor = None
+        while True:
+            try:
+                # Call the conversations.history method using the WebClient
+                # conversations.history returns the first 100 messages by default
+                # These results are paginated,
+                # see: https://api.slack.com/methods/conversations.history$pagination
+                result = self.client.conversations_history(
+                    channel=channel_id, cursor=next_cursor
+                )
+                conversation_history = result["messages"]
+                # Print results
+                logger.info(
+                    "{} messages found in {}".format(len(conversation_history), id)
+                )
+                for message in conversation_history:
+                    result_messages.append(
+                        self._read_message(channel_id, message["ts"])
+                    )
+
+                if not result["has_more"]:
+                    break
+                next_cursor = result["response_metadata"]["next_cursor"]
+
+            except SlackApiError as e:
+                if e.response["error"] == "ratelimited":
+                    logger.error(
+                        "Rate limit error reached, sleeping for: {} seconds".format(
+                            e.response.headers["retry-after"]
+                        )
+                    )
+                    time.sleep(int(e.response.headers["retry-after"]))
+                else:
+                    logger.error("Error parsing conversation replies: {}".format(e))
+
+        return "\n\n".join(result_messages)
+
+    def load_data(self, channel_ids: List[str]) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            channel_ids (List[str]): List of channel ids to read.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        results = []
+        for channel_id in channel_ids:
+            channel_content = self._read_channel(channel_id)
+            results.append(
+                Document(channel_content, extra_info={"channel": channel_id})
+            )
+        return results
+
+
+if __name__ == "__main__":
+    reader = SlackReader()
+    print(reader.load_data(channel_ids=["C04DC2VUY3F"]))
--- a/loader_hub/string_iterable/README.md
+++ b/loader_hub/string_iterable/README.md
--- a/loader_hub/string_iterable/init.py
+++ b/loader_hub/string_iterable/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/string_iterable/base.py
+++ b/loader_hub/string_iterable/base.py
@ -0,0 +1,32 @@
+"""Simple reader that turns an iterable of strings into a list of Documents."""
+from typing import List
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class StringIterableReader(BaseReader):
+    """String Iterable Reader.
+
+    Gets a list of documents, given an iterable (e.g. list) of strings.
+
+    Example:
+        .. code-block:: python
+
+            from gpt_index import StringIterableReader, GPTTreeIndex
+
+            documents = StringIterableReader().load_data(
+                texts=["I went to the store", "I bought an apple"])
+            index = GPTTreeIndex(documents)
+            index.query("what did I buy?")
+
+            # response should be something like "You bought an apple."
+    """
+
+    def load_data(self, texts: List[str]) -> List[Document]:
+        """Load the data."""
+        results = []
+        for text in texts:
+            results.append(Document(text))
+
+        return results
--- a/loader_hub/twitter/README.md
+++ b/loader_hub/twitter/README.md
--- a/loader_hub/twitter/init.py
+++ b/loader_hub/twitter/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/twitter/base.py
+++ b/loader_hub/twitter/base.py
@ -0,0 +1,60 @@
+"""Simple reader that reads tweets of a twitter handle."""
+from typing import Any, List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class TwitterTweetReader(BaseReader):
+    """Twitter tweets reader.
+
+    Read tweets of user twitter handle.
+
+    Check 'https://developer.twitter.com/en/docs/twitter-api/\
+        getting-started/getting-access-to-the-twitter-api' \
+        on how to get access to twitter API.
+
+    Args:
+        bearer_token (str): bearer_token that you get from twitter API.
+        num_tweets (Optional[int]): Number of tweets for each user twitter handle.\
+            Default is 100 tweets.
+    """
+
+    def __init__(
+        self,
+        bearer_token: str,
+        num_tweets: Optional[int] = 100,
+        verbose: bool = False,
+    ) -> None:
+        """Initialize with parameters."""
+        super().__init__(verbose=verbose)
+        self.bearer_token = bearer_token
+        self.num_tweets = num_tweets
+
+    def load_data(
+        self, twitterhandles: List[str], **load_kwargs: Any
+    ) -> List[Document]:
+        """Load tweets of twitter handles.
+
+        Args:
+            twitterhandles (List[str]): List of user twitter handles to read tweets.
+
+        """
+        try:
+            import tweepy
+        except ImportError:
+            raise ValueError(
+                "`tweepy` package not found, please run `pip install tweepy`"
+            )
+
+        client = tweepy.Client(bearer_token=self.bearer_token)
+        results = []
+        for username in twitterhandles:
+            # tweets = api.user_timeline(screen_name=user, count=self.num_tweets)
+            user = client.get_user(username=username)
+            tweets = client.get_users_tweets(user.data.id, max_results=self.num_tweets)
+            response = " "
+            for tweet in tweets.data:
+                response = response + tweet.text + "\n"
+            results.append(Document(response))
+        return results
--- a/loader_hub/weaviate/README.md
+++ b/loader_hub/weaviate/README.md
--- a/loader_hub/weaviate/init.py
+++ b/loader_hub/weaviate/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/weaviate/base.py
+++ b/loader_hub/weaviate/base.py
@ -0,0 +1,116 @@
+"""Weaviate reader."""
+
+from typing import Any, List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class WeaviateReader(BaseReader):
+    """Weaviate reader.
+
+    Retrieves documents from Weaviate through vector lookup. Allows option
+    to concatenate retrieved documents into one Document, or to return
+    separate Document objects per document.
+
+    Args:
+        host (str): host.
+        auth_client_secret (Optional[weaviate.auth.AuthCredentials]):
+            auth_client_secret.
+    """
+
+    def __init__(
+        self,
+        host: str,
+        auth_client_secret: Optional[Any] = None,
+    ) -> None:
+        """Initialize with parameters."""
+        try:
+            import weaviate  # noqa: F401
+            from weaviate import Client  # noqa: F401
+            from weaviate.auth import AuthCredentials  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`weaviate` package not found, please run `pip install weaviate-client`"
+            )
+
+        self.client: Client = Client(host, auth_client_secret=auth_client_secret)
+
+    def load_data(
+        self,
+        class_name: Optional[str] = None,
+        properties: Optional[List[str]] = None,
+        graphql_query: Optional[str] = None,
+        separate_documents: Optional[bool] = True,
+    ) -> List[Document]:
+        """Load data from Weaviate.
+
+        If `graphql_query` is not found in load_kwargs, we assume that
+        `class_name` and `properties` are provided.
+
+        Args:
+            class_name (Optional[str]): class_name to retrieve documents from.
+            properties (Optional[List[str]]): properties to retrieve from documents.
+            graphql_query (Optional[str]): Raw GraphQL Query.
+                We assume that the query is a Get query.
+            separate_documents (Optional[bool]): Whether to return separate
+                documents. Defaults to True.
+
+        Returns:
+            List[Document]: A list of documents.
+
+        """
+        if class_name is not None and properties is not None:
+            props_txt = "\n".join(properties)
+            graphql_query = f"""
+            {{
+                Get {{
+                    {class_name} {{
+                        {props_txt}
+                    }}
+                }}
+            }}
+            """
+        elif graphql_query is not None:
+            pass
+        else:
+            raise ValueError(
+                "Either `class_name` and `properties` must be specified, "
+                "or `graphql_query` must be specified."
+            )
+
+        response = self.client.query.raw(graphql_query)
+        if "errors" in response:
+            raise ValueError("Invalid query, got errors: {}".format(response["errors"]))
+
+        data_response = response["data"]
+        if "Get" not in data_response:
+            raise ValueError("Invalid query response, must be a Get query.")
+
+        if class_name is None:
+            # infer class_name if only graphql_query was provided
+            class_name = list(data_response["Get"].keys())[0]
+        entries = data_response["Get"][class_name]
+        documents = []
+        for entry in entries:
+            embedding = None
+            # for each entry, join properties into <property>:<value>
+            # separated by newlines
+            text_list = []
+            for k, v in entry.items():
+                if k == "_additional":
+                    if "vector" in v:
+                        embedding = v["vector"]
+                    continue
+                text_list.append(f"{k}: {v}")
+
+            text = "\n".join(text_list)
+            documents.append(Document(text=text, embedding=embedding))
+
+        if not separate_documents:
+            # join all documents into one
+            text_list = [doc.get_text() for doc in documents]
+            text = "\n\n".join(text_list)
+            documents = [Document(text=text)]
+
+        return documents
--- a/loader_hub/wikipedia/README.md
+++ b/loader_hub/wikipedia/README.md
--- a/loader_hub/wikipedia/init.py
+++ b/loader_hub/wikipedia/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/wikipedia/base.py
+++ b/loader_hub/wikipedia/base.py
@ -0,0 +1,37 @@
+"""Simple reader that reads wikipedia."""
+from typing import Any, List
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class WikipediaReader(BaseReader):
+    """Wikipedia reader.
+
+    Reads a page.
+
+    """
+
+    def __init__(self) -> None:
+        """Initialize with parameters."""
+        try:
+            import wikipedia  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`wikipedia` package not found, please run `pip install wikipedia`"
+            )
+
+    def load_data(self, pages: List[str], **load_kwargs: Any) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            pages (List[str]): List of pages to read.
+
+        """
+        import wikipedia
+
+        results = []
+        for page in pages:
+            page_content = wikipedia.page(page, **load_kwargs).content
+            results.append(Document(page_content))
+        return results
--- a/loader_hub/youtube_transcript/README.md
+++ b/loader_hub/youtube_transcript/README.md
--- a/loader_hub/youtube_transcript/init.py
+++ b/loader_hub/youtube_transcript/init.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/youtube_transcript/base.py
+++ b/loader_hub/youtube_transcript/base.py
@ -0,0 +1,38 @@
+"""Simple Reader that reads transcript of youtube video."""
+from typing import Any, List
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+
+class YoutubeTranscriptReader(BaseReader):
+    """Youtube Transcript reader."""
+
+    def __init__(self) -> None:
+        """Initialize with parameters."""
+
+    def load_data(self, ytlinks: List[str], **load_kwargs: Any) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            pages (List[str]): List of youtube links \
+                for which transcripts are to be read.
+
+        """
+        try:
+            from youtube_transcript_api import YouTubeTranscriptApi
+        except ImportError:
+            raise ValueError(
+                "`youtube_transcript_api` package not found, \
+                    please run `pip install youtube-transcript-api`"
+            )
+
+        results = []
+        for link in ytlinks:
+            video_id = link.split("?v=")[-1]
+            srt = YouTubeTranscriptApi.get_transcript(video_id)
+            transcript = ""
+            for chunk in srt:
+                transcript = transcript + chunk["text"] + "\n"
+            results.append(Document(transcript))
+        return results