mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-11-30 08:59:55 +00:00
Added all other files
This commit is contained in:
parent
5cfa2a5098
commit
8889cc77c0
5
loader_hub/add_loader.sh
Executable file
5
loader_hub/add_loader.sh
Executable file
@ -0,0 +1,5 @@
|
||||
mkdir $1;
|
||||
touch $1/base.py;
|
||||
touch $1/README.md;
|
||||
touch $1/__init__.py;
|
||||
echo "\"\"\"Init file.\"\"\"" > $1/__init__.py;
|
||||
0
loader_hub/database/README.md
Normal file
0
loader_hub/database/README.md
Normal file
1
loader_hub/database/__init__.py
Normal file
1
loader_hub/database/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
95
loader_hub/database/base.py
Normal file
95
loader_hub/database/base.py
Normal file
@ -0,0 +1,95 @@
|
||||
"""Database Reader."""
|
||||
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.engine import Engine
|
||||
|
||||
from gpt_index.langchain_helpers.sql_wrapper import SQLDatabase
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class DatabaseReader(BaseReader):
|
||||
"""Simple Database reader.
|
||||
|
||||
Concatenates each row into Document used by GPT Index.
|
||||
|
||||
Args:
|
||||
sql_database (Optional[SQLDatabase]): SQL database to use,
|
||||
including table names to specify.
|
||||
See :ref:`Ref-Struct-Store` for more details.
|
||||
|
||||
OR
|
||||
|
||||
engine (Optional[Engine]): SQLAlchemy Engine object of the database connection.
|
||||
|
||||
OR
|
||||
|
||||
uri (Optional[str]): uri of the database connection.
|
||||
|
||||
OR
|
||||
|
||||
scheme (Optional[str]): scheme of the database connection.
|
||||
host (Optional[str]): host of the database connection.
|
||||
port (Optional[int]): port of the database connection.
|
||||
user (Optional[str]): user of the database connection.
|
||||
password (Optional[str]): password of the database connection.
|
||||
dbname (Optional[str]): dbname of the database connection.
|
||||
|
||||
Returns:
|
||||
DatabaseReader: A DatabaseReader object.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sql_database: Optional[SQLDatabase] = None,
|
||||
engine: Optional[Engine] = None,
|
||||
uri: Optional[str] = None,
|
||||
scheme: Optional[str] = None,
|
||||
host: Optional[str] = None,
|
||||
port: Optional[str] = None,
|
||||
user: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
dbname: Optional[str] = None,
|
||||
*args: Optional[Any],
|
||||
**kwargs: Optional[Any],
|
||||
) -> None:
|
||||
"""Initialize with parameters."""
|
||||
if sql_database:
|
||||
self.sql_database = sql_database
|
||||
elif engine:
|
||||
self.sql_database = SQLDatabase(engine, *args, **kwargs)
|
||||
elif uri:
|
||||
self.uri = uri
|
||||
self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
|
||||
elif scheme and host and port and user and password and dbname:
|
||||
uri = f"{scheme}://{user}:{password}@{host}:{port}/{dbname}"
|
||||
self.uri = uri
|
||||
self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs)
|
||||
else:
|
||||
raise ValueError(
|
||||
"You must provide either a SQLDatabase, "
|
||||
"a SQL Alchemy Engine, a valid connection URI, or a valid "
|
||||
"set of credentials."
|
||||
)
|
||||
|
||||
def load_data(self, query: str) -> List[Document]:
|
||||
"""Query and load data from the Database, returning a list of Documents.
|
||||
|
||||
Args:
|
||||
query (str): Query parameter to filter tables and rows.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of Document objects.
|
||||
"""
|
||||
documents = []
|
||||
with self.sql_database.engine.connect() as connection:
|
||||
if query is None:
|
||||
raise ValueError("A query parameter is necessary to filter the data")
|
||||
else:
|
||||
result = connection.execute(text(query))
|
||||
|
||||
for item in result.fetchall():
|
||||
documents.append(Document(item[0]))
|
||||
return documents
|
||||
0
loader_hub/discord/README.md
Normal file
0
loader_hub/discord/README.md
Normal file
1
loader_hub/discord/__init__.py
Normal file
1
loader_hub/discord/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
147
loader_hub/discord/base.py
Normal file
147
loader_hub/discord/base.py
Normal file
@ -0,0 +1,147 @@
|
||||
"""Discord reader."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def read_channel(
|
||||
discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool
|
||||
) -> str:
|
||||
"""Async read channel.
|
||||
|
||||
Note: This is our hack to create a synchronous interface to the
|
||||
async discord.py API. We use the `asyncio` module to run
|
||||
this function with `asyncio.get_event_loop().run_until_complete`.
|
||||
|
||||
"""
|
||||
import discord # noqa: F401
|
||||
|
||||
messages: List[discord.Message] = []
|
||||
|
||||
class CustomClient(discord.Client):
|
||||
async def on_ready(self) -> None:
|
||||
try:
|
||||
print(f"{self.user} has connected to Discord!")
|
||||
channel = client.get_channel(channel_id)
|
||||
# only work for text channels for now
|
||||
if not isinstance(channel, discord.TextChannel):
|
||||
raise ValueError(
|
||||
f"Channel {channel_id} is not a text channel. "
|
||||
"Only text channels are supported for now."
|
||||
)
|
||||
# thread_dict maps thread_id to thread
|
||||
thread_dict = {}
|
||||
for thread in channel.threads:
|
||||
thread_dict[thread.id] = thread
|
||||
|
||||
async for msg in channel.history(
|
||||
limit=limit, oldest_first=oldest_first
|
||||
):
|
||||
messages.append(msg)
|
||||
if msg.id in thread_dict:
|
||||
thread = thread_dict[msg.id]
|
||||
async for thread_msg in thread.history(
|
||||
limit=limit, oldest_first=oldest_first
|
||||
):
|
||||
messages.append(thread_msg)
|
||||
except Exception as e:
|
||||
print("Encountered error: " + str(e))
|
||||
finally:
|
||||
await self.close()
|
||||
|
||||
intents = discord.Intents.default()
|
||||
intents.message_content = True
|
||||
client = CustomClient(intents=intents)
|
||||
await client.start(discord_token)
|
||||
|
||||
msg_txt_list = [m.content for m in messages]
|
||||
|
||||
return "\n\n".join(msg_txt_list)
|
||||
|
||||
|
||||
class DiscordReader(BaseReader):
|
||||
"""Discord reader.
|
||||
|
||||
Reads conversations from channels.
|
||||
|
||||
Args:
|
||||
discord_token (Optional[str]): Discord token. If not provided, we
|
||||
assume the environment variable `DISCORD_TOKEN` is set.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, discord_token: Optional[str] = None) -> None:
|
||||
"""Initialize with parameters."""
|
||||
try:
|
||||
import discord # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`discord.py` package not found, please run `pip install discord.py`"
|
||||
)
|
||||
if discord_token is None:
|
||||
discord_token = os.environ["DISCORD_TOKEN"]
|
||||
if discord_token is None:
|
||||
raise ValueError(
|
||||
"Must specify `discord_token` or set environment "
|
||||
"variable `DISCORD_TOKEN`."
|
||||
)
|
||||
|
||||
self.discord_token = discord_token
|
||||
|
||||
def _read_channel(
|
||||
self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True
|
||||
) -> str:
|
||||
"""Read channel."""
|
||||
result = asyncio.get_event_loop().run_until_complete(
|
||||
read_channel(
|
||||
self.discord_token, channel_id, limit=limit, oldest_first=oldest_first
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
channel_ids: List[int],
|
||||
limit: Optional[int] = None,
|
||||
oldest_first: bool = True,
|
||||
) -> List[Document]:
|
||||
"""Load data from the input directory.
|
||||
|
||||
Args:
|
||||
channel_ids (List[int]): List of channel ids to read.
|
||||
limit (Optional[int]): Maximum number of messages to read.
|
||||
oldest_first (bool): Whether to read oldest messages first.
|
||||
Defaults to `True`.
|
||||
|
||||
Returns:
|
||||
List[Document]: List of documents.
|
||||
|
||||
"""
|
||||
results: List[Document] = []
|
||||
for channel_id in channel_ids:
|
||||
if not isinstance(channel_id, int):
|
||||
raise ValueError(
|
||||
f"Channel id {channel_id} must be an integer, "
|
||||
f"not {type(channel_id)}."
|
||||
)
|
||||
channel_content = self._read_channel(
|
||||
channel_id, limit=limit, oldest_first=oldest_first
|
||||
)
|
||||
results.append(
|
||||
Document(channel_content, extra_info={"channel": channel_id})
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
reader = DiscordReader()
|
||||
print("initialized reader")
|
||||
output = reader.load_data(channel_ids=[1057178784895348746], limit=10)
|
||||
print(output)
|
||||
0
loader_hub/faiss/README.md
Normal file
0
loader_hub/faiss/README.md
Normal file
1
loader_hub/faiss/__init__.py
Normal file
1
loader_hub/faiss/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
75
loader_hub/faiss/base.py
Normal file
75
loader_hub/faiss/base.py
Normal file
@ -0,0 +1,75 @@
|
||||
"""Faiss reader."""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class FaissReader(BaseReader):
|
||||
"""Faiss reader.
|
||||
|
||||
Retrieves documents through an existing in-memory Faiss index.
|
||||
These documents can then be used in a downstream GPT Index data structure.
|
||||
If you wish use Faiss itself as an index to to organize documents,
|
||||
insert documents, and perform queries on them, please use GPTFaissIndex.
|
||||
|
||||
Args:
|
||||
faiss_index (faiss.Index): A Faiss Index object (required)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, index: Any):
|
||||
"""Initialize with parameters."""
|
||||
import_err_msg = """
|
||||
`faiss` package not found. For instructions on
|
||||
how to install `faiss` please visit
|
||||
https://github.com/facebookresearch/faiss/wiki/Installing-Faiss
|
||||
"""
|
||||
try:
|
||||
import faiss # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(import_err_msg)
|
||||
|
||||
self._index = index
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
query: np.ndarray,
|
||||
id_to_text_map: Dict[str, str],
|
||||
k: int = 4,
|
||||
separate_documents: bool = True,
|
||||
) -> List[Document]:
|
||||
"""Load data from Faiss.
|
||||
|
||||
Args:
|
||||
query (np.ndarray): A 2D numpy array of query vectors.
|
||||
id_to_text_map (Dict[str, str]): A map from ID's to text.
|
||||
k (int): Number of nearest neighbors to retrieve. Defaults to 4.
|
||||
separate_documents (Optional[bool]): Whether to return separate
|
||||
documents. Defaults to True.
|
||||
Returns:
|
||||
List[Document]: A list of documents.
|
||||
|
||||
"""
|
||||
dists, indices = self._index.search(query, k)
|
||||
documents = []
|
||||
for qidx in range(indices.shape[0]):
|
||||
for didx in range(indices.shape[1]):
|
||||
doc_id = indices[qidx, didx]
|
||||
if doc_id not in id_to_text_map:
|
||||
raise ValueError(
|
||||
f"Document ID {doc_id} not found in id_to_text_map."
|
||||
)
|
||||
text = id_to_text_map[doc_id]
|
||||
documents.append(Document(text=text))
|
||||
|
||||
if not separate_documents:
|
||||
# join all documents into one
|
||||
text_list = [doc.get_text() for doc in documents]
|
||||
text = "\n\n".join(text_list)
|
||||
documents = [Document(text=text)]
|
||||
|
||||
return documents
|
||||
@ -1 +1 @@
|
||||
"""Init params."""
|
||||
"""Init file."""
|
||||
|
||||
@ -12,7 +12,46 @@
|
||||
"id": "web/beautiful_soup_web",
|
||||
"author": "thejessezhang"
|
||||
},
|
||||
"TrafilaturaWebReader": {
|
||||
"id": "web/trafilatura_web"
|
||||
"DatabaseReader": {
|
||||
"id": "database"
|
||||
},
|
||||
"DiscordReader": {
|
||||
"id": "discord"
|
||||
},
|
||||
"FaissReader": {
|
||||
"id": "faiss"
|
||||
},
|
||||
"SimpleMongoReader": {
|
||||
"id": "mongo"
|
||||
},
|
||||
"NotionPageReader": {
|
||||
"id": "notion"
|
||||
},
|
||||
"ObsidianReader": {
|
||||
"id": "obsidian"
|
||||
},
|
||||
"PineconeReader": {
|
||||
"id": "pinecone"
|
||||
},
|
||||
"QdrantReader": {
|
||||
"id": "qdrant"
|
||||
},
|
||||
"SlackReader": {
|
||||
"id": "slack"
|
||||
},
|
||||
"StringIterableReader": {
|
||||
"id": "string_iterable"
|
||||
},
|
||||
"TwitterTweetReader": {
|
||||
"id": "twitter"
|
||||
},
|
||||
"WeaviateReader": {
|
||||
"id": "weaviate"
|
||||
},
|
||||
"WikipediaReader": {
|
||||
"id": "wikipedia"
|
||||
},
|
||||
"YoutubeTranscriptReader": {
|
||||
"id": "youtube_transcript"
|
||||
}
|
||||
}
|
||||
0
loader_hub/mongo/README.md
Normal file
0
loader_hub/mongo/README.md
Normal file
1
loader_hub/mongo/__init__.py
Normal file
1
loader_hub/mongo/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
59
loader_hub/mongo/base.py
Normal file
59
loader_hub/mongo/base.py
Normal file
@ -0,0 +1,59 @@
|
||||
"""Mongo client."""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class SimpleMongoReader(BaseReader):
|
||||
"""Simple mongo reader.
|
||||
|
||||
Concatenates each Mongo doc into Document used by GPT Index.
|
||||
|
||||
Args:
|
||||
host (str): Mongo host.
|
||||
port (int): Mongo port.
|
||||
max_docs (int): Maximum number of documents to load.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, host: str, port: int, max_docs: int = 1000) -> None:
|
||||
"""Initialize with parameters."""
|
||||
try:
|
||||
import pymongo # noqa: F401
|
||||
from pymongo import MongoClient # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`pymongo` package not found, please run `pip install pymongo`"
|
||||
)
|
||||
self.client: MongoClient = MongoClient(host, port)
|
||||
self.max_docs = max_docs
|
||||
|
||||
def load_data(
|
||||
self, db_name: str, collection_name: str, query_dict: Optional[Dict] = None
|
||||
) -> List[Document]:
|
||||
"""Load data from the input directory.
|
||||
|
||||
Args:
|
||||
db_name (str): name of the database.
|
||||
collection_name (str): name of the collection.
|
||||
query_dict (Optional[Dict]): query to filter documents.
|
||||
Defaults to None
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of documents.
|
||||
|
||||
"""
|
||||
documents = []
|
||||
db = self.client[db_name]
|
||||
if query_dict is None:
|
||||
cursor = db[collection_name].find()
|
||||
else:
|
||||
cursor = db[collection_name].find(query_dict)
|
||||
|
||||
for item in cursor:
|
||||
if "text" not in item:
|
||||
raise ValueError("`text` field not found in Mongo document.")
|
||||
documents.append(Document(item["text"]))
|
||||
return documents
|
||||
0
loader_hub/notion/README.md
Normal file
0
loader_hub/notion/README.md
Normal file
1
loader_hub/notion/__init__.py
Normal file
1
loader_hub/notion/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
166
loader_hub/notion/base.py
Normal file
166
loader_hub/notion/base.py
Normal file
@ -0,0 +1,166 @@
|
||||
"""Notion reader."""
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import requests # type: ignore
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN"
|
||||
BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
|
||||
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
|
||||
SEARCH_URL = "https://api.notion.com/v1/search"
|
||||
|
||||
|
||||
# TODO: Notion DB reader coming soon!
|
||||
class NotionPageReader(BaseReader):
|
||||
"""Notion Page reader.
|
||||
|
||||
Reads a set of Notion pages.
|
||||
|
||||
Args:
|
||||
integration_token (str): Notion integration token.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, integration_token: Optional[str] = None) -> None:
|
||||
"""Initialize with parameters."""
|
||||
if integration_token is None:
|
||||
integration_token = os.getenv(INTEGRATION_TOKEN_NAME)
|
||||
if integration_token is None:
|
||||
raise ValueError(
|
||||
"Must specify `integration_token` or set environment "
|
||||
"variable `NOTION_INTEGRATION_TOKEN`."
|
||||
)
|
||||
self.token = integration_token
|
||||
self.headers = {
|
||||
"Authorization": "Bearer " + self.token,
|
||||
"Content-Type": "application/json",
|
||||
"Notion-Version": "2022-06-28",
|
||||
}
|
||||
|
||||
def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
|
||||
"""Read a block."""
|
||||
done = False
|
||||
result_lines_arr = []
|
||||
cur_block_id = block_id
|
||||
while not done:
|
||||
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
|
||||
query_dict: Dict[str, Any] = {}
|
||||
|
||||
res = requests.request(
|
||||
"GET", block_url, headers=self.headers, json=query_dict
|
||||
)
|
||||
data = res.json()
|
||||
|
||||
for result in data["results"]:
|
||||
result_type = result["type"]
|
||||
result_obj = result[result_type]
|
||||
|
||||
cur_result_text_arr = []
|
||||
if "rich_text" in result_obj:
|
||||
for rich_text in result_obj["rich_text"]:
|
||||
# skip if doesn't have text object
|
||||
if "text" in rich_text:
|
||||
text = rich_text["text"]["content"]
|
||||
prefix = "\t" * num_tabs
|
||||
cur_result_text_arr.append(prefix + text)
|
||||
|
||||
result_block_id = result["id"]
|
||||
has_children = result["has_children"]
|
||||
if has_children:
|
||||
children_text = self._read_block(
|
||||
result_block_id, num_tabs=num_tabs + 1
|
||||
)
|
||||
cur_result_text_arr.append(children_text)
|
||||
|
||||
cur_result_text = "\n".join(cur_result_text_arr)
|
||||
result_lines_arr.append(cur_result_text)
|
||||
|
||||
if data["next_cursor"] is None:
|
||||
done = True
|
||||
break
|
||||
else:
|
||||
cur_block_id = data["next_cursor"]
|
||||
|
||||
result_lines = "\n".join(result_lines_arr)
|
||||
return result_lines
|
||||
|
||||
def read_page(self, page_id: str) -> str:
|
||||
"""Read a page."""
|
||||
return self._read_block(page_id)
|
||||
|
||||
def query_database(
|
||||
self, database_id: str, query_dict: Dict[str, Any] = {}
|
||||
) -> List[str]:
|
||||
"""Get all the pages from a Notion database."""
|
||||
res = requests.post(
|
||||
DATABASE_URL_TMPL.format(database_id=database_id),
|
||||
headers=self.headers,
|
||||
json=query_dict,
|
||||
)
|
||||
data = res.json()
|
||||
page_ids = []
|
||||
for result in data["results"]:
|
||||
page_id = result["id"]
|
||||
page_ids.append(page_id)
|
||||
|
||||
return page_ids
|
||||
|
||||
def search(self, query: str) -> List[str]:
|
||||
"""Search Notion page given a text query."""
|
||||
done = False
|
||||
next_cursor: Optional[str] = None
|
||||
page_ids = []
|
||||
while not done:
|
||||
query_dict = {
|
||||
"query": query,
|
||||
}
|
||||
if next_cursor is not None:
|
||||
query_dict["start_cursor"] = next_cursor
|
||||
res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict)
|
||||
data = res.json()
|
||||
for result in data["results"]:
|
||||
page_id = result["id"]
|
||||
page_ids.append(page_id)
|
||||
|
||||
if data["next_cursor"] is None:
|
||||
done = True
|
||||
break
|
||||
else:
|
||||
next_cursor = data["next_cursor"]
|
||||
return page_ids
|
||||
|
||||
def load_data(
|
||||
self, page_ids: List[str] = [], database_id: Optional[str] = None
|
||||
) -> List[Document]:
|
||||
"""Load data from the input directory.
|
||||
|
||||
Args:
|
||||
page_ids (List[str]): List of page ids to load.
|
||||
|
||||
Returns:
|
||||
List[Document]: List of documents.
|
||||
|
||||
"""
|
||||
if not page_ids and not database_id:
|
||||
raise ValueError("Must specify either `page_ids` or `database_id`.")
|
||||
docs = []
|
||||
if database_id is not None:
|
||||
# get all the pages in the database
|
||||
page_ids = self.query_database(database_id)
|
||||
for page_id in page_ids:
|
||||
page_text = self.read_page(page_id)
|
||||
docs.append(Document(page_text, extra_info={"page_id": page_id}))
|
||||
else:
|
||||
for page_id in page_ids:
|
||||
page_text = self.read_page(page_id)
|
||||
docs.append(Document(page_text, extra_info={"page_id": page_id}))
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
reader = NotionPageReader()
|
||||
print(reader.search("What I"))
|
||||
0
loader_hub/obsidian/README.md
Normal file
0
loader_hub/obsidian/README.md
Normal file
1
loader_hub/obsidian/__init__.py
Normal file
1
loader_hub/obsidian/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
47
loader_hub/obsidian/base.py
Normal file
47
loader_hub/obsidian/base.py
Normal file
@ -0,0 +1,47 @@
|
||||
"""Obsidian reader class.
|
||||
|
||||
Pass in the path to an Obsidian vault and it will parse all markdown
|
||||
files into a List of Documents,
|
||||
with each Document containing text from under an Obsidian header.
|
||||
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document as LCDocument
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.file.markdown_parser import MarkdownParser
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class ObsidianReader(BaseReader):
|
||||
"""Utilities for loading data from an Obsidian Vault.
|
||||
|
||||
Args:
|
||||
input_dir (str): Path to the vault.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dir: str, verbose: bool = False):
|
||||
"""Init params."""
|
||||
self.verbose = verbose
|
||||
self.input_dir = Path(input_dir)
|
||||
|
||||
def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
|
||||
"""Load data from the input directory."""
|
||||
docs: List[str] = []
|
||||
for (dirpath, dirnames, filenames) in os.walk(self.input_dir):
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||
for filename in filenames:
|
||||
if filename.endswith(".md"):
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
content = MarkdownParser().parse_file(Path(filepath))
|
||||
docs.extend(content)
|
||||
return [Document(d) for d in docs]
|
||||
|
||||
def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
|
||||
"""Load data in LangChain document format."""
|
||||
docs = self.load_data(**load_kwargs)
|
||||
return [d.to_langchain_format() for d in docs]
|
||||
0
loader_hub/pinecone/README.md
Normal file
0
loader_hub/pinecone/README.md
Normal file
1
loader_hub/pinecone/__init__.py
Normal file
1
loader_hub/pinecone/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
81
loader_hub/pinecone/base.py
Normal file
81
loader_hub/pinecone/base.py
Normal file
@ -0,0 +1,81 @@
|
||||
"""Pinecone reader."""
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class PineconeReader(BaseReader):
|
||||
"""Pinecone reader.
|
||||
|
||||
Args:
|
||||
api_key (str): Pinecone API key.
|
||||
environment (str): Pinecone environment.
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str, environment: str):
|
||||
"""Initialize with parameters."""
|
||||
try:
|
||||
import pinecone # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`pinecone` package not found, please run `pip install pinecone-client`"
|
||||
)
|
||||
|
||||
self._api_key = api_key
|
||||
self._environment = environment
|
||||
pinecone.init(api_key=api_key, environment=environment)
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
index_name: str,
|
||||
id_to_text_map: Dict[str, str],
|
||||
vector: Optional[List[float]],
|
||||
top_k: int,
|
||||
separate_documents: bool = True,
|
||||
include_values: bool = True,
|
||||
**query_kwargs: Any
|
||||
) -> List[Document]:
|
||||
"""Load data from Pinecone.
|
||||
|
||||
Args:
|
||||
index_name (str): Name of the index.
|
||||
id_to_text_map (Dict[str, str]): A map from ID's to text.
|
||||
separate_documents (Optional[bool]): Whether to return separate
|
||||
documents per retrieved entry. Defaults to True.
|
||||
vector (List[float]): Query vector.
|
||||
top_k (int): Number of results to return.
|
||||
include_values (bool): Whether to include the embedding in the response.
|
||||
Defaults to True.
|
||||
**query_kwargs: Keyword arguments to pass to the query.
|
||||
Arguments are the exact same as those found in
|
||||
Pinecone's reference documentation for the
|
||||
query method.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of documents.
|
||||
"""
|
||||
import pinecone
|
||||
|
||||
index = pinecone.Index(index_name)
|
||||
if "include_values" not in query_kwargs:
|
||||
query_kwargs["include_values"] = True
|
||||
response = index.query(top_k=top_k, vector=vector, **query_kwargs)
|
||||
|
||||
documents = []
|
||||
for match in response.matches:
|
||||
if match.id not in id_to_text_map:
|
||||
raise ValueError("ID not found in id_to_text_map.")
|
||||
text = id_to_text_map[match.id]
|
||||
embedding = match.values
|
||||
if len(embedding) == 0:
|
||||
embedding = None
|
||||
documents.append(Document(text=text, embedding=embedding))
|
||||
|
||||
if not separate_documents:
|
||||
text_list = [doc.get_text() for doc in documents]
|
||||
text = "\n\n".join(text_list)
|
||||
documents = [Document(text=text)]
|
||||
|
||||
return documents
|
||||
0
loader_hub/qdrant/README.md
Normal file
0
loader_hub/qdrant/README.md
Normal file
1
loader_hub/qdrant/__init__.py
Normal file
1
loader_hub/qdrant/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
105
loader_hub/qdrant/base.py
Normal file
105
loader_hub/qdrant/base.py
Normal file
@ -0,0 +1,105 @@
|
||||
"""Qdrant reader."""
|
||||
|
||||
from typing import List, Optional, cast
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class QdrantReader(BaseReader):
|
||||
"""Qdrant reader.
|
||||
|
||||
Retrieve documents from existing Qdrant collections.
|
||||
|
||||
Args:
|
||||
host: Host name of Qdrant service.
|
||||
port: Port of the REST API interface. Default: 6333
|
||||
grpc_port: Port of the gRPC interface. Default: 6334
|
||||
prefer_grpc: If `true` - use gPRC interface whenever possible in custom methods.
|
||||
https: If `true` - use HTTPS(SSL) protocol. Default: `false`
|
||||
api_key: API key for authentication in Qdrant Cloud. Default: `None`
|
||||
prefix:
|
||||
If not `None` - add `prefix` to the REST URL path.
|
||||
Example: `service/v1` will result in
|
||||
`http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.
|
||||
Default: `None`
|
||||
timeout:
|
||||
Timeout for REST and gRPC API requests.
|
||||
Default: 5.0 seconds for REST and unlimited for gRPC
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host: str,
|
||||
port: int = 6333,
|
||||
grpc_port: int = 6334,
|
||||
prefer_grpc: bool = False,
|
||||
https: Optional[bool] = None,
|
||||
api_key: Optional[str] = None,
|
||||
prefix: Optional[str] = None,
|
||||
timeout: Optional[float] = None,
|
||||
verbose: bool = False,
|
||||
):
|
||||
"""Initialize with parameters."""
|
||||
super().__init__(verbose)
|
||||
|
||||
import_err_msg = (
|
||||
"`qdrant-client` package not found, please run `pip install qdrant-client`"
|
||||
)
|
||||
try:
|
||||
import qdrant_client # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(import_err_msg)
|
||||
|
||||
self._client = qdrant_client.QdrantClient(
|
||||
host=host,
|
||||
port=port,
|
||||
grpc_port=grpc_port,
|
||||
prefer_grpc=prefer_grpc,
|
||||
https=https,
|
||||
api_key=api_key,
|
||||
prefix=prefix,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
collection_name: str,
|
||||
query_vector: List[float],
|
||||
limit: int = 10,
|
||||
) -> List[Document]:
|
||||
"""Load data from Qdrant.
|
||||
|
||||
Args:
|
||||
collection_name (str): Name of the Qdrant collection.
|
||||
query_vector (List[float]): Query vector.
|
||||
limit (int): Number of results to return.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of documents.
|
||||
"""
|
||||
from qdrant_client.http.models.models import Payload
|
||||
|
||||
response = self._client.search(
|
||||
collection_name=collection_name,
|
||||
query_vector=query_vector,
|
||||
with_vectors=True,
|
||||
with_payload=True,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
documents = []
|
||||
for point in response:
|
||||
payload = cast(Payload, point)
|
||||
try:
|
||||
vector = cast(List[float], point.vector)
|
||||
except ValueError as e:
|
||||
raise ValueError("Could not cast vector to List[float].") from e
|
||||
document = Document(
|
||||
doc_id=payload.get("doc_id"),
|
||||
text=payload.get("text"),
|
||||
embedding=vector,
|
||||
)
|
||||
documents.append(document)
|
||||
|
||||
return documents
|
||||
0
loader_hub/slack/README.md
Normal file
0
loader_hub/slack/README.md
Normal file
1
loader_hub/slack/__init__.py
Normal file
1
loader_hub/slack/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
143
loader_hub/slack/base.py
Normal file
143
loader_hub/slack/base.py
Normal file
@ -0,0 +1,143 @@
|
||||
"""Slack reader."""
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import List, Optional
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SlackReader(BaseReader):
|
||||
"""Slack reader.
|
||||
|
||||
Reads conversations from channels.
|
||||
|
||||
Args:
|
||||
slack_token (Optional[str]): Slack token. If not provided, we
|
||||
assume the environment variable `SLACK_BOT_TOKEN` is set.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, slack_token: Optional[str] = None) -> None:
|
||||
"""Initialize with parameters."""
|
||||
try:
|
||||
from slack_sdk import WebClient
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`slack_sdk` package not found, please run `pip install slack_sdk`"
|
||||
)
|
||||
if slack_token is None:
|
||||
slack_token = os.environ["SLACK_BOT_TOKEN"]
|
||||
if slack_token is None:
|
||||
raise ValueError(
|
||||
"Must specify `slack_token` or set environment "
|
||||
"variable `SLACK_BOT_TOKEN`."
|
||||
)
|
||||
self.client = WebClient(token=slack_token)
|
||||
res = self.client.api_test()
|
||||
if not res["ok"]:
|
||||
raise ValueError(f"Error initializing Slack API: {res['error']}")
|
||||
|
||||
def _read_message(self, channel_id: str, message_ts: str) -> str:
|
||||
from slack_sdk.errors import SlackApiError
|
||||
|
||||
"""Read a message."""
|
||||
|
||||
messages_text = []
|
||||
next_cursor = None
|
||||
while True:
|
||||
try:
|
||||
# https://slack.com/api/conversations.replies
|
||||
# List all replies to a message, including the message itself.
|
||||
result = self.client.conversations_replies(
|
||||
channel=channel_id, ts=message_ts, cursor=next_cursor
|
||||
)
|
||||
messages = result["messages"]
|
||||
for message in messages:
|
||||
messages_text.append(message["text"])
|
||||
|
||||
if not result["has_more"]:
|
||||
break
|
||||
|
||||
next_cursor = result["response_metadata"]["next_cursor"]
|
||||
except SlackApiError as e:
|
||||
if e.response["error"] == "ratelimited":
|
||||
logger.error(
|
||||
"Rate limit error reached, sleeping for: {} seconds".format(
|
||||
e.response.headers["retry-after"]
|
||||
)
|
||||
)
|
||||
time.sleep(int(e.response.headers["retry-after"]))
|
||||
else:
|
||||
logger.error("Error parsing conversation replies: {}".format(e))
|
||||
|
||||
return "\n\n".join(messages_text)
|
||||
|
||||
def _read_channel(self, channel_id: str) -> str:
|
||||
from slack_sdk.errors import SlackApiError
|
||||
|
||||
"""Read a channel."""
|
||||
|
||||
result_messages = []
|
||||
next_cursor = None
|
||||
while True:
|
||||
try:
|
||||
# Call the conversations.history method using the WebClient
|
||||
# conversations.history returns the first 100 messages by default
|
||||
# These results are paginated,
|
||||
# see: https://api.slack.com/methods/conversations.history$pagination
|
||||
result = self.client.conversations_history(
|
||||
channel=channel_id, cursor=next_cursor
|
||||
)
|
||||
conversation_history = result["messages"]
|
||||
# Print results
|
||||
logger.info(
|
||||
"{} messages found in {}".format(len(conversation_history), id)
|
||||
)
|
||||
for message in conversation_history:
|
||||
result_messages.append(
|
||||
self._read_message(channel_id, message["ts"])
|
||||
)
|
||||
|
||||
if not result["has_more"]:
|
||||
break
|
||||
next_cursor = result["response_metadata"]["next_cursor"]
|
||||
|
||||
except SlackApiError as e:
|
||||
if e.response["error"] == "ratelimited":
|
||||
logger.error(
|
||||
"Rate limit error reached, sleeping for: {} seconds".format(
|
||||
e.response.headers["retry-after"]
|
||||
)
|
||||
)
|
||||
time.sleep(int(e.response.headers["retry-after"]))
|
||||
else:
|
||||
logger.error("Error parsing conversation replies: {}".format(e))
|
||||
|
||||
return "\n\n".join(result_messages)
|
||||
|
||||
def load_data(self, channel_ids: List[str]) -> List[Document]:
|
||||
"""Load data from the input directory.
|
||||
|
||||
Args:
|
||||
channel_ids (List[str]): List of channel ids to read.
|
||||
|
||||
Returns:
|
||||
List[Document]: List of documents.
|
||||
|
||||
"""
|
||||
results = []
|
||||
for channel_id in channel_ids:
|
||||
channel_content = self._read_channel(channel_id)
|
||||
results.append(
|
||||
Document(channel_content, extra_info={"channel": channel_id})
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
reader = SlackReader()
|
||||
print(reader.load_data(channel_ids=["C04DC2VUY3F"]))
|
||||
0
loader_hub/string_iterable/README.md
Normal file
0
loader_hub/string_iterable/README.md
Normal file
1
loader_hub/string_iterable/__init__.py
Normal file
1
loader_hub/string_iterable/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
32
loader_hub/string_iterable/base.py
Normal file
32
loader_hub/string_iterable/base.py
Normal file
@ -0,0 +1,32 @@
|
||||
"""Simple reader that turns an iterable of strings into a list of Documents."""
|
||||
from typing import List
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class StringIterableReader(BaseReader):
|
||||
"""String Iterable Reader.
|
||||
|
||||
Gets a list of documents, given an iterable (e.g. list) of strings.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from gpt_index import StringIterableReader, GPTTreeIndex
|
||||
|
||||
documents = StringIterableReader().load_data(
|
||||
texts=["I went to the store", "I bought an apple"])
|
||||
index = GPTTreeIndex(documents)
|
||||
index.query("what did I buy?")
|
||||
|
||||
# response should be something like "You bought an apple."
|
||||
"""
|
||||
|
||||
def load_data(self, texts: List[str]) -> List[Document]:
|
||||
"""Load the data."""
|
||||
results = []
|
||||
for text in texts:
|
||||
results.append(Document(text))
|
||||
|
||||
return results
|
||||
0
loader_hub/twitter/README.md
Normal file
0
loader_hub/twitter/README.md
Normal file
1
loader_hub/twitter/__init__.py
Normal file
1
loader_hub/twitter/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
60
loader_hub/twitter/base.py
Normal file
60
loader_hub/twitter/base.py
Normal file
@ -0,0 +1,60 @@
|
||||
"""Simple reader that reads tweets of a twitter handle."""
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class TwitterTweetReader(BaseReader):
|
||||
"""Twitter tweets reader.
|
||||
|
||||
Read tweets of user twitter handle.
|
||||
|
||||
Check 'https://developer.twitter.com/en/docs/twitter-api/\
|
||||
getting-started/getting-access-to-the-twitter-api' \
|
||||
on how to get access to twitter API.
|
||||
|
||||
Args:
|
||||
bearer_token (str): bearer_token that you get from twitter API.
|
||||
num_tweets (Optional[int]): Number of tweets for each user twitter handle.\
|
||||
Default is 100 tweets.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bearer_token: str,
|
||||
num_tweets: Optional[int] = 100,
|
||||
verbose: bool = False,
|
||||
) -> None:
|
||||
"""Initialize with parameters."""
|
||||
super().__init__(verbose=verbose)
|
||||
self.bearer_token = bearer_token
|
||||
self.num_tweets = num_tweets
|
||||
|
||||
def load_data(
|
||||
self, twitterhandles: List[str], **load_kwargs: Any
|
||||
) -> List[Document]:
|
||||
"""Load tweets of twitter handles.
|
||||
|
||||
Args:
|
||||
twitterhandles (List[str]): List of user twitter handles to read tweets.
|
||||
|
||||
"""
|
||||
try:
|
||||
import tweepy
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`tweepy` package not found, please run `pip install tweepy`"
|
||||
)
|
||||
|
||||
client = tweepy.Client(bearer_token=self.bearer_token)
|
||||
results = []
|
||||
for username in twitterhandles:
|
||||
# tweets = api.user_timeline(screen_name=user, count=self.num_tweets)
|
||||
user = client.get_user(username=username)
|
||||
tweets = client.get_users_tweets(user.data.id, max_results=self.num_tweets)
|
||||
response = " "
|
||||
for tweet in tweets.data:
|
||||
response = response + tweet.text + "\n"
|
||||
results.append(Document(response))
|
||||
return results
|
||||
0
loader_hub/weaviate/README.md
Normal file
0
loader_hub/weaviate/README.md
Normal file
1
loader_hub/weaviate/__init__.py
Normal file
1
loader_hub/weaviate/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
116
loader_hub/weaviate/base.py
Normal file
116
loader_hub/weaviate/base.py
Normal file
@ -0,0 +1,116 @@
|
||||
"""Weaviate reader."""
|
||||
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class WeaviateReader(BaseReader):
|
||||
"""Weaviate reader.
|
||||
|
||||
Retrieves documents from Weaviate through vector lookup. Allows option
|
||||
to concatenate retrieved documents into one Document, or to return
|
||||
separate Document objects per document.
|
||||
|
||||
Args:
|
||||
host (str): host.
|
||||
auth_client_secret (Optional[weaviate.auth.AuthCredentials]):
|
||||
auth_client_secret.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host: str,
|
||||
auth_client_secret: Optional[Any] = None,
|
||||
) -> None:
|
||||
"""Initialize with parameters."""
|
||||
try:
|
||||
import weaviate # noqa: F401
|
||||
from weaviate import Client # noqa: F401
|
||||
from weaviate.auth import AuthCredentials # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`weaviate` package not found, please run `pip install weaviate-client`"
|
||||
)
|
||||
|
||||
self.client: Client = Client(host, auth_client_secret=auth_client_secret)
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
class_name: Optional[str] = None,
|
||||
properties: Optional[List[str]] = None,
|
||||
graphql_query: Optional[str] = None,
|
||||
separate_documents: Optional[bool] = True,
|
||||
) -> List[Document]:
|
||||
"""Load data from Weaviate.
|
||||
|
||||
If `graphql_query` is not found in load_kwargs, we assume that
|
||||
`class_name` and `properties` are provided.
|
||||
|
||||
Args:
|
||||
class_name (Optional[str]): class_name to retrieve documents from.
|
||||
properties (Optional[List[str]]): properties to retrieve from documents.
|
||||
graphql_query (Optional[str]): Raw GraphQL Query.
|
||||
We assume that the query is a Get query.
|
||||
separate_documents (Optional[bool]): Whether to return separate
|
||||
documents. Defaults to True.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of documents.
|
||||
|
||||
"""
|
||||
if class_name is not None and properties is not None:
|
||||
props_txt = "\n".join(properties)
|
||||
graphql_query = f"""
|
||||
{{
|
||||
Get {{
|
||||
{class_name} {{
|
||||
{props_txt}
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
"""
|
||||
elif graphql_query is not None:
|
||||
pass
|
||||
else:
|
||||
raise ValueError(
|
||||
"Either `class_name` and `properties` must be specified, "
|
||||
"or `graphql_query` must be specified."
|
||||
)
|
||||
|
||||
response = self.client.query.raw(graphql_query)
|
||||
if "errors" in response:
|
||||
raise ValueError("Invalid query, got errors: {}".format(response["errors"]))
|
||||
|
||||
data_response = response["data"]
|
||||
if "Get" not in data_response:
|
||||
raise ValueError("Invalid query response, must be a Get query.")
|
||||
|
||||
if class_name is None:
|
||||
# infer class_name if only graphql_query was provided
|
||||
class_name = list(data_response["Get"].keys())[0]
|
||||
entries = data_response["Get"][class_name]
|
||||
documents = []
|
||||
for entry in entries:
|
||||
embedding = None
|
||||
# for each entry, join properties into <property>:<value>
|
||||
# separated by newlines
|
||||
text_list = []
|
||||
for k, v in entry.items():
|
||||
if k == "_additional":
|
||||
if "vector" in v:
|
||||
embedding = v["vector"]
|
||||
continue
|
||||
text_list.append(f"{k}: {v}")
|
||||
|
||||
text = "\n".join(text_list)
|
||||
documents.append(Document(text=text, embedding=embedding))
|
||||
|
||||
if not separate_documents:
|
||||
# join all documents into one
|
||||
text_list = [doc.get_text() for doc in documents]
|
||||
text = "\n\n".join(text_list)
|
||||
documents = [Document(text=text)]
|
||||
|
||||
return documents
|
||||
0
loader_hub/wikipedia/README.md
Normal file
0
loader_hub/wikipedia/README.md
Normal file
1
loader_hub/wikipedia/__init__.py
Normal file
1
loader_hub/wikipedia/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
37
loader_hub/wikipedia/base.py
Normal file
37
loader_hub/wikipedia/base.py
Normal file
@ -0,0 +1,37 @@
|
||||
"""Simple reader that reads wikipedia."""
|
||||
from typing import Any, List
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class WikipediaReader(BaseReader):
|
||||
"""Wikipedia reader.
|
||||
|
||||
Reads a page.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize with parameters."""
|
||||
try:
|
||||
import wikipedia # noqa: F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`wikipedia` package not found, please run `pip install wikipedia`"
|
||||
)
|
||||
|
||||
def load_data(self, pages: List[str], **load_kwargs: Any) -> List[Document]:
|
||||
"""Load data from the input directory.
|
||||
|
||||
Args:
|
||||
pages (List[str]): List of pages to read.
|
||||
|
||||
"""
|
||||
import wikipedia
|
||||
|
||||
results = []
|
||||
for page in pages:
|
||||
page_content = wikipedia.page(page, **load_kwargs).content
|
||||
results.append(Document(page_content))
|
||||
return results
|
||||
0
loader_hub/youtube_transcript/README.md
Normal file
0
loader_hub/youtube_transcript/README.md
Normal file
1
loader_hub/youtube_transcript/__init__.py
Normal file
1
loader_hub/youtube_transcript/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
38
loader_hub/youtube_transcript/base.py
Normal file
38
loader_hub/youtube_transcript/base.py
Normal file
@ -0,0 +1,38 @@
|
||||
"""Simple Reader that reads transcript of youtube video."""
|
||||
from typing import Any, List
|
||||
|
||||
from gpt_index.readers.base import BaseReader
|
||||
from gpt_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class YoutubeTranscriptReader(BaseReader):
|
||||
"""Youtube Transcript reader."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize with parameters."""
|
||||
|
||||
def load_data(self, ytlinks: List[str], **load_kwargs: Any) -> List[Document]:
|
||||
"""Load data from the input directory.
|
||||
|
||||
Args:
|
||||
pages (List[str]): List of youtube links \
|
||||
for which transcripts are to be read.
|
||||
|
||||
"""
|
||||
try:
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"`youtube_transcript_api` package not found, \
|
||||
please run `pip install youtube-transcript-api`"
|
||||
)
|
||||
|
||||
results = []
|
||||
for link in ytlinks:
|
||||
video_id = link.split("?v=")[-1]
|
||||
srt = YouTubeTranscriptApi.get_transcript(video_id)
|
||||
transcript = ""
|
||||
for chunk in srt:
|
||||
transcript = transcript + chunk["text"] + "\n"
|
||||
results.append(Document(transcript))
|
||||
return results
|
||||
Loading…
x
Reference in New Issue
Block a user