diff --git a/loader_hub/database/README.md b/loader_hub/database/README.md
index 6de8deb0..51679fa5 100644
--- a/loader_hub/database/README.md
+++ b/loader_hub/database/README.md
@@ -29,3 +29,5 @@ WHERE age >= 18
 
 documents = reader.load_data(query=query)
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/discord/README.md b/loader_hub/discord/README.md
index 92430e19..e3f7d9f5 100644
--- a/loader_hub/discord/README.md
+++ b/loader_hub/discord/README.md
@@ -18,3 +18,5 @@ channel_ids = [1057178784895348746] # Replace with your channel_id
 reader = DiscordReader(discord_token=discord_token)
 documents = reader.load_data(channel_ids=channel_ids)
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/faiss/README.md b/loader_hub/faiss/README.md
index 0d22d45d..7ec91a92 100644
--- a/loader_hub/faiss/README.md
+++ b/loader_hub/faiss/README.md
@@ -23,7 +23,7 @@ index.add(...)
 # initalize reader
 reader = FaissReader(index)
 
-# To load data from the Faiss index, you must specify: 
+# To load data from the Faiss index, you must specify:
 # k: top nearest neighbors
 # query: a 2D embedding representation of your queries (rows are queries)
 k = 4
@@ -33,3 +33,5 @@ query=np.array([query1, query2])
 
 documents = reader.load_data(query=query, id_to_text_map=id_to_text_map, k=k)
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/README.md b/loader_hub/file/README.md
index 3398c17f..7f2398c7 100644
--- a/loader_hub/file/README.md
+++ b/loader_hub/file/README.md
@@ -1,15 +1,17 @@
 # File Loader
 
-This loader takes in a local directory containing files and extracts `Document`s from each of the files.
+This loader takes in a local directory containing files and extracts `Document`s from each of the files. By default, the loader will utilize the specialized loaders in this library to parse common file extensions (e.g. .pdf, .png, .docx, etc). You can optionally pass in your own custom loaders. Note: if no loader is found for a file extension, and the file extension is not in the list to skip, the file will be read directly.
 
 ## Usage
 
 To use this loader, you simply need to instantiate the `SimpleDirectoryReader` class with a directory, along with other optional settings, such as whether to ignore hidden files. See the code for the complete list.
 
 ```python
-from loader_hub import SimpleDirectoryReader
+from gpt_index import download_loader
 
-loader = SimpleDirectoryReader('data', recursive=True, exclude_hidden=True)
+SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+
+loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)
 documents = loader.load_data()
 ```
 
@@ -20,10 +22,11 @@ This loader is designed to be used as a way to load data into [GPT Index](https:
 ### GPT Index
 
 ```python
-from loader_hub import SimpleDirectoryReader
-from gpt_index import GPTSimpleVectorIndex
+from gpt_index import GPTSimpleVectorIndex, download_loader
 
-loader = SimpleDirectoryReader('data', recursive=True, exclude_hidden=True)
+SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+
+loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)
 documents = loader.load_data()
 index = GPTSimpleVectorIndex(documents)
 index.query('What are these files about?')
@@ -34,13 +37,14 @@ index.query('What are these files about?')
 ### LangChain
 
 Note: Make sure you change the description of the `Tool` to match your use-case.
 
 ```python
-from loader_hub import SimpleDirectoryReader
-from gpt_index import GPTSimpleVectorIndex
+from gpt_index import GPTSimpleVectorIndex, download_loader
 from langchain.agents import initialize_agent, Tool
 from langchain.llms import OpenAI
 from langchain.chains.conversation.memory import ConversationBufferMemory
 
-loader = SimpleDirectoryReader('data', recursive=True, exclude_hidden=True)
+SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+
+loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)
 documents = loader.load_data()
 index = GPTSimpleVectorIndex(documents)
diff --git a/loader_hub/file/audio/README.md b/loader_hub/file/audio/README.md
index e69de29b..b9ca47bb 100644
--- a/loader_hub/file/audio/README.md
+++ b/loader_hub/file/audio/README.md
@@ -0,0 +1,19 @@
+# Audio File Loader
+
+This loader uses OpenAI's Whisper model to transcribe the text of an audio file or the audio track of a video file. The file formats .mp3 and .mp4 are preferred. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+AudioTranscriber = download_loader("AudioTranscriber")
+
+loader = AudioTranscriber()
+documents = loader.load_data(file=Path('./podcast.mp3'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/docx/README.md b/loader_hub/file/docx/README.md
index e69de29b..06bbeba2 100644
--- a/loader_hub/file/docx/README.md
+++ b/loader_hub/file/docx/README.md
@@ -0,0 +1,19 @@
+# Microsoft Word Loader
+
+This loader extracts the text from a local Microsoft Word (.docx) file. Non-text items in the document are ignored. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+DocxReader = download_loader("DocxReader")
+
+loader = DocxReader()
+documents = loader.load_data(file=Path('./homework.docx'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/epub/README.md b/loader_hub/file/epub/README.md
index e69de29b..9c882f71 100644
--- a/loader_hub/file/epub/README.md
+++ b/loader_hub/file/epub/README.md
@@ -0,0 +1,19 @@
+# Epub Loader
+
+This loader extracts the text from a local Epub file. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+EpubReader = download_loader("EpubReader")
+
+loader = EpubReader()
+documents = loader.load_data(file=Path('./book.epub'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/image/README.md b/loader_hub/file/image/README.md
index e69de29b..a2d3cb64 100644
--- a/loader_hub/file/image/README.md
+++ b/loader_hub/file/image/README.md
@@ -0,0 +1,19 @@
+# Image Loader
+
+This loader extracts the text from an image that has text in it (e.g. a receipt). The [Donut](https://huggingface.co/docs/transformers/model_doc/donut) transformer model is used. The file extensions .png, .jpg, and .jpeg are preferred. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+ImageReader = download_loader("ImageReader")
+
+loader = ImageReader()
+documents = loader.load_data(file=Path('./receipt.png'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/markdown/README.md b/loader_hub/file/markdown/README.md
index e69de29b..8d774b39 100644
--- a/loader_hub/file/markdown/README.md
+++ b/loader_hub/file/markdown/README.md
@@ -0,0 +1,19 @@
+# Markdown Loader
+
+This loader extracts the text from a local Markdown file. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+MarkdownReader = download_loader("MarkdownReader")
+
+loader = MarkdownReader()
+documents = loader.load_data(file=Path('./README.md'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/pandas_csv/README.md b/loader_hub/file/pandas_csv/README.md
index e69de29b..0f9944bc 100644
--- a/loader_hub/file/pandas_csv/README.md
+++ b/loader_hub/file/pandas_csv/README.md
@@ -0,0 +1,19 @@
+# Pandas CSV Loader
+
+This loader extracts the text from a local .csv file using the `pandas` Python package. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+PandasCSVReader = download_loader("PandasCSVReader")
+
+loader = PandasCSVReader()
+documents = loader.load_data(file=Path('./transactions.csv'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/pandas_csv/base.py b/loader_hub/file/pandas_csv/base.py
index 2f64d028..fae28d22 100644
--- a/loader_hub/file/pandas_csv/base.py
+++ b/loader_hub/file/pandas_csv/base.py
@@ -10,7 +10,7 @@
 from gpt_index.readers.base import BaseReader
 from gpt_index.readers.schema.base import Document
 
-class PandasCSVParser(BaseReader):
+class PandasCSVReader(BaseReader):
     r"""Pandas-based CSV parser.
 
     Parses CSVs using the separator detection from Pandas `read_csv`function.
diff --git a/loader_hub/file/pdf/README.md b/loader_hub/file/pdf/README.md
index e69de29b..6a1fc607 100644
--- a/loader_hub/file/pdf/README.md
+++ b/loader_hub/file/pdf/README.md
@@ -0,0 +1,19 @@
+# PDF Loader
+
+This loader extracts the text from a local PDF file using the `PyPDF2` Python package. Any non-text elements are ignored. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+PDFReader = download_loader("PDFReader")
+
+loader = PDFReader()
+documents = loader.load_data(file=Path('./article.pdf'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/pptx/README.md b/loader_hub/file/pptx/README.md
index e69de29b..a469650e 100644
--- a/loader_hub/file/pptx/README.md
+++ b/loader_hub/file/pptx/README.md
@@ -0,0 +1,19 @@
+# Microsoft PowerPoint Loader
+
+This loader extracts the text from a local Microsoft PowerPoint (.pptx) file. Image elements are automatically captioned and inserted as text into the final `Document` using the [GPT2 Image Captioning model](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning). A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+PptxReader = download_loader("PptxReader")
+
+loader = PptxReader()
+documents = loader.load_data(file=Path('./deck.pptx'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/file/simple_csv/README.md b/loader_hub/file/simple_csv/README.md
index e69de29b..17f2fdd8 100644
--- a/loader_hub/file/simple_csv/README.md
+++ b/loader_hub/file/simple_csv/README.md
@@ -0,0 +1,19 @@
+# Simple CSV Loader
+
+This loader extracts the text from a local .csv file by directly reading the file row by row. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+SimpleCSVReader = download_loader("SimpleCSVReader")
+
+loader = SimpleCSVReader()
+documents = loader.load_data(file=Path('./transactions.csv'))
+```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/make_com/README.md b/loader_hub/make_com/README.md
index 5bf5afaf..a1cbf49a 100644
--- a/loader_hub/make_com/README.md
+++ b/loader_hub/make_com/README.md
@@ -29,5 +29,6 @@ wrapper.pass_response_to_webhook(
     query_str
 )
-
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/mongo/README.md b/loader_hub/mongo/README.md
index e77ec507..ac0ef0e9 100644
--- a/loader_hub/mongo/README.md
+++ b/loader_hub/mongo/README.md
@@ -23,3 +23,5 @@ query_dict = {}
 reader = SimpleMongoReader(host, port)
 documents = reader.load_data(db_name, collection_name, query_dict=query_dict)
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/notion/README.md b/loader_hub/notion/README.md
index 23d24554..c334658c 100644
--- a/loader_hub/notion/README.md
+++ b/loader_hub/notion/README.md
@@ -20,3 +20,5 @@ reader = NotionPageReader(integration_token=integration_token)
 documents = reader.load_data(page_ids=page_ids)
 
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/obsidian/README.md b/loader_hub/obsidian/README.md
index 4d9429f2..e951a2e7 100644
--- a/loader_hub/obsidian/README.md
+++ b/loader_hub/obsidian/README.md
@@ -11,5 +11,7 @@ from gpt_index import download_loader
 import os
 
 ObsidianReader = download_loader('ObsidianReader')
-documents = ObsidianReader('/path/to/dir').load_data() # Returns list of documents 
+documents = ObsidianReader('/path/to/dir').load_data() # Returns list of documents
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/pinecone/README.md b/loader_hub/pinecone/README.md
index c10a43c9..ab755133 100644
--- a/loader_hub/pinecone/README.md
+++ b/loader_hub/pinecone/README.md
@@ -13,7 +13,7 @@ import os
 
 PineconeReader = download_loader('PineconeReader')
 
-# the id_to_text_map specifies a mapping from the ID specified in Pinecone to your text. 
+# the id_to_text_map specifies a mapping from the ID specified in Pinecone to your text.
 id_to_text_map = {
     "id1": "text blob 1",
     "id2": "text blob 2",
@@ -27,10 +27,12 @@ query_vector=[n1, n2, n3, ...]
 
 reader = PineconeReader(api_key=api_key, environment="us-west1-gcp")
 documents = reader.load_data(
-    index_name='quickstart', 
-    id_to_text_map=id_to_text_map, 
-    top_k=3, 
-    vector=query_vector, 
+    index_name='quickstart',
+    id_to_text_map=id_to_text_map,
+    top_k=3,
+    vector=query_vector,
     separate_documents=True
 )
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/qdrant/README.md b/loader_hub/qdrant/README.md
index bb09b1d6..ef66ea03 100644
--- a/loader_hub/qdrant/README.md
+++ b/loader_hub/qdrant/README.md
@@ -20,7 +20,7 @@ query_vector=[n1, n2, n3, ...]
 
 # NOTE: Required args are collection_name, query_vector.
 # See the Python client: https://github.com/qdrant/qdrant_client
-# for more details. 
+# for more details.
 documents = reader.load_data(
     collection_name="demo",
     query_vector=query_vector,
@@ -28,3 +28,5 @@ documents = reader.load_data(
 )
 ```
+
+This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/loader-hub/tree/main) for examples.
diff --git a/loader_hub/string_iterable/README.md b/loader_hub/string_iterable/README.md
index 6eeab7b6..56768055 100644
--- a/loader_hub/string_iterable/README.md
+++ b/loader_hub/string_iterable/README.md
@@ -1,4 +1,4 @@
-# Twitter Loader
+# String Iterable Loader
 
 This loader converts an iterable (e.g. list) of strings into `Document`s.