mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-12-27 06:59:06 +00:00
cr (#93)
This commit is contained in:
parent
aff12c3b3c
commit
87dd8721db
@ -312,5 +312,10 @@
|
||||
"id": "wordpress",
|
||||
"author": "bbornsztein",
|
||||
"keywords": ["wordpress", "blog"]
|
||||
},
|
||||
"SteamshipFileReader": {
|
||||
"id": "steamship",
|
||||
"author": "douglas-reid",
|
||||
"keywords": ["steamship"]
|
||||
}
|
||||
}
|
||||
24
loader_hub/steamship/README.md
Normal file
24
loader_hub/steamship/README.md
Normal file
@ -0,0 +1,24 @@
|
||||
# Steamship Loader
|
||||
|
||||
This loader loads persistent Steamship files and converts them to a Document object. Requires an active Steamship API key.
|
||||
|
||||
## Usage
|
||||
|
||||
To use this loader, you need to pass in your API key during initialization.
|
||||
|
||||
You may then specify a `query` and/or a `file_handles` to fetch files.
|
||||
|
||||
```python
|
||||
from llama_index import download_loader
|
||||
|
||||
SteamshipFileReader = download_loader("SteamshipFileReader")
|
||||
|
||||
loader = SteamshipFileReader(api_key="<api_key>")
|
||||
documents = loader.load_data(
|
||||
"<workspace>",
|
||||
query="filetag and value(\"import-id\")=\"import-001\"",
|
||||
file_handles=["smooth-valley-9kbdr"]
|
||||
)
|
||||
```
|
||||
|
||||
This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
|
||||
0
loader_hub/steamship/__init__.py
Normal file
0
loader_hub/steamship/__init__.py
Normal file
92
loader_hub/steamship/base.py
Normal file
92
loader_hub/steamship/base.py
Normal file
@ -0,0 +1,92 @@
|
||||
"""Load Documents from a set of persistent Steamship Files."""
|
||||
from typing import List, Optional
|
||||
|
||||
from llama_index.readers.base import BaseReader
|
||||
from llama_index.readers.schema.base import Document
|
||||
|
||||
|
||||
class SteamshipFileReader(BaseReader):
|
||||
"""Reads persistent Steamship Files and converts them to Documents.
|
||||
|
||||
Args:
|
||||
api_key: Steamship API key. Defaults to STEAMSHIP_API_KEY value if not provided.
|
||||
|
||||
Note:
|
||||
Requires install of `steamship` package and an active Steamship API Key.
|
||||
To get a Steamship API Key, visit: https://steamship.com/account/api.
|
||||
Once you have an API Key, expose it via an environment variable named
|
||||
`STEAMSHIP_API_KEY` or pass it as an init argument (`api_key`).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None) -> None:
|
||||
"""Initialize the Reader."""
|
||||
try:
|
||||
import steamship # noqa: F401
|
||||
|
||||
self.api_key = api_key
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"`steamship` must be installed to use the SteamshipFileReader.\n"
|
||||
"Please run `pip install --upgrade steamship."
|
||||
)
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
workspace: str,
|
||||
query: Optional[str] = None,
|
||||
file_handles: Optional[List[str]] = None,
|
||||
collapse_blocks: bool = True,
|
||||
join_str: str = "\n\n",
|
||||
) -> List[Document]:
|
||||
"""Load data from persistent Steamship Files into Documents.
|
||||
|
||||
Args:
|
||||
workspace: the handle for a Steamship workspace
|
||||
(see: https://docs.steamship.com/workspaces/index.html)
|
||||
query: a Steamship tag query for retrieving files
|
||||
(ex: 'filetag and value("import-id")="import-001"')
|
||||
file_handles: a list of Steamship File handles
|
||||
(ex: `smooth-valley-9kbdr`)
|
||||
collapse_blocks: whether to merge individual File Blocks into a
|
||||
single Document, or separate them.
|
||||
join_str: when collapse_blocks is True, this is how the block texts
|
||||
will be concatenated.
|
||||
|
||||
Note:
|
||||
The collection of Files from both `query` and `file_handles` will be
|
||||
combined. There is no (current) support for deconflicting the collections
|
||||
(meaning that if a file appears both in the result set of the query and
|
||||
as a handle in file_handles, it will be loaded twice).
|
||||
"""
|
||||
from steamship import File, Steamship
|
||||
|
||||
client = Steamship(workspace=workspace, api_key=self.api_key)
|
||||
files = []
|
||||
if query:
|
||||
files_from_query = File.query(client=client, tag_filter_query=query).files
|
||||
files.extend(files_from_query)
|
||||
|
||||
if file_handles:
|
||||
files.extend([File.get(client=client, handle=h) for h in file_handles])
|
||||
|
||||
docs = []
|
||||
for file in files:
|
||||
extra_info = {"source": file.handle}
|
||||
|
||||
for tag in file.tags:
|
||||
extra_info[tag.kind] = tag.value
|
||||
|
||||
if collapse_blocks:
|
||||
text = join_str.join([b.text for b in file.blocks])
|
||||
docs.append(
|
||||
Document(text=text, doc_id=file.handle, extra_info=extra_info)
|
||||
)
|
||||
else:
|
||||
docs.extend(
|
||||
[
|
||||
Document(text=b.text, doc_id=file.handle, extra_info=extra_info)
|
||||
for b in file.blocks
|
||||
]
|
||||
)
|
||||
|
||||
return docs
|
||||
1
loader_hub/steamship/requirements.txt
Normal file
1
loader_hub/steamship/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
steamship
|
||||
Loading…
x
Reference in New Issue
Block a user