cr (#93)

2025-12-27 06:59:06 +00:00 · 2023-03-09 12:42:09 -08:00 · 2023-03-09 12:42:09 -08:00 · 87dd8721db
commit 87dd8721db
parent aff12c3b3c
5 changed files with 122 additions and 0 deletions
--- a/loader_hub/library.json
+++ b/loader_hub/library.json
@@ -312,5 +312,10 @@
    "id": "wordpress",
    "author": "bbornsztein",
    "keywords": ["wordpress", "blog"]
+  },
+  "SteamshipFileReader": {
+    "id": "steamship",
+    "author": "douglas-reid",
+    "keywords": ["steamship"]
  }
 }
--- a/loader_hub/steamship/README.md
+++ b/loader_hub/steamship/README.md
@@ -0,0 +1,24 @@
+# Steamship Loader
+
+This loader loads persistent Steamship files and converts them to Document objects. It requires an active Steamship API key.
+
+## Usage
+
+To use this loader, pass in your API key during initialization (`api_key`), or expose it via an environment variable named `STEAMSHIP_API_KEY`.
+
+You may then specify a `query` and/or a list of `file_handles` to fetch files.
+
+```python
+from llama_index import download_loader
+
+SteamshipFileReader = download_loader("SteamshipFileReader")
+
+loader = SteamshipFileReader(api_key="<api_key>")
+documents = loader.load_data(
+    "<workspace>", 
+    query="filetag and value(\"import-id\")=\"import-001\"", 
+    file_handles=["smooth-valley-9kbdr"]
+)
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
--- a/loader_hub/steamship/__init__.py
+++ b/loader_hub/steamship/__init__.py
--- a/loader_hub/steamship/base.py
+++ b/loader_hub/steamship/base.py
@@ -0,0 +1,92 @@
+"""Load Documents from a set of persistent Steamship Files."""
+from typing import List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+
+class SteamshipFileReader(BaseReader):
+    """Reads persistent Steamship Files and converts them to Documents.
+
+    Args:
+        api_key: Steamship API key. Defaults to STEAMSHIP_API_KEY value if not provided.
+
+    Note:
+        Requires install of `steamship` package and an active Steamship API Key.
+        To get a Steamship API Key, visit: https://steamship.com/account/api.
+        Once you have an API Key, expose it via an environment variable named
+        `STEAMSHIP_API_KEY` or pass it as an init argument (`api_key`).
+    """
+
+    def __init__(self, api_key: Optional[str] = None) -> None:
+        """Initialize the Reader; raises ImportError if `steamship` is missing."""
+        try:
+            import steamship  # noqa: F401
+        except ImportError as err:
+            raise ImportError(
+                "`steamship` must be installed to use the SteamshipFileReader.\n"
+                "Please run `pip install --upgrade steamship`."
+            ) from err
+        # Stored outside the `try` so that block guards only the import itself.
+        self.api_key = api_key
+
+    def load_data(
+        self,
+        workspace: str,
+        query: Optional[str] = None,
+        file_handles: Optional[List[str]] = None,
+        collapse_blocks: bool = True,
+        join_str: str = "\n\n",
+    ) -> List[Document]:
+        """Load data from persistent Steamship Files into Documents.
+
+        Args:
+            workspace: the handle for a Steamship workspace
+                (see: https://docs.steamship.com/workspaces/index.html)
+            query: a Steamship tag query for retrieving files
+                (ex: 'filetag and value("import-id")="import-001"')
+            file_handles: a list of Steamship File handles
+                (ex: `smooth-valley-9kbdr`)
+            collapse_blocks: whether to merge individual File Blocks into a
+                single Document, or separate them.
+            join_str: when collapse_blocks is True, this is how the block texts
+                will be concatenated.
+
+        Returns:
+            One Document per File (collapsed) or per text-bearing Block; each
+            carries the File handle as `doc_id` and as `extra_info["source"]`,
+            plus `extra_info[tag.kind] = tag.value` for each File tag.
+
+        Note:
+            The collection of Files from both `query` and `file_handles` will be
+            combined. There is no (current) support for deconflicting the collections
+            (meaning that if a file appears both in the result set of the query and
+            as a handle in file_handles, it will be loaded twice).
+            Blocks whose `text` is None (presumably non-text content) are skipped,
+            since they cannot be joined or wrapped in a text Document.
+        """
+        from steamship import File, Steamship
+
+        client = Steamship(workspace=workspace, api_key=self.api_key)
+        # Query results and explicit handles are concatenated as-is (no de-dup).
+        files = []
+        if query:
+            files.extend(File.query(client=client, tag_filter_query=query).files)
+        if file_handles:
+            files.extend(File.get(client=client, handle=h) for h in file_handles)
+
+        docs = []
+        for file in files:
+            extra_info = {"source": file.handle}
+            extra_info.update((tag.kind, tag.value) for tag in file.tags)
+            # A None text would break `join` and yield an unreadable Document.
+            texts = [b.text for b in file.blocks if b.text is not None]
+            if collapse_blocks:
+                texts = [join_str.join(texts)]
+            # Copy the metadata so Documents never share one mutable dict.
+            docs.extend(
+                Document(text=t, doc_id=file.handle, extra_info=dict(extra_info))
+                for t in texts
+            )
+
+        return docs
--- a/loader_hub/steamship/requirements.txt
+++ b/loader_hub/steamship/requirements.txt
@@ -0,0 +1 @@
+steamship