mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 03:31:41 +00:00
87 lines
2.7 KiB
Python
87 lines
2.7 KiB
Python
"""Opendal file and directory reader.
|
|
|
|
A loader that fetches a single file, or iterates through a directory, on any storage service reachable through an OpenDAL operator (for example AWS S3 or another compatible service).
|
|
|
|
"""
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Union, cast
|
|
import asyncio
|
|
|
|
from llama_index import download_loader
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class OpendalReader(BaseReader):
    """Reader that loads documents through an arbitrary OpenDAL operator."""

    def __init__(
        self,
        scheme: str,
        path: str = "/",
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        **kwargs,
    ) -> None:
        """Create the OpenDAL operator and remember what should be loaded.

        Args:
            scheme (str): the scheme of the service, handed to the operator.
            path (str): the path of the data. A path ending with `/` is
                treated as a directory and everything found under it is
                downloaded; any other path is downloaded as a single file.
                Defaults to `/`.
            file_extractor (Optional[Dict[str, Union[str, BaseReader]]]): A
                mapping of file extension to a BaseReader class that specifies
                how to convert that file to text. See `SimpleDirectoryReader`
                for more details.
            **kwargs: forwarded unchanged to `opendal.AsyncOperator`
                (service options, credentials if needed).
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require opendal.
        import opendal

        super().__init__()

        self.path, self.file_extractor = path, file_extractor
        self.op = opendal.AsyncOperator(scheme, **kwargs)

    def load_data(self) -> List[Document]:
        """Download the configured path to a scratch directory and parse it.

        Returns:
            List[Document]: whatever `SimpleDirectoryReader` produces for the
            downloaded files.
        """
        with tempfile.TemporaryDirectory() as workdir:
            # A trailing slash selects the directory download.
            fetch = (
                download_dir_from_opendal
                if self.path.endswith("/")
                else download_file_from_opendal
            )
            asyncio.run(fetch(self.op, workdir, self.path))

            reader_cls = download_loader("SimpleDirectoryReader")
            directory_reader = reader_cls(
                workdir, file_extractor=self.file_extractor
            )

            # Parse before leaving the `with`, while the files still exist.
            return directory_reader.load_data()
|
|
|
|
|
|
async def download_file_from_opendal(
    op: Any, temp_dir: str, path: str
) -> str:
    """Download a single file from OpenDAL into ``temp_dir``.

    The local file gets a unique random name; only the suffix of ``path`` is
    kept, so a consumer that maps file extensions to readers can still
    recognise the file type.

    Args:
        op: the operator to read from; used as an ``opendal.AsyncOperator``.
        temp_dir: existing local directory that receives the file.
        path: path of the remote file to download.

    Returns:
        The path of the local file that was written.
    """
    import opendal

    op = cast(opendal.AsyncOperator, op)

    suffix = Path(path).suffix

    # Finish the remote read before creating the local file, so a failed
    # download does not leave an empty file behind in temp_dir.
    async with op.open_reader(path) as r:
        data = await r.read()

    # NamedTemporaryFile is the public API for creating a uniquely named file
    # (the private tempfile._get_candidate_names() gives no such guarantee).
    # delete=False keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(
        mode="wb", suffix=suffix, dir=temp_dir, delete=False
    ) as w:
        w.write(data)
        return w.name
|
|
|
|
|
|
async def download_dir_from_opendal(
    op: Any, temp_dir: str, dir: str
) -> str:
    """Download every file found under ``dir`` from OpenDAL into ``temp_dir``.

    Args:
        op: the operator to read from; used as an ``opendal.AsyncOperator``.
        temp_dir: existing local directory that receives the files.
        dir: remote directory whose entries are scanned and downloaded.

    Returns:
        ``temp_dir``, the local directory now holding the downloaded files.
    """
    import opendal

    op = cast(opendal.AsyncOperator, op)

    async for obj in await op.scan(dir):
        # A path ending with "/" denotes a directory, not a file, so there
        # is nothing to download for that entry itself.
        if obj.path.endswith("/"):
            continue
        await download_file_from_opendal(op, temp_dir, obj.path)

    # Honour the declared ``-> str`` return type (previously fell through
    # and returned None).
    return temp_dir
|