"""Opendal file and directory reader.

A loader that fetches a file or iterates through a directory on AWS S3 or
another compatible service.

"""
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast
import asyncio

from llama_index import download_loader
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class OpendalReader(BaseReader):
    """General reader for any opendal operator."""

    def __init__(
        self,
        scheme: str,
        path: str = "/",
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        **kwargs,
    ) -> None:
        """Initialize opendal operator, along with credentials if needed.

        Args:
            scheme (str): the scheme of the service.
            path (str): the path of the data. If none is provided, this
                loader will iterate through the entire bucket. If the path
                ends with `/`, this loader will iterate through the entire
                directory. Otherwise, this loader will load the single file.
            file_extractor (Optional[Dict[str, BaseReader]]): A mapping of
                file extension to a BaseReader class that specifies how to
                convert that file to text. See `SimpleDirectoryReader` for
                more details.
            **kwargs: forwarded unchanged to `opendal.AsyncOperator`
                (service configuration such as credentials).

        """
        # Imported lazily so that opendal is only required when the reader
        # is actually instantiated.
        import opendal

        super().__init__()
        self.path = path
        self.file_extractor = file_extractor
        self.op = opendal.AsyncOperator(scheme, **kwargs)

    def load_data(self) -> List[Document]:
        """Load file(s) from OpenDAL.

        The remote file (or every file under the remote directory) is first
        downloaded into a temporary directory, which is then parsed with
        `SimpleDirectoryReader`.

        Note: the download is driven with `asyncio.run`, which cannot be
        called while another event loop is running in the same thread.

        Returns:
            List[Document]: the documents produced by `SimpleDirectoryReader`.

        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # A trailing slash selects directory mode; anything else is
            # treated as a single file.
            if not self.path.endswith("/"):
                asyncio.run(download_file_from_opendal(self.op, temp_dir, self.path))
            else:
                asyncio.run(download_dir_from_opendal(self.op, temp_dir, self.path))

            # Parsing must happen inside the `with` block: the downloaded
            # files are deleted as soon as the temporary directory is closed.
            SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
            loader = SimpleDirectoryReader(temp_dir, file_extractor=self.file_extractor)

            return loader.load_data()


async def download_file_from_opendal(op: Any, temp_dir: str, path: str) -> str:
    """Download file from OpenDAL.

    Args:
        op: the `opendal.AsyncOperator` to read from.
        temp_dir: local directory in which the downloaded copy is created.
        path: path of the remote file to download.

    Returns:
        str: path of the local copy. The file gets a unique generated name
        that keeps the suffix of `path`, so that extension-based file
        extractors still recognise it.

    """
    import opendal

    op = cast(opendal.AsyncOperator, op)

    suffix = Path(path).suffix

    async with op.open_reader(path) as reader:
        content = await reader.read()

    # Use the public tempfile API to obtain a unique name inside temp_dir.
    # delete=False keeps the file on disk after the handle is closed; cleanup
    # is the responsibility of whoever owns temp_dir.
    with tempfile.NamedTemporaryFile(
        mode="wb", suffix=suffix, dir=temp_dir, delete=False
    ) as local_file:
        local_file.write(content)
        filepath = local_file.name

    return filepath


async def download_dir_from_opendal(op: Any, temp_dir: str, dir: str) -> None:
    """Download directory from opendal.

    Every file found by scanning `dir` is downloaded into `temp_dir` (flat,
    each under a unique generated name).

    Args:
        op: the `opendal.AsyncOperator` to read from.
        temp_dir: local directory in which the downloaded copies are created.
        dir: path of the remote directory to download.

    """
    import opendal

    op = cast(opendal.AsyncOperator, op)
    async for obj in await op.scan(dir):
        # OpenDAL denotes directories with a trailing "/"; such entries have
        # no content to read, so only file entries are downloaded.
        if obj.path.endswith("/"):
            continue
        await download_file_from_opendal(op, temp_dir, obj.path)