"""Simple reader that reads files of different formats from a directory.""" from pathlib import Path from typing import Callable, Dict, List, Optional from loader_hub.file.base_parser import BaseParser from loader_hub.file.docs_parser import DocxParser, PDFParser from loader_hub.file.image_parser import ImageParser from loader_hub.file.slides_parser import PptxParser from loader_hub.file.tabular_parser import CSVParser from loader_hub.file.video_audio import VideoAudioParser from gpt_index.readers.base import BaseReader from gpt_index.readers.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), ".docx": DocxParser(), ".pptx": PptxParser(), ".jpg": ImageParser(), ".png": ImageParser(), ".jpeg": ImageParser(), ".mp3": VideoAudioParser(), ".mp4": VideoAudioParser(), ".csv": CSVParser(), } class SimpleDirectoryReader(BaseReader): """Simple directory reader. Can read files into separate documents, or concatenates files into one document text. Args: input_dir (str): Path to the directory. exclude_hidden (bool): Whether to exclude hidden files (dotfiles). errors (str): how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open recursive (bool): Whether to recursively search in subdirectories. False by default. required_exts (Optional[List[str]]): List of required extensions. Default is None. file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file extension to a BaseParser class that specifies how to convert that file to text. See DEFAULT_FILE_EXTRACTOR. num_files_limit (Optional[int]): Maximum number of files to read. Default is None. file_metadata (Optional[Callable[str, Dict]]): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None. """ def __init__( self, input_dir: str, exclude_hidden: bool = True, errors: str = "ignore", recursive: bool = False, required_exts: Optional[List[str]] = None, file_extractor: Optional[Dict[str, BaseParser]] = None, num_files_limit: Optional[int] = None, file_metadata: Optional[Callable[[str], Dict]] = None, verbose: bool = False, ) -> None: """Initialize with parameters.""" super().__init__(verbose=verbose) self.input_dir = Path(input_dir) self.errors = errors self.recursive = recursive self.exclude_hidden = exclude_hidden self.required_exts = required_exts self.num_files_limit = num_files_limit self.input_files = self._add_files(self.input_dir) self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR self.file_metadata = file_metadata def _add_files(self, input_dir: Path) -> List[Path]: """Add files.""" input_files = sorted(input_dir.iterdir()) new_input_files = [] dirs_to_explore = [] for input_file in input_files: if self.exclude_hidden and input_file.stem.startswith("."): continue elif input_file.is_dir(): if self.recursive: dirs_to_explore.append(input_file) elif ( self.required_exts is not None and input_file.suffix not in self.required_exts ): continue else: new_input_files.append(input_file) for dir_to_explore in dirs_to_explore: sub_input_files = self._add_files(dir_to_explore) new_input_files.extend(sub_input_files) if self.num_files_limit is not None and self.num_files_limit > 0: new_input_files = new_input_files[0 : self.num_files_limit] # print total number of files added if self.verbose: print( f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}" ) return new_input_files def load_data(self, concatenate: bool = False) -> List[Document]: """Load data from the input directory. Args: concatenate (bool): whether to concatenate all files into one document. If set to True, file metadata is ignored. False by default. Returns: List[Document]: A list of documents. """ data = "" data_list = [] metadata_list = [] for input_file in self.input_files: if input_file.suffix in self.file_extractor: parser = self.file_extractor[input_file.suffix] if not parser.parser_config_set: parser.init_parser() data = parser.parse_file(input_file, errors=self.errors) else: # do standard read with open(input_file, "r", errors=self.errors) as f: data = f.read() data_list.append(data) if self.file_metadata is not None: metadata_list.append(self.file_metadata(str(input_file))) if concatenate: return [Document("\n".join(data_list))] elif self.file_metadata is not None: return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] else: return [Document(d) for d in data_list]