"""Obsidian reader class. Pass in the path to an Obsidian vault and it will parse all markdown files into a List of Documents, with each Document containing text from under an Obsidian header. """ import os from pathlib import Path from typing import Any, List from gpt_index.readers.base import BaseReader from gpt_index.readers.file.markdown_parser import MarkdownParser from gpt_index.readers.schema.base import Document from langchain.docstore.document import Document as LCDocument class ObsidianReader(BaseReader): """Utilities for loading data from an Obsidian Vault. Args: input_dir (str): Path to the vault. """ def __init__(self, input_dir: str): """Init params.""" self.input_dir = Path(input_dir) def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: """Load data from the input directory.""" docs: List[str] = [] for (dirpath, dirnames, filenames) in os.walk(self.input_dir): dirnames[:] = [d for d in dirnames if not d.startswith(".")] for filename in filenames: if filename.endswith(".md"): filepath = os.path.join(dirpath, filename) content = MarkdownParser().parse_file(Path(filepath)) docs.extend(content) return [Document(d) for d in docs] def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: """Load data in LangChain document format.""" docs = self.load_data(**load_kwargs) return [d.to_langchain_format() for d in docs]