46 lines
1.6 KiB
Python
Raw Normal View History

2023-02-03 00:05:28 -08:00
"""Obsidian reader class.
Pass in the path to an Obsidian vault and it will parse all markdown
files into a List of Documents,
with each Document containing text from under an Obsidian header.
"""
import os
from pathlib import Path
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.readers.file.markdown_parser import MarkdownParser
from gpt_index.readers.schema.base import Document
2023-02-03 23:38:12 -08:00
from langchain.docstore.document import Document as LCDocument
2023-02-03 00:05:28 -08:00
class ObsidianReader(BaseReader):
    """Utilities for loading data from an Obsidian Vault.

    Args:
        input_dir (str): Path to the vault.
    """

    def __init__(self, input_dir: str):
        """Init params.

        Args:
            input_dir (str): Path to the vault; stored as a ``Path``.
        """
        self.input_dir = Path(input_dir)

    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """Load data from the input directory.

        Walks ``input_dir`` recursively, skipping every directory whose name
        starts with ``"."``, and passes each file whose name ends in ``.md``
        to ``MarkdownParser.parse_file``. Every item the parser returns is
        wrapped in its own ``Document``.

        Args:
            *args: Unused by this reader.
            **load_kwargs: Unused by this reader.

        Returns:
            List[Document]: One document per item returned by the parser.
            Directories and files are visited in sorted-name order, so the
            result is deterministic for a given vault.
        """
        # The parser is loop-invariant: build it once instead of per file.
        parser = MarkdownParser()
        docs: List[str] = []
        for dirpath, dirnames, filenames in os.walk(self.input_dir):
            # Slice-assign so os.walk itself sees the pruned list and never
            # descends into dot-directories; sorting fixes the visit order.
            dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
            for filename in sorted(filenames):
                if not filename.endswith(".md"):
                    continue
                docs.extend(parser.parse_file(Path(dirpath) / filename))
        return [Document(d) for d in docs]

    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
        """Load data in LangChain document format.

        Args:
            **load_kwargs: Forwarded unchanged to ``load_data``.

        Returns:
            List[LCDocument]: The result of ``to_langchain_format()`` for
            each document produced by ``load_data``.
        """
        return [doc.to_langchain_format() for doc in self.load_data(**load_kwargs)]