46 lines
1.5 KiB
Python
Raw Normal View History

2023-02-03 00:05:28 -08:00
"""Obsidian reader class.

Pass in the path to an Obsidian vault and it will parse all markdown
files into a list of Documents, with each Document containing the text
found under an Obsidian header.
"""
import os
from pathlib import Path
from typing import Any, List
2023-02-24 23:39:32 -08:00
from langchain.docstore.document import Document as LCDocument
from llama_index.readers.base import BaseReader
from llama_index.readers.file.markdown_reader import MarkdownReader
from llama_index.readers.schema.base import Document
2023-02-03 00:05:28 -08:00
class ObsidianReader(BaseReader):
    """Utilities for loading data from an Obsidian Vault.

    Args:
        input_dir (str): Path to the vault.
    """

    def __init__(self, input_dir: str):
        """Init params."""
        self.input_dir = Path(input_dir)

    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """Load data from the input directory.

        Walks ``input_dir`` recursively, skipping every directory whose name
        starts with ``"."``, and hands each file whose name ends in ``".md"``
        to ``MarkdownReader.load_data``.

        Args:
            *args: Accepted for interface compatibility; not used here.
            **load_kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List[Document]: The documents returned by ``MarkdownReader`` for
            all markdown files, concatenated in traversal order.
        """
        # One reader instance is enough; it is reused for every file.
        markdown_reader = MarkdownReader()
        docs: List[Document] = []

        for dirpath, dirnames, filenames in os.walk(self.input_dir):
            # Prune in place: os.walk only skips a subdirectory if it is
            # removed from the very list object it handed us.
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]

            for filename in filenames:
                if not filename.endswith(".md"):
                    continue
                docs.extend(markdown_reader.load_data(Path(dirpath) / filename))

        return docs

    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
        """Load data in LangChain document format.

        Args:
            **load_kwargs: Forwarded unchanged to ``load_data``.

        Returns:
            List[LCDocument]: Each loaded document converted with its
            ``to_langchain_format()`` method.
        """
        docs = self.load_data(**load_kwargs)
        return [d.to_langchain_format() for d in docs]