mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-13 19:21:15 +00:00
46 lines
1.5 KiB
Python
46 lines
1.5 KiB
Python
"""Obsidian reader class.

Pass in the path to an Obsidian vault and it will parse all markdown
files into a List of Documents,
with each Document containing text from under an Obsidian header.

"""
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, List
|
|
|
|
from langchain.docstore.document import Document as LCDocument
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.file.markdown_reader import MarkdownReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class ObsidianReader(BaseReader):
    """Reader that turns the markdown notes of an Obsidian vault into Documents.

    Args:
        input_dir (str): Path to the vault.

    """

    def __init__(self, input_dir: str):
        """Store the vault location as a ``Path``."""
        self.input_dir = Path(input_dir)

    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """Walk the vault and parse every ``.md`` file found.

        Directories whose name starts with ``.`` are not descended into.
        Each markdown file is handed to a fresh ``MarkdownReader`` and the
        Documents it returns are appended to the result in walk order.

        Args:
            *args: Accepted but not used by this method.
            **load_kwargs: Accepted but not used by this method.

        Returns:
            List[Document]: All Documents produced for the vault's
            markdown files (empty when none are found).
        """
        collected: List[Document] = []
        for root, subdirs, names in os.walk(self.input_dir):
            # Slice-assign so the list os.walk holds is mutated in place;
            # that is what stops the walk from entering hidden directories.
            subdirs[:] = [sub for sub in subdirs if not sub.startswith(".")]
            for note_name in names:
                if not note_name.endswith(".md"):
                    continue
                note_path = Path(os.path.join(root, note_name))
                collected.extend(MarkdownReader().load_data(note_path))
        return collected

    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
        """Load the vault and convert each Document to LangChain format.

        Args:
            **load_kwargs: Forwarded unchanged to ``load_data``.

        Returns:
            List[LCDocument]: One LangChain document per loaded Document,
            in the same order.
        """
        return [
            loaded.to_langchain_format()
            for loaded in self.load_data(**load_kwargs)
        ]
|