Jerry Liu e97bb81915
swap out gpt_index imports for llama_index imports (#49)
* cr

* cr

* cr

---------

Co-authored-by: Jerry Liu <jerry@robustintelligence.com>
Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
2023-02-20 21:46:58 -08:00

46 lines
1.6 KiB
Python

"""Obsidian reader class.
Pass in the path to an Obsidian vault and it will parse all markdown
files into a List of Documents,
with each Document containing text from under an Obsidian header.
"""
import os
from pathlib import Path
from typing import Any, List
from llama_index.readers.base import BaseReader
from llama_index.readers.file.markdown_parser import MarkdownParser
from llama_index.readers.schema.base import Document
from langchain.docstore.document import Document as LCDocument
class ObsidianReader(BaseReader):
"""Utilities for loading data from an Obsidian Vault.
Args:
input_dir (str): Path to the vault.
"""
def __init__(self, input_dir: str):
"""Init params."""
self.input_dir = Path(input_dir)
def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory."""
docs: List[str] = []
for (dirpath, dirnames, filenames) in os.walk(self.input_dir):
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
for filename in filenames:
if filename.endswith(".md"):
filepath = os.path.join(dirpath, filename)
content = MarkdownParser().parse_file(Path(filepath))
docs.extend(content)
return [Document(d) for d in docs]
def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
"""Load data in LangChain document format."""
docs = self.load_data(**load_kwargs)
return [d.to_langchain_format() for d in docs]