mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 19:51:25 +00:00
45 lines
1.3 KiB
Python
45 lines
1.3 KiB
Python
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, Optional, List
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class IPYNBReader(BaseReader):
    """Ipynb file loader.

    Reads Jupyter notebook files by converting them to a script with
    nbconvert's ``ScriptExporter`` and splitting that script on the
    ``In[...]:`` input-prompt markers, so every notebook cell becomes its
    own chunk of text.

    Args:
        parser_config: Optional parser configuration. It is stored on the
            instance; ``load_data`` does not consult it.
        concatenate: If True, ``load_data`` returns a single ``Document``
            containing every cell joined by blank lines. If False (default),
            it returns one ``Document`` per cell.
    """

    # Input-prompt marker written by the script exporter ahead of each cell.
    # Executed cells are numbered ("In[3]:"); cells that were never run carry
    # a single space instead of a number ("In[ ]:"), so both forms must match.
    _CELL_MARKER = re.compile(r"In\[(?:\d+| )\]:")

    def __init__(
        self,
        parser_config: Optional[Dict] = None,
        concatenate: bool = False,
    ):
        """Init params."""
        self._parser_config = parser_config
        self._concatenate = concatenate

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file: Path of the notebook to read. Its name must end with
                ``.ipynb``.
            extra_info: Optional metadata attached to every returned
                ``Document``. When omitted, documents are built from the
                text alone.

        Returns:
            One ``Document`` per notebook cell, or a single ``Document``
            with all cells joined by blank lines when the reader was
            created with ``concatenate=True``.

        Raises:
            ValueError: If ``file`` is not named ``*.ipynb``.
            ImportError: If ``nbconvert`` is not installed.
        """
        # Reject unsupported files up front with a clear message instead of
        # failing later on an unbound name.
        if not file.name.endswith(".ipynb"):
            raise ValueError(
                f"IPYNBReader only supports .ipynb files, got '{file.name}'"
            )

        # Imported lazily so nbconvert is only required when this reader is
        # actually used.
        try:
            from nbconvert.exporters import ScriptExporter
        except ImportError as err:
            raise ImportError(
                "Please install nbconvert 'pip install nbconvert' "
            ) from err

        # from_file returns (script_text, resources); only the text is needed.
        script = ScriptExporter().from_file(file)[0]

        # Split each In[] cell into a separate string.
        cells = self._CELL_MARKER.split(script)
        # Drop whatever precedes the first marker: it is the exporter's
        # header, not the content of a cell.
        cells.pop(0)

        def make_document(text: str) -> Document:
            # Pass metadata only when the caller supplied some, so calls
            # without extra_info construct documents exactly as before.
            # Each document gets its own copy to avoid shared mutation.
            if extra_info is None:
                return Document(text=text)
            return Document(text=text, extra_info=dict(extra_info))

        if self._concatenate:
            return [make_document("\n\n".join(cells))]
        return [make_document(cell) for cell in cells]
|