Josh XT 61de0c1648
Json data loader (#170)
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
2023-04-07 22:03:23 -07:00

55 lines
1.6 KiB
Python

"""Json Data Reader."""
import json
import re
from typing import Dict, Generator, List, Union
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
def _depth_first_yield(json_data: Dict, path: List[str]) -> Generator[str, None, None]:
"""Do depth first yield of all of the leaf nodes of a JSON.
Combines keys in the JSON tree using spaces.
"""
if isinstance(json_data, dict):
for key, value in json_data.items():
new_path = path[:]
new_path.append(key)
yield from _depth_first_yield(value, new_path)
elif isinstance(json_data, list):
for _, value in enumerate(json_data):
yield from _depth_first_yield(value, path)
else:
path.append(str(json_data))
yield " ".join(path)
class JSONDataReader(BaseReader):
    """Reader that turns in-memory JSON data into a single text document.

    The JSON payload is not given to the constructor; it is passed to
    ``load_data`` as either a JSON string or an already-parsed dictionary.
    """

    def __init__(self) -> None:
        """Create the reader; it takes no configuration."""
        super().__init__()

    def load_data(self, input_data: Union[str, Dict]) -> List[Document]:
        """Convert JSON data into one document of its non-structural lines.

        Args:
            input_data: Json data to read. A string is parsed with
                ``json.loads``; any other value is used as-is.

        Returns:
            A one-element list holding a document whose text is the JSON
            re-serialized one item per line, with the lines that contain
            only braces, brackets and commas removed.
        """
        parsed = json.loads(input_data) if isinstance(input_data, str) else input_data

        # indent=0 makes json.dumps insert newlines without any leading
        # whitespace, so purely structural lines are easy to recognise.
        kept = []
        for row in json.dumps(parsed, indent=0).split("\n"):
            if re.match(r"^[{}\[\],]*$", row):
                # Nothing but JSON punctuation (or empty): drop it.
                continue
            kept.append(row)

        return [Document("\n".join(kept))]