2023-04-26 14:03:19 -07:00

72 lines
1.8 KiB
Python

"""Hugging Face file reader.

A parser for JSON Lines files (optionally gzip-compressed) that are read
through the Hugging Face Hub file system API.
"""
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, List, Optional
import pandas as pd
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class HuggingFaceFSReader(BaseReader):
    r"""Hugging Face File System reader.

    Uses the new Filesystem API from the Hugging Face Hub client library.

    The constructor takes no arguments; it creates an ``HfFileSystem``
    instance and stores it on ``self.fs``, which every ``load_*`` method
    uses to fetch the raw file bytes.
    """

    def __init__(self) -> None:
        """Create the ``HfFileSystem`` handle used for all reads."""
        # Imported here rather than at module level so that importing this
        # module does not itself require huggingface_hub.
        from huggingface_hub import HfFileSystem

        self.fs = HfFileSystem()

    def load_dicts(self, path: str) -> List[Dict]:
        """Parse a JSON Lines file into a list of decoded JSON values.

        Args:
            path: Location handed to ``self.fs.read_bytes``. When ``".gz"``
                is among the path's suffixes, the content is
                gzip-decompressed before parsing.

        Returns:
            The decoded value of every line that is valid JSON, in file
            order. Lines that fail to parse (blank lines included) are
            skipped rather than raising.
        """
        payload = self.fs.read_bytes(path)

        if ".gz" in Path(path).suffixes:
            import gzip

            # Decompress in memory: the bytes are already loaded, so there is
            # no need for a temporary file, and no file handle is left open.
            payload = gzip.decompress(payload)

        text = payload.decode()

        json_dicts = []
        for line in text.split("\n"):
            try:
                parsed = json.loads(line)
            except json.decoder.JSONDecodeError:
                # Best-effort parsing: ignore blank or malformed lines.
                continue
            json_dicts.append(parsed)
        return json_dicts

    def load_df(self, path: str) -> pd.DataFrame:
        """Load the file at ``path`` as a pandas DataFrame.

        Args:
            path: Location of the file; parsed with :meth:`load_dicts`.

        Returns:
            A DataFrame built from the list returned by :meth:`load_dicts`.
        """
        return pd.DataFrame(self.load_dicts(path))

    def load_data(self, path: str) -> List[Document]:
        """Load the file at ``path`` as a list of documents.

        Args:
            path: Location of the file; parsed with :meth:`load_dicts`.

        Returns:
            One ``Document`` per parsed line, each constructed from the
            ``str()`` representation of that line's decoded value.
        """
        return [Document(str(record)) for record in self.load_dicts(path)]