mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 03:31:41 +00:00
72 lines
1.8 KiB
Python
72 lines
1.8 KiB
Python
"""Hugging Face file reader.

A parser for HF files: JSON Lines data, optionally gzip-compressed, read
through the Hugging Face Hub file system.

"""
|
|
import json
|
|
from pathlib import Path
|
|
from tempfile import TemporaryDirectory
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import pandas as pd
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class HuggingFaceFSReader(BaseReader):
    """Hugging Face File System reader.

    Uses the new Filesystem API from the Hugging Face Hub client library.

    The constructor takes no arguments. Each ``load_*`` method receives a
    ``path`` that is handed unchanged to ``HfFileSystem.read_bytes``. The
    file is treated as newline-separated JSON, and is gunzipped first when
    its name carries a ``.gz`` suffix.
    """

    def __init__(self) -> None:
        """Create the underlying ``HfFileSystem`` client."""
        # Local import: huggingface_hub is only required once a reader is
        # actually instantiated, not when this module is imported.
        from huggingface_hub import HfFileSystem

        self.fs = HfFileSystem()

    def load_dicts(self, path: str) -> List[Dict]:
        """Read the file at ``path`` and parse each line as JSON.

        Args:
            path: Location of the file, as understood by ``HfFileSystem``.

        Returns:
            The parsed JSON values, one per successfully parsed line, in
            file order. Lines that are not valid JSON (including empty
            lines) are skipped silently.

        Raises:
            UnicodeDecodeError: If the (decompressed) content is not valid
                UTF-8.
        """
        payload = self.fs.read_bytes(path)

        if ".gz" in Path(path).suffixes:
            import gzip

            # Decompress in memory: the bytes are already loaded, so there
            # is no need for a temporary file or a file handle to close.
            payload = gzip.decompress(payload)

        json_dicts = []
        # Split on "\n" only (not str.splitlines) so that other Unicode line
        # separators inside JSON strings do not break a record apart.
        for line in payload.decode().split("\n"):
            try:
                parsed = json.loads(line)
            except json.decoder.JSONDecodeError:
                # Best-effort parsing: ignore blank or malformed lines.
                continue
            json_dicts.append(parsed)
        return json_dicts

    def load_df(self, path: str) -> pd.DataFrame:
        """Load the file at ``path`` into a pandas DataFrame.

        Args:
            path: Location of the file, as understood by ``HfFileSystem``.

        Returns:
            A DataFrame built from the records returned by ``load_dicts``.
        """
        return pd.DataFrame(self.load_dicts(path))

    def load_data(self, path: str) -> List[Document]:
        """Load the file at ``path`` as a list of documents.

        Args:
            path: Location of the file, as understood by ``HfFileSystem``.

        Returns:
            One ``Document`` per parsed record; the document text is the
            ``str()`` representation of that record.
        """
        return [Document(str(record)) for record in self.load_dicts(path)]
|