llama-hub/loader_hub/huggingface/fs/base.py

"""Hugging Face file reader.

Reads JSON Lines files (optionally gzip-compressed) through the Hugging Face
Hub file system API.

"""
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, List, Optional

import pandas as pd
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class HuggingFaceFSReader(BaseReader):
    r"""Hugging Face File System reader.

    Uses the new Filesystem API from the Hugging Face Hub client library
    (``HfFileSystem``) to read JSON Lines files, optionally
    gzip-compressed, and expose their records as plain Python objects,
    a pandas DataFrame, or a list of ``Document`` objects.

    The constructor takes no arguments.
    """

    def __init__(self) -> None:
        """Create the ``HfFileSystem`` client used by all load methods."""
        # Imported here rather than at module level, so huggingface_hub is
        # only required once a reader is actually instantiated.
        from huggingface_hub import HfFileSystem

        self.fs = HfFileSystem()

    def load_dicts(self, path: str) -> List[Dict]:
        """Read a JSON Lines file and return its parsed records.

        Args:
            path: Path handed to ``self.fs.read_bytes``. If any of its
                suffixes is ``.gz`` the content is gunzipped before
                parsing.

        Returns:
            The parsed JSON value of every line that is valid JSON, in
            file order. Lines that fail to parse (blank lines included)
            are silently skipped.
        """
        payload = self.fs.read_bytes(path)

        if ".gz" in Path(path).suffixes:
            import gzip

            # The compressed bytes are already in memory, so decompress
            # them directly instead of writing a temporary file and
            # reopening it; this also leaves no file handle to close.
            payload = gzip.decompress(payload)

        json_dicts = []
        for line in payload.decode().split("\n"):
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort parsing: skip anything that is not JSON.
                continue
            json_dicts.append(record)
        return json_dicts

    def load_df(self, path: str) -> pd.DataFrame:
        """Load the records at ``path`` into a pandas DataFrame.

        Args:
            path: Path forwarded to ``load_dicts``.

        Returns:
            ``pd.DataFrame`` built from the list ``load_dicts`` returns.
        """
        return pd.DataFrame(self.load_dicts(path))

    def load_data(self, path: str) -> List[Document]:
        """Load the records at ``path`` as documents.

        Args:
            path: Path forwarded to ``load_dicts``.

        Returns:
            One ``Document`` per parsed record, constructed from the
            ``str()`` representation of that record.
        """
        return [Document(str(record)) for record in self.load_dicts(path)]