llama-hub/loader_hub/file/pandas_excel/base.py

"""Pandas Excel reader.

Pandas parser for .xlsx files.

"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Reads spreadsheets with `pandas.read_excel` and turns every row into a
    line of text. If special parameters are required, use the
    `pandas_config` dict.

    Args:
        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Defaults to None, which means the pandas
            defaults will be used.
        concat_rows (bool): If True, all rows are joined into a single
            Document; otherwise one Document is produced per row.
            Defaults to True.
        row_joiner (str): Separator placed between rows when `concat_rows`
            is True. Defaults to "\n".
        col_joiner (str): Separator placed between the cells of one row.
            Defaults to " ".

    """

    def __init__(
        self,
        *args: Any,
        pandas_config: Optional[dict] = None,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        col_joiner: str = " ",
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        # A None default avoids one dict instance being shared by all readers.
        self._pandas_config = {} if pandas_config is None else pandas_config
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        self._col_joiner = col_joiner

    def load_data(
        self,
        file: Path,
        sheet_name: Optional[Union[str, int]] = None,
        extra_info: Optional[Dict] = None,
    ) -> List[Document]:
        """Parse an Excel file and convert its rows to Documents.

        Args:
            file (Path): The path to the Excel file to read.
            sheet_name (Optional[Union[str, int]]): Sheet to read, by name or
                position. None (the default) reads every sheet.
            extra_info (Optional[Dict]): Metadata attached to every returned
                Document.

        Returns:
            List[Document]: A single Document holding all rows when
            `concat_rows` is True, otherwise one Document per row. Every cell
            is converted with `astype(str)`.
        """
        import pandas as pd

        loaded = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)

        # `read_excel` returns one DataFrame for a single sheet and a dict of
        # DataFrames keyed by sheet when `sheet_name` is None; normalise both
        # shapes so the row extraction below never iterates over columns.
        if isinstance(loaded, pd.DataFrame):
            frames = [loaded]
        else:
            frames = list(loaded.values())

        # One text line per spreadsheet row, sheets kept in the order read.
        text_list = [
            self._col_joiner.join(row)
            for frame in frames
            for row in frame.values.astype(str).tolist()
        ]

        if self._concat_rows:
            return [Document(self._row_joiner.join(text_list), extra_info=extra_info)]
        return [Document(text, extra_info=extra_info) for text in text_list]