mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 11:41:56 +00:00
70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
"""Pandas Excel reader.

Pandas parser for .xlsx files.

"""
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Union
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel reader.

    Loads a workbook with ``pandas.read_excel`` and turns its cell values into
    ``Document`` objects. Every cell is converted to a string; cells are
    visited sheet by sheet and, within a sheet, column by column.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        pandas_config (Optional[dict]): Options for the `pandas.read_excel`
            function call. Refer to
            https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. ``None`` (the default) means the pandas
            defaults will be used.
        concat_rows (bool): If True (the default), all extracted values are
            joined with ``row_joiner`` into a single ``Document``; otherwise
            one ``Document`` is produced per extracted value.
        row_joiner (str): Separator placed between values when ``concat_rows``
            is True. Defaults to "\n".

    """

    def __init__(
        self,
        *args: Any,
        pandas_config: Optional[dict] = None,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        # Copy the options so that neither a shared default object nor the
        # caller's own dict is aliased by this instance.
        self._pandas_config = dict(pandas_config) if pandas_config else {}
        self._concat_rows = concat_rows
        # load_data reads this attribute when concat_rows is enabled.
        self._row_joiner = row_joiner

    def load_data(
        self,
        file: Path,
        sheet_name: Optional[Union[str, int]] = None,
        extra_info: Optional[Dict] = None,
    ) -> List[Document]:
        """Parse an Excel file and extract its cell values as text.

        Args:
            file (Path): The path to the Excel file to read.
            sheet_name (Optional[Union[str, int]]): Forwarded to
                ``pandas.read_excel`` to select a sheet. ``None`` (the
                default) makes pandas load every sheet of the workbook.
            extra_info (Optional[Dict]): Passed unchanged as ``extra_info`` to
                every ``Document`` that is created.

        Returns:
            List[Document]: A single ``Document`` holding all values joined by
                the row joiner when ``concat_rows`` is True, otherwise one
                ``Document`` per extracted value.
        """
        import pandas as pd

        loaded = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)

        # read_excel returns one DataFrame when a single sheet is selected but
        # a dict of DataFrames when several (or all) sheets are loaded.
        # Normalise both shapes to a list of frames so that the extraction
        # below always operates on one sheet at a time.
        if isinstance(loaded, pd.DataFrame):
            frames = [loaded]
        else:
            frames = list(loaded.values())

        # Stringify every cell, walking each sheet column by column.
        text_list = [
            value
            for frame in frames
            for column in frame.columns
            for value in frame[column].values.astype(str).tolist()
        ]

        if self._concat_rows:
            return [Document((self._row_joiner).join(text_list), extra_info=extra_info)]
        return [Document(text, extra_info=extra_info) for text in text_list]
|