Logan 62f94d0eba
add concat rows to pandas excel (#262)
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
2023-05-17 09:16:05 -07:00

70 lines
2.2 KiB
Python

"""Pandas Excel reader.
Pandas parser for .xlsx files.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Reads Excel workbooks with `pandas.read_excel` and converts their rows
    into `Document` objects. If special parameters are required, use the
    `pandas_config` dict.

    Args:
        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to
            https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Defaults to None, which means the pandas
            defaults will be used.
        concat_rows (bool): If True, all rows are joined into a single
            Document. If False, one Document is created per row.
            Defaults to True.
        row_joiner (str): Separator placed between rows when `concat_rows`
            is True. Defaults to a newline.
        col_joiner (str): Separator placed between the cells of one row.
            Defaults to ", ".
    """

    def __init__(
        self,
        *args: Any,
        pandas_config: Optional[dict] = None,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        col_joiner: str = ", ",
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        # Copy the options so instances never share a mutable default and
        # later changes to the caller's dict do not leak into this reader.
        self._pandas_config = dict(pandas_config) if pandas_config else {}
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        self._col_joiner = col_joiner

    def load_data(
        self,
        file: Path,
        sheet_name: Optional[Union[str, int]] = None,
        extra_info: Optional[Dict] = None,
    ) -> List[Document]:
        """Parse an Excel file and build Documents from its rows.

        Every cell is converted to a string (so missing values appear as
        "nan") and the cells of a row are joined with `col_joiner`. Column
        headers are not part of the output because only the cell values are
        read.

        Args:
            file (Path): The path to the Excel file to read.
            sheet_name (Optional[Union[str, int]]): Name or index of the
                sheet to read. None (the default) reads every sheet, in
                workbook order.
            extra_info (Optional[Dict]): Extra information attached to every
                returned Document.

        Returns:
            List[Document]: A single Document holding all rows joined by
                `row_joiner` when `concat_rows` is True, otherwise one
                Document per row.
        """
        import pandas as pd

        loaded = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)

        # read_excel returns one DataFrame for a str/int sheet_name but a
        # dict of DataFrames when sheet_name is None (all sheets); normalise
        # both shapes to a list of frames so rows are handled uniformly.
        if isinstance(loaded, pd.DataFrame):
            frames = [loaded]
        else:
            frames = list(loaded.values())

        # One string per spreadsheet row, across all selected sheets.
        row_texts = [
            self._col_joiner.join(cells)
            for frame in frames
            for cells in frame.values.astype(str).tolist()
        ]

        if self._concat_rows:
            return [
                Document(self._row_joiner.join(row_texts), extra_info=extra_info)
            ]
        return [Document(text, extra_info=extra_info) for text in row_texts]