"""Pandas CSV reader. A parser for tabular data files using pandas. """ from pathlib import Path from typing import Any, Dict, List, Optional from llama_index.readers.base import BaseReader from llama_index.readers.schema.base import Document class PandasCSVReader(BaseReader): r"""Pandas-based CSV parser. Parses CSVs using the separator detection from Pandas `read_csv`function. If special parameters are required, use the `pandas_config` dict. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. col_joiner (str): Separator to use for joining cols per row. Set to ", " by default. row_joiner (str): Separator to use for joining each row. Only used when `concat_rows=True`. Set to "\n" by default. pandas_config (dict): Options for the `pandas.read_csv` function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information. Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own. """ def __init__( self, *args: Any, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", pandas_config: dict = {}, **kwargs: Any ) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows self._col_joiner = col_joiner self._row_joiner = row_joiner self._pandas_config = pandas_config def load_data( self, file: Path, extra_info: Optional[Dict] = None ) -> List[Document]: """Parse file.""" import pandas as pd df = pd.read_csv(file, **self._pandas_config) text_list = df.apply( lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 ).tolist() if self._concat_rows: return [Document((self._row_joiner).join(text_list), extra_info=extra_info)] else: return [Document(text, extra_info=extra_info) for text in text_list]