Sivasurya Santhanam 1b2e016a36
utf-8 encoding fixes UnicodeDecodeError (#196)
Opening the file with encoding = 'utf-8', fixes the following UnicodeDecodeError
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1094: character maps to

OS: Windows 10 x64
2023-04-15 00:09:11 -07:00

43 lines
1.2 KiB
Python

"""Simple CSV reader.
A parser for tabular data files.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class SimpleCSVReader(BaseReader):
"""CSV parser.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
"""
def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
def load_data(
self, file: Path, extra_info: Optional[Dict] = None
) -> List[Document]:
"""Parse file."""
import csv
text_list = []
with open(file, "r", encoding='utf-8') as fp:
csv_reader = csv.reader(fp)
for row in csv_reader:
text_list.append(", ".join(row))
if self._concat_rows:
return [Document("\n".join(text_list), extra_info=extra_info)]
else:
return [Document(text, extra_info=extra_info) for text in text_list]