feat: Support encoding parameter in partition_csv (#3564)

Add support for an `encoding` parameter to `partition_csv`; the value is passed
directly to `pd.read_csv`. See the added test and example file.
This commit is contained in:
Austin Walker 2024-08-28 10:19:58 -04:00 committed by GitHub
parent f21c853ade
commit f440eb476c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 26 additions and 4 deletions

View File

@ -1,9 +1,11 @@
## 0.15.9-dev0
## 0.15.9-dev1
### Enhancements
### Features
* **Add support for encoding parameter in partition_csv**
### Fixes
* **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with `TemporaryDirectory` to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile

Binary file not shown.
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13

View File

@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename():
assert elements[0].metadata.filename == "test"
def test_partition_csv_with_encoding():
    """Partitioning the UTF-16 example CSV with `encoding="utf-16"` yields the expected text."""
    utf_16_csv_path = example_doc_path("stanley-cups-utf-16.csv")

    partitioned = partition_csv(utf_16_csv_path, encoding="utf-16")

    # -- whitespace is normalized before comparing against the shared expected text --
    first_element_text = clean_extra_whitespace(partitioned[0].text)
    assert first_element_text == EXPECTED_TEXT
@pytest.mark.parametrize(
("filename", "expected_text", "expected_table"),
[
@ -279,6 +285,7 @@ class Describe_CsvPartitioningContext:
ctx = _CsvPartitioningContext.load(
file_path=example_doc_path("stanley-cups.csv"),
file=None,
encoding=None,
metadata_file_path=None,
metadata_last_modified=None,
include_header=True,
@ -292,6 +299,7 @@ class Describe_CsvPartitioningContext:
_CsvPartitioningContext.load(
file_path=None,
file=None,
encoding=None,
metadata_file_path=None,
metadata_last_modified=None,
include_header=True,

View File

@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame
# Type-stub signature for `read_csv` (called as `pd.read_csv` by the CSV partitioner).
# Only the parameters listed here are declared by this stub.
def read_csv(
# Source to read: a path string or a binary file-like object.
filepath_or_buffer: str | IO[bytes],
# The bare `*` makes every following parameter keyword-only.
*,
# `= ...` marks a parameter as having a default whose value the stub does not spell out.
encoding: str | None = ...,
sep: str | None = ...,
header: int | None | Literal["infer"] = ...,
) -> DataFrame: ...

View File

@ -1 +1 @@
__version__ = "0.15.9-dev0" # pragma: no cover
__version__ = "0.15.9-dev1" # pragma: no cover

View File

@ -207,6 +207,7 @@ def partition(
elements = partition_csv(
filename=filename,
file=file,
encoding=encoding,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,

View File

@ -29,6 +29,7 @@ DETECTION_ORIGIN: str = "csv"
def partition_csv(
filename: str | None = None,
file: IO[bytes] | None = None,
encoding: str | None = None,
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
include_header: bool = False,
@ -47,6 +48,8 @@ def partition_csv(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
metadata_filename
The filename to use for the metadata.
metadata_last_modified
@ -73,6 +76,7 @@ def partition_csv(
ctx = _CsvPartitioningContext(
file_path=filename,
file=file,
encoding=encoding,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
include_header=include_header,
@ -81,7 +85,7 @@ def partition_csv(
)
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)
html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
@ -110,6 +114,7 @@ class _CsvPartitioningContext:
self,
file_path: str | None = None,
file: IO[bytes] | None = None,
encoding: str | None = None,
metadata_file_path: str | None = None,
metadata_last_modified: str | None = None,
include_header: bool = False,
@ -118,6 +123,7 @@ class _CsvPartitioningContext:
):
self._file_path = file_path
self._file = file
self._encoding = encoding
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._include_header = include_header
@ -129,6 +135,7 @@ class _CsvPartitioningContext:
cls,
file_path: str | None,
file: IO[bytes] | None,
encoding: str | None,
metadata_file_path: str | None,
metadata_last_modified: str | None,
include_header: bool,
@ -138,6 +145,7 @@ class _CsvPartitioningContext:
return cls(
file_path=file_path,
file=file,
encoding=encoding,
metadata_file_path=metadata_file_path,
metadata_last_modified=metadata_last_modified,
include_header=include_header,
@ -156,7 +164,9 @@ class _CsvPartitioningContext:
with self.open() as file:
# -- read whole lines, sniffer can be confused by a trailing partial line --
data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
data = "\n".join(
ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
)
try:
return sniffer.sniff(data, delimiters=",;").delimiter