mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
feat: Support encoding parameter in partition_csv (#3564)
Adds an `encoding` parameter to `partition_csv`, which is passed directly to `pd.read_csv`. See the added UTF-16 test file and the accompanying test.
This commit is contained in:
parent
f21c853ade
commit
f440eb476c
@ -1,9 +1,11 @@
|
||||
## 0.15.9-dev0
|
||||
## 0.15.9-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
* **Add support for the `encoding` parameter in `partition_csv`**
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with `tempfile.TemporaryDirectory` to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
|
||||
|
||||
BIN
example-docs/stanley-cups-utf-16.csv
Normal file
BIN
example-docs/stanley-cups-utf-16.csv
Normal file
Binary file not shown.
|
@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename():
|
||||
assert elements[0].metadata.filename == "test"
|
||||
|
||||
|
||||
def test_partition_csv_with_encoding():
|
||||
elements = partition_csv(example_doc_path("stanley-cups-utf-16.csv"), encoding="utf-16")
|
||||
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "expected_text", "expected_table"),
|
||||
[
|
||||
@ -279,6 +285,7 @@ class Describe_CsvPartitioningContext:
|
||||
ctx = _CsvPartitioningContext.load(
|
||||
file_path=example_doc_path("stanley-cups.csv"),
|
||||
file=None,
|
||||
encoding=None,
|
||||
metadata_file_path=None,
|
||||
metadata_last_modified=None,
|
||||
include_header=True,
|
||||
@ -292,6 +299,7 @@ class Describe_CsvPartitioningContext:
|
||||
_CsvPartitioningContext.load(
|
||||
file_path=None,
|
||||
file=None,
|
||||
encoding=None,
|
||||
metadata_file_path=None,
|
||||
metadata_last_modified=None,
|
||||
include_header=True,
|
||||
|
||||
@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame
|
||||
def read_csv(
|
||||
filepath_or_buffer: str | IO[bytes],
|
||||
*,
|
||||
encoding: str | None = ...,
|
||||
sep: str | None = ...,
|
||||
header: int | None | Literal["infer"] = ...,
|
||||
) -> DataFrame: ...
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.9-dev0" # pragma: no cover
|
||||
__version__ = "0.15.9-dev1" # pragma: no cover
|
||||
|
||||
@ -207,6 +207,7 @@ def partition(
|
||||
elements = partition_csv(
|
||||
filename=filename,
|
||||
file=file,
|
||||
encoding=encoding,
|
||||
infer_table_structure=infer_table_structure,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
|
||||
@ -29,6 +29,7 @@ DETECTION_ORIGIN: str = "csv"
|
||||
def partition_csv(
|
||||
filename: str | None = None,
|
||||
file: IO[bytes] | None = None,
|
||||
encoding: str | None = None,
|
||||
metadata_filename: str | None = None,
|
||||
metadata_last_modified: str | None = None,
|
||||
include_header: bool = False,
|
||||
@ -47,6 +48,8 @@ def partition_csv(
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
metadata_filename
|
||||
The filename to use for the metadata.
|
||||
metadata_last_modified
|
||||
@ -73,6 +76,7 @@ def partition_csv(
|
||||
ctx = _CsvPartitioningContext(
|
||||
file_path=filename,
|
||||
file=file,
|
||||
encoding=encoding,
|
||||
metadata_file_path=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
include_header=include_header,
|
||||
@ -81,7 +85,7 @@ def partition_csv(
|
||||
)
|
||||
|
||||
with ctx.open() as file:
|
||||
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
|
||||
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)
|
||||
|
||||
html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
|
||||
text = soupparser_fromstring(html_text).text_content()
|
||||
@ -110,6 +114,7 @@ class _CsvPartitioningContext:
|
||||
self,
|
||||
file_path: str | None = None,
|
||||
file: IO[bytes] | None = None,
|
||||
encoding: str | None = None,
|
||||
metadata_file_path: str | None = None,
|
||||
metadata_last_modified: str | None = None,
|
||||
include_header: bool = False,
|
||||
@ -118,6 +123,7 @@ class _CsvPartitioningContext:
|
||||
):
|
||||
self._file_path = file_path
|
||||
self._file = file
|
||||
self._encoding = encoding
|
||||
self._metadata_file_path = metadata_file_path
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._include_header = include_header
|
||||
@ -129,6 +135,7 @@ class _CsvPartitioningContext:
|
||||
cls,
|
||||
file_path: str | None,
|
||||
file: IO[bytes] | None,
|
||||
encoding: str | None,
|
||||
metadata_file_path: str | None,
|
||||
metadata_last_modified: str | None,
|
||||
include_header: bool,
|
||||
@ -138,6 +145,7 @@ class _CsvPartitioningContext:
|
||||
return cls(
|
||||
file_path=file_path,
|
||||
file=file,
|
||||
encoding=encoding,
|
||||
metadata_file_path=metadata_file_path,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
include_header=include_header,
|
||||
@ -156,7 +164,9 @@ class _CsvPartitioningContext:
|
||||
|
||||
with self.open() as file:
|
||||
# -- read whole lines, sniffer can be confused by a trailing partial line --
|
||||
data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
|
||||
data = "\n".join(
|
||||
ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
|
||||
)
|
||||
|
||||
try:
|
||||
return sniffer.sniff(data, delimiters=",;").delimiter
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user