rfctr(csv): accommodate single column CSV files (#3483)

**Summary**
Improve factoring, type-annotation, and tests for `partition_csv()` and
accommodate single-column CSV files.

Fixes: #2616
This commit is contained in:
Steve Canny 2024-08-05 17:48:37 -07:00 committed by GitHub
parent 59ec64235b
commit a468b2de3b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 448 additions and 126 deletions

View File

@ -1,13 +1,13 @@
## 0.15.2-dev0
## 0.15.2-dev1
### Enhancements
### Features
### Fixes
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
## 0.15.1

View File

@ -0,0 +1,5 @@
Lorem, ipsum; dolor sit; amet
consectetur; adipiscing; elit
sed, do; eiusmod; tempor incididunt
ut labore; et, dolore; magna aliqua
Ut enim; ad minim; veniam, quis
1 Lorem, ipsum dolor sit amet
2 consectetur adipiscing elit
3 sed, do eiusmod tempor incididunt
4 ut labore et, dolore magna aliqua
5 Ut enim ad minim veniam, quis

View File

@ -0,0 +1,9 @@
Lorem, ipsum
dolor sit
amet consectetur
adipiscing, elit
sed, do eiusmod
tempor incididunt
ut labore et
dolore; magna aliqua
Ut enim, ad minim, veniam
1 Lorem, ipsum
2 dolor sit
3 amet consectetur
4 adipiscing, elit
5 sed, do eiusmod
6 tempor incididunt
7 ut labore et
8 dolore; magna aliqua
9 Ut enim, ad minim, veniam

View File

@ -2,4 +2,4 @@ Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
Maple Leafs,TOR,13

1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13

View File

@ -1,6 +1,12 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
import io
from tempfile import SpooledTemporaryFile
import pytest
from pytest_mock import MockFixture
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
@ -11,11 +17,17 @@ from test_unstructured.partition.test_constants import (
EXPECTED_TEXT_WITH_EMOJI,
EXPECTED_TEXT_XLSX,
)
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.csv import get_delimiter, partition_csv
from unstructured.partition.csv import _CsvPartitioningContext, partition_csv
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
EXPECTED_FILETYPE = "text/csv"
@ -33,7 +45,7 @@ EXPECTED_FILETYPE = "text/csv"
),
],
)
def test_partition_csv_from_filename(filename, expected_text, expected_table):
def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str):
f_path = f"example-docs/{filename}"
elements = partition_csv(filename=f_path)
@ -43,14 +55,8 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
assert elements[0].metadata.filename == filename
@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_csv_from_filename_infer_table_structure(infer_table_structure: bool):
f_path = "example-docs/stanley-cups.csv"
elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure)
@ -61,10 +67,8 @@ def test_partition_csv_from_filename_infer_table_structure(infer_table_structure
assert table_element_has_text_as_html_field == infer_table_structure
def test_partition_csv_from_filename_with_metadata_filename(
filename="example-docs/stanley-cups.csv",
):
elements = partition_csv(filename=filename, metadata_filename="test")
def test_partition_csv_from_filename_with_metadata_filename():
elements = partition_csv(example_doc_path("stanley-cups.csv"), metadata_filename="test")
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"
@ -77,7 +81,7 @@ def test_partition_csv_from_filename_with_metadata_filename(
("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_csv_from_file(filename, expected_text, expected_table):
def test_partition_csv_from_file(filename: str, expected_text: str, expected_table: str):
f_path = f"example-docs/{filename}"
with open(f_path, "rb") as f:
elements = partition_csv(file=f)
@ -90,16 +94,16 @@ def test_partition_csv_from_file(filename, expected_text, expected_table):
assert {element.metadata.detection_origin for element in elements} == {"csv"}
def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"):
with open(filename, "rb") as f:
def test_partition_csv_from_file_with_metadata_filename():
with open(example_doc_path("stanley-cups.csv"), "rb") as f:
elements = partition_csv(file=f, metadata_filename="test")
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"
def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
elements = partition_csv(filename=filename, include_metadata=False)
def test_partition_csv_can_exclude_metadata():
elements = partition_csv(example_doc_path("stanley-cups.csv"), include_metadata=False)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert isinstance(elements[0], Table)
@ -108,23 +112,21 @@ def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.
assert elements[0].metadata.filename is None
def test_partition_csv_metadata_date(mocker, filename="example-docs/stanley-cups.csv"):
def test_partition_csv_metadata_date(mocker: MockFixture):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.csv.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_csv(filename=filename)
elements = partition_csv(example_doc_path("stanley-cups.csv"))
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert isinstance(elements[0], Table)
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_csv_custom_metadata_date(
mocker,
filename="example-docs/stanley-cups.csv",
):
def test_partition_csv_custom_metadata_date(mocker: MockFixture):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
@ -134,7 +136,7 @@ def test_partition_csv_custom_metadata_date(
)
elements = partition_csv(
filename=filename,
example_doc_path("stanley-cups.csv"),
metadata_last_modified=expected_last_modification_date,
include_header=False,
)
@ -144,10 +146,7 @@ def test_partition_csv_custom_metadata_date(
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_csv_from_file_metadata_date(
mocker,
filename="example-docs/stanley-cups.csv",
):
def test_partition_csv_from_file_metadata_date(mocker: MockFixture):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@ -155,7 +154,7 @@ def test_partition_csv_from_file_metadata_date(
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
with open(example_doc_path("stanley-cups.csv"), "rb") as f:
elements = partition_csv(file=f, include_header=False)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
@ -163,10 +162,7 @@ def test_partition_csv_from_file_metadata_date(
assert elements[0].metadata.last_modified is None
def test_partition_csv_from_file_explicit_get_metadata_date(
mocker,
filename="example-docs/stanley-cups.csv",
):
def test_partition_csv_from_file_explicit_get_metadata_date(mocker: MockFixture):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@ -174,7 +170,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date(
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
with open(example_doc_path("stanley-cups.csv"), "rb") as f:
elements = partition_csv(file=f, include_header=False, date_from_file_object=True)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
@ -182,10 +178,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date(
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_csv_from_file_custom_metadata_date(
mocker,
filename="example-docs/stanley-cups.csv",
):
def test_partition_csv_from_file_custom_metadata_date(mocker: MockFixture):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
@ -194,7 +187,7 @@ def test_partition_csv_from_file_custom_metadata_date(
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
with open(example_doc_path("stanley-cups.csv"), "rb") as f:
elements = partition_csv(
file=f,
metadata_last_modified=expected_last_modification_date,
@ -207,13 +200,10 @@ def test_partition_csv_from_file_custom_metadata_date(
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_csv_from_file_without_metadata(
mocker,
filename="example-docs/stanley-cups.csv",
):
def test_partition_csv_from_file_without_metadata(mocker: MockFixture):
"""Test partition_csv() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
with open(example_doc_path("stanley-cups.csv"), "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
@ -263,15 +253,158 @@ def test_partition_csv_respects_languages_arg():
def test_partition_csv_header():
filename = "example-docs/stanley-cups.csv"
elements = partition_csv(filename=filename, strategy="fast", include_header=True)
assert (
clean_extra_whitespace(elements[0].text)
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
elements = partition_csv(
example_doc_path("stanley-cups.csv"), strategy="fast", include_header=True
)
assert "<thead>" in elements[0].metadata.text_as_html
table = elements[0]
assert clean_extra_whitespace(table.text) == (
"Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert table.metadata.text_as_html is not None
assert "<thead>" in table.metadata.text_as_html
def test_partition_csv_detects_the_right_csv_delimiter():
# -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file --
assert get_delimiter("example-docs/csv-with-long-lines.csv") == ","
# ================================================================================================
# UNIT-TESTS
# ================================================================================================
class Describe_CsvPartitioningContext:
    """Unit-test suite for `unstructured.partition.csv._CsvPartitioningContext`."""

    # -- .load() ------------------------------------------------

    def it_provides_a_validating_alternate_constructor(self):
        ctx = _CsvPartitioningContext.load(
            file_path=example_doc_path("stanley-cups.csv"),
            file=None,
            metadata_file_path=None,
            metadata_last_modified=None,
            include_header=True,
            infer_table_structure=True,
            date_from_file_object=False,
        )
        assert isinstance(ctx, _CsvPartitioningContext)

    def and_the_validating_constructor_raises_on_an_invalid_context(self):
        # -- neither `file_path` nor `file` is provided, so validation must reject the context --
        with pytest.raises(ValueError, match="either file-path or file-like object must be prov"):
            _CsvPartitioningContext.load(
                file_path=None,
                file=None,
                metadata_file_path=None,
                metadata_last_modified=None,
                include_header=True,
                infer_table_structure=True,
                date_from_file_object=False,
            )

    # -- .delimiter ---------------------------------------------

    @pytest.mark.parametrize(
        "file_name",
        [
            "stanley-cups.csv",
            # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on
            # -- this file
            "csv-with-long-lines.csv",
        ],
    )
    def it_auto_detects_the_delimiter_for_a_comma_delimited_CSV_file(self, file_name: str):
        ctx = _CsvPartitioningContext(example_doc_path(file_name))
        assert ctx.delimiter == ","

    def and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file(self):
        ctx = _CsvPartitioningContext(example_doc_path("semicolon-delimited.csv"))
        assert ctx.delimiter == ";"

    def but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file(self):
        ctx = _CsvPartitioningContext(example_doc_path("single-column.csv"))
        assert ctx.delimiter is None

    # -- .header ------------------------------------------------

    @pytest.mark.parametrize(("include_header", "expected_value"), [(False, None), (True, 0)])
    def it_identifies_the_header_row_based_on_include_header_arg(
        self, include_header: bool, expected_value: int | None
    ):
        assert _CsvPartitioningContext(include_header=include_header).header == expected_value

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(self):
        ctx = _CsvPartitioningContext(metadata_last_modified="2024-08-04T13:12:35")
        assert ctx.last_modified == "2024-08-04T13:12:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, get_last_modified_date_: Mock
    ):
        get_last_modified_date_.return_value = "2024-08-04T02:23:53"
        ctx = _CsvPartitioningContext(file_path="a/b/document.csv")

        last_modified = ctx.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/document.csv")
        assert last_modified == "2024-08-04T02:23:53"

    def and_it_falls_back_to_last_modified_date_of_file_when_a_file_like_object_is_provided(
        self, get_last_modified_date_from_file_: Mock
    ):
        get_last_modified_date_from_file_.return_value = "2024-08-04T13:17:47"
        file = io.BytesIO(b"abcdefg")
        ctx = _CsvPartitioningContext(file=file, date_from_file_object=True)

        last_modified = ctx.last_modified

        get_last_modified_date_from_file_.assert_called_once_with(file)
        assert last_modified == "2024-08-04T13:17:47"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, get_last_modified_date_from_file_: Mock
    ):
        get_last_modified_date_from_file_.return_value = "2024-08-04T13:18:57"
        file = io.BytesIO(b"abcdefg")
        ctx = _CsvPartitioningContext(file=file, date_from_file_object=False)

        last_modified = ctx.last_modified

        # -- the file object must not even be consulted when the flag is off --
        get_last_modified_date_from_file_.assert_not_called()
        assert last_modified is None

    # -- .open() ------------------------------------------------

    def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object(self):
        with open(example_doc_path("stanley-cups.csv"), "rb") as f:
            # -- read so file cursor is at end of file --
            f.read()
            ctx = _CsvPartitioningContext(file=f)
            with ctx.open() as file:
                assert file is f
                # -- read cursor is reset to 0 on .open() context entry --
                assert f.tell() == 0
                assert file.read(14) == b"Stanley Cups,,"
                assert f.tell() == 14
            # -- and read cursor is reset to 0 on .open() context exit --
            assert f.tell() == 0

    def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path(self):
        ctx = _CsvPartitioningContext(example_doc_path("stanley-cups.csv"))
        with ctx.open() as file:
            assert file.read(14) == b"Stanley Cups,,"

    # -- ._validate() -------------------------------------------

    def it_raises_when_neither_file_path_nor_file_is_provided(self):
        with pytest.raises(ValueError, match="either file-path or file-like object must be prov"):
            _CsvPartitioningContext()._validate()

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
        # -- replaces the name as imported into the module under test --
        return function_mock(request, "unstructured.partition.csv.get_last_modified_date")

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest) -> Mock:
        # -- replaces the name as imported into the module under test --
        return function_mock(request, "unstructured.partition.csv.get_last_modified_date_from_file")

View File

@ -15,6 +15,8 @@ _VT_co = TypeVar("_VT_co", covariant=True)
_AttrName: TypeAlias = str
_AttrVal: TypeAlias = _TextArg
_ElemPathArg: TypeAlias = str | QName
_ElementOrTree: TypeAlias = _ET | _ElementTree[_ET]
@ -23,6 +25,9 @@ _TagName: TypeAlias = str
_TagSelector: TypeAlias = _TagName | Callable[..., _Element]
# String argument also support QName in various places
_TextArg: TypeAlias = str | bytes | QName
_XPathObject = Any
class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):

View File

@ -2,7 +2,7 @@
from __future__ import annotations
from ._element import _Element
from ._element import _Attrib, _Element
class ElementBase(_Element):
"""The public Element class
@ -49,6 +49,8 @@ class ElementBase(_Element):
**_extra: str,
) -> None: ...
def _init(self) -> None: ...
@property
def attrib(self) -> _Attrib: ...
class ElementClassLookup:
"""Superclass of Element class lookups"""

View File

@ -48,3 +48,24 @@ class _Element:
) -> _t._XPathObject: ...
class _ElementTree(Generic[_t._ET_co]): ...
# Behaves like MutableMapping but deviates a lot in details
class _Attrib:
def __bool__(self) -> bool: ...
def __contains__(self, __o: object) -> bool: ...
def __delitem__(self, __k: _t._AttrName) -> None: ...
def __getitem__(self, __k: _t._AttrName) -> str: ...
def __iter__(self) -> Iterator[str]: ...
def __len__(self) -> int: ...
def __setitem__(self, __k: _t._AttrName, __v: _t._AttrVal) -> None: ...
@property
def _element(self) -> _Element: ...
def clear(self) -> None: ...
def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
def has_key(self, key: _t._AttrName) -> bool: ...
def items(self) -> list[tuple[str, str]]: ...
def iteritems(self) -> Iterator[tuple[str, str]]: ...
def iterkeys(self) -> Iterator[str]: ...
def itervalues(self) -> Iterator[str]: ...
def keys(self) -> list[str]: ...
def values(self) -> list[str]: ...

View File

@ -0,0 +1,6 @@
from __future__ import annotations
from .. import etree
class HtmlElement(etree.ElementBase):
def text_content(self) -> str: ...

View File

@ -0,0 +1,9 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
from lxml.html._element import HtmlElement
def fromstring(
data: str,
) -> HtmlElement: ...

View File

@ -0,0 +1,8 @@
from __future__ import annotations
from pandas.core.api import (
DataFrame as DataFrame,
)
from pandas.io.api import (
read_csv as read_csv,
)

View File

@ -0,0 +1,3 @@
from __future__ import annotations
from pandas.core.frame import DataFrame as DataFrame

View File

@ -0,0 +1,9 @@
from __future__ import annotations
class DataFrame:
def to_html(
self,
index: bool = ...,
header: bool = ...,
na_rep: str = ...,
) -> str: ...

View File

@ -0,0 +1,5 @@
from __future__ import annotations
from pandas.io.parsers import (
read_csv as read_csv,
)

View File

@ -0,0 +1,5 @@
from __future__ import annotations
from pandas.io.parsers.readers import (
read_csv as read_csv,
)

View File

@ -0,0 +1,12 @@
from __future__ import annotations
from typing import IO, Literal
from pandas.core.frame import DataFrame
def read_csv(
filepath_or_buffer: str | IO[bytes],
*,
sep: str | None = ...,
header: int | None | Literal["infer"] = ...,
) -> DataFrame: ...

View File

@ -1 +1 @@
__version__ = "0.15.2-dev0" # pragma: no cover
__version__ = "0.15.2-dev1" # pragma: no cover

View File

@ -1,7 +1,8 @@
from __future__ import annotations
import contextlib
import csv
from typing import IO, Any, Optional, cast
from typing import IO, Any, Iterator
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
@ -15,13 +16,9 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty
DETECTION_ORIGIN: str = "csv"
@ -30,16 +27,15 @@ DETECTION_ORIGIN: str = "csv"
@add_metadata_with_filetype(FileType.CSV)
@add_chunking_strategy
def partition_csv(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
filename: str | None = None,
file: IO[bytes] | None = None,
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
include_header: bool = False,
include_metadata: bool = True,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
# NOTE (jennings) partition_csv generates a single TableElement
# so detect_language_per_element is not included as a param
languages: list[str] | None = ["auto"],
# NOTE (jennings) partition_csv generates a single TableElement so detect_language_per_element
# is not included as a param
date_from_file_object: bool = False,
**kwargs: Any,
) -> list[Element]:
@ -73,62 +69,156 @@ def partition_csv(
Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None.
"""
exactly_one(filename=filename, file=file)
header = 0 if include_header else None
if filename:
delimiter = get_delimiter(file_path=filename)
table = pd.read_csv(filename, header=header, sep=delimiter)
last_modification_date = get_last_modified_date(filename)
elif file:
last_modification_date = (
get_last_modified_date_from_file(file) if date_from_file_object else None
)
f = spooled_to_bytes_io_if_needed(file)
delimiter = get_delimiter(file=f)
table = pd.read_csv(f, header=header, sep=delimiter)
html_text = table.to_html(index=False, header=include_header, na_rep="")
text = cast(str, soupparser_fromstring(html_text).text_content())
if include_metadata:
metadata = ElementMetadata(
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
languages=languages,
)
if infer_table_structure:
metadata.text_as_html = html_text
else:
metadata = ElementMetadata()
elements = apply_lang_metadata(
[Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)],
languages=languages,
ctx = _CsvPartitioningContext(
file_path=filename,
file=file,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
include_header=include_header,
infer_table_structure=infer_table_structure,
date_from_file_object=date_from_file_object,
)
return list(elements)
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
metadata = ElementMetadata(
filename=metadata_filename or filename,
last_modified=ctx.last_modified,
languages=languages,
text_as_html=html_text if infer_table_structure else None,
)
# -- a CSV file becomes a single `Table` element --
elements = [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
return list(apply_lang_metadata(elements, languages=languages))
def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
"""Use the standard csv sniffer to determine the delimiter.
class _CsvPartitioningContext:
"""Encapsulates the partitioning-run details.
Reads just a small portion in case the file is large.
Provides access to argument values and especially encapsulates computation of values derived
from those values so they don't obscure the core partitioning logic.
"""
sniffer = csv.Sniffer()
num_bytes = 65536
# -- read whole lines, sniffer can be confused by a trailing partial line --
if file:
lines = file.readlines(num_bytes)
file.seek(0)
data = "\n".join(ln.decode("utf-8") for ln in lines)
elif file_path is not None:
with open(file_path) as f:
data = "\n".join(f.readlines(num_bytes))
else:
raise ValueError("either `file_path` or `file` argument must be provided")
def __init__(
    self,
    file_path: str | None = None,
    file: IO[bytes] | None = None,
    metadata_file_path: str | None = None,
    metadata_last_modified: str | None = None,
    include_header: bool = False,
    infer_table_structure: bool = True,
    date_from_file_object: bool = False,
):
    """Store the partitioning arguments as given; nothing is validated or opened here.

    Use the `.load()` classmethod instead when the arguments should be validated on
    construction.
    """
    # -- document source: a path on disk and/or an already-open binary file-like object --
    self._file_path = file_path
    self._file = file
    # -- NOTE(review): not read by any method of this class visible here - confirm usage --
    self._metadata_file_path = metadata_file_path
    # -- caller-specified last-modified value, takes precedence in `.last_modified` --
    self._metadata_last_modified = metadata_last_modified
    # -- determines `.header`: row 0 when True, None otherwise --
    self._include_header = include_header
    # -- NOTE(review): not read by any method of this class visible here - confirm usage --
    self._infer_table_structure = infer_table_structure
    # -- whether `.last_modified` may be computed from the file-like object --
    self._date_from_file_object = date_from_file_object
return sniffer.sniff(data, delimiters=",;").delimiter
@classmethod
def load(
    cls,
    file_path: str | None,
    file: IO[bytes] | None,
    metadata_file_path: str | None,
    metadata_last_modified: str | None,
    include_header: bool,
    infer_table_structure: bool,
    date_from_file_object: bool = False,
) -> _CsvPartitioningContext:
    """Construct a context from the partitioning arguments and validate it before returning.

    Raises `ValueError` (from `._validate()`) when neither `file_path` nor `file` is provided.
    """
    ctx = cls(
        file_path=file_path,
        file=file,
        metadata_file_path=metadata_file_path,
        metadata_last_modified=metadata_last_modified,
        include_header=include_header,
        infer_table_structure=infer_table_structure,
        date_from_file_object=date_from_file_object,
    )
    # -- `._validate()` returns the instance itself, so its result is the loaded context --
    return ctx._validate()
@lazyproperty
def delimiter(self) -> str | None:
    """The delimiter character detected for this CSV file, nominally a comma ",".

    Only "," and ";" are considered as candidates. `None` when detection fails, as it does for a
    single-column CSV file, which naturally contains no delimiter.
    """
    # -- only the leading portion of the file is sampled, the file may be large --
    size_hint = 65536

    with self.open() as file:
        # -- `.readlines()` yields whole lines only; the sniffer can be confused by a trailing
        # -- partial line
        decoded_lines = [line.decode("utf-8") for line in file.readlines(size_hint)]

    sample = "\n".join(decoded_lines)

    try:
        dialect = csv.Sniffer().sniff(sample, delimiters=",;")
    except csv.Error:
        # -- sniffing fails on a single-column CSV because no default can be assumed --
        return None

    return dialect.delimiter
@lazyproperty
def header(self) -> int | None:
    """Index of the header row for Pandas: 0 when a header is included, otherwise `None`."""
    if self._include_header:
        return 0
    return None
@lazyproperty
def last_modified(self) -> str | None:
    """The best last-modified date available, `None` when no source can provide one.

    Sources are consulted in order: the caller-specified value, then the file at the
    file-path, then the file-like object (only when `date_from_file_object` was set).
    """
    # -- A value given explicitly by the caller wins. This is used for example when this file
    # -- was converted from another format.
    explicit = self._metadata_last_modified
    if explicit:
        return explicit

    path = self._file_path
    if path:
        # -- a temp-file path yields no date; presumably its timestamp says nothing about the
        # -- original document (TODO confirm)
        if is_temp_file_path(path):
            return None
        return get_last_modified_date(path)

    # -- a file-like object is consulted only when the caller opted in --
    if self._file and self._date_from_file_object:
        return get_last_modified_date_from_file(self._file)

    return None
@contextlib.contextmanager
def open(self) -> Iterator[IO[bytes]]:
    """Encapsulates complexity of dealing with file-path or file-like-object.

    Provides an `IO[bytes]` object as the "common-denominator" document source.

    Must be used as a context manager using a `with` statement:

        with ctx.open() as file:
            do things with file

    File is guaranteed to be at read position 0 when the `with` body is entered. A file opened
    from a file-path is closed on exit. A caller-provided file-like object is left open and is
    rewound to position 0 on exit, including when the `with` body raises.

    Raises `ValueError` on entry when neither a file-path nor a file-like object is available.
    """
    if self._file_path:
        with open(self._file_path, "rb") as f:
            yield f
        return

    file = self._file
    if file is None:
        # -- An explicit raise rather than `assert`: the check must survive `python -O` and must
        # -- not rely on `._validate()` having been called by whoever built this context.
        raise ValueError("either file-path or file-like object must be provided")

    # -- Be polite on principle. Reset file-pointer both before and after use --
    file.seek(0)
    try:
        yield file
    finally:
        # -- `finally` so the caller's file is rewound even when the `with` body raises --
        file.seek(0)
def _validate(self) -> _CsvPartitioningContext:
    """Check that a document source was given and return this instance when it was.

    Raises `ValueError` when both the file-path and the file-like object are `None`.
    """
    has_source = self._file_path is not None or self._file is not None
    if not has_source:
        raise ValueError("either file-path or file-like object must be provided")
    return self