Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)
rfctr(csv): accommodate single column CSV files (#3483)
**Summary** Improve factoring, type-annotation, and tests for `partition_csv()` and accommodate single-column CSV files. Fixes: #2616
parent 59ec64235b
commit a468b2de3b
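The crux of the fix appears in the new `_CsvPartitioningContext.delimiter` property in the diff below: `csv.Sniffer().sniff()` raises `csv.Error` when it cannot determine a delimiter, which is the expected outcome for a single-column file, so the sniff is wrapped in a `try` and `None` is returned, letting `pd.read_csv()` fall back to its own handling via `sep=None`. A minimal standalone sketch of that pattern, using only the standard library; the `detect_delimiter` helper name is illustrative, not part of the library API:

```python
from __future__ import annotations

import csv


def detect_delimiter(data: str) -> str | None:
    """Return the sniffed delimiter, or None when `data` has a single column."""
    try:
        return csv.Sniffer().sniff(data, delimiters=",;").delimiter
    except csv.Error:
        # -- sniffing raises "Could not determine delimiter" on a single-column
        # -- CSV; return None so the caller can fall back to pandas' detection --
        return None


assert detect_delimiter("a;b;c\nd;e;f\n") == ";"
assert detect_delimiter("a,b,c\nd,e,f\n") == ","
assert detect_delimiter("alpha\nbeta\ngamma\n") is None
```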
CHANGELOG.md
@@ -1,13 +1,13 @@
-## 0.15.2-dev0
+## 0.15.2-dev1
 
 ### Enhancements
 
 ### Features
 
 ### Fixes
 
 * **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
+* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
 
 ## 0.15.1
example-docs/semicolon-delimited.csv (new file, 5 lines)
@@ -0,0 +1,5 @@
+Lorem, ipsum; dolor sit; amet
+consectetur; adipiscing; elit
+sed, do; eiusmod; tempor incididunt
+ut labore; et, dolore; magna aliqua
+Ut enim; ad minim; veniam, quis
example-docs/single-column.csv (new file, 9 lines)
@@ -0,0 +1,9 @@
+Lorem, ipsum
+dolor sit
+amet consectetur
+adipiscing, elit
+sed, do eiusmod
+tempor incididunt
+ut labore et
+dolore; magna aliqua
+Ut enim, ad minim, veniam
example-docs/stanley-cups.csv
@@ -2,4 +2,4 @@ Stanley Cups,,
 Team,Location,Stanley Cups
 Blues,STL,1
 Flyers,PHI,2
-Maple Leafs,TOR,13
+Maple Leafs,TOR,13
test_unstructured/partition/test_csv.py
@@ -1,6 +1,12 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+import io
 from tempfile import SpooledTemporaryFile
 
 import pytest
+from pytest_mock import MockFixture
 
 from test_unstructured.partition.test_constants import (
     EXPECTED_TABLE,
@@ -11,11 +17,17 @@ from test_unstructured.partition.test_constants import (
     EXPECTED_TEXT_WITH_EMOJI,
     EXPECTED_TEXT_XLSX,
 )
-from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from test_unstructured.unit_utils import (
+    FixtureRequest,
+    Mock,
+    assert_round_trips_through_JSON,
+    example_doc_path,
+    function_mock,
+)
 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
-from unstructured.partition.csv import get_delimiter, partition_csv
+from unstructured.partition.csv import _CsvPartitioningContext, partition_csv
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 
 EXPECTED_FILETYPE = "text/csv"
@@ -33,7 +45,7 @@ EXPECTED_FILETYPE = "text/csv"
         ),
     ],
 )
-def test_partition_csv_from_filename(filename, expected_text, expected_table):
+def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str):
     f_path = f"example-docs/{filename}"
     elements = partition_csv(filename=f_path)
 
@@ -43,14 +55,8 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
     assert elements[0].metadata.filename == filename
 
 
-@pytest.mark.parametrize(
-    "infer_table_structure",
-    [
-        True,
-        False,
-    ],
-)
-def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
+@pytest.mark.parametrize("infer_table_structure", [True, False])
+def test_partition_csv_from_filename_infer_table_structure(infer_table_structure: bool):
     f_path = "example-docs/stanley-cups.csv"
     elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure)
 
@@ -61,10 +67,8 @@ def test_partition_csv_from_filename_infer_table_structure(infer_table_structure
     assert table_element_has_text_as_html_field == infer_table_structure
 
 
-def test_partition_csv_from_filename_with_metadata_filename(
-    filename="example-docs/stanley-cups.csv",
-):
-    elements = partition_csv(filename=filename, metadata_filename="test")
+def test_partition_csv_from_filename_with_metadata_filename():
+    elements = partition_csv(example_doc_path("stanley-cups.csv"), metadata_filename="test")
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert elements[0].metadata.filename == "test"
@@ -77,7 +81,7 @@ def test_partition_csv_from_filename_with_metadata_filename(
         ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
     ],
 )
-def test_partition_csv_from_file(filename, expected_text, expected_table):
+def test_partition_csv_from_file(filename: str, expected_text: str, expected_table: str):
     f_path = f"example-docs/{filename}"
     with open(f_path, "rb") as f:
         elements = partition_csv(file=f)
@@ -90,16 +94,16 @@ def test_partition_csv_from_file(filename, expected_text, expected_table):
         assert {element.metadata.detection_origin for element in elements} == {"csv"}
 
 
-def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"):
-    with open(filename, "rb") as f:
+def test_partition_csv_from_file_with_metadata_filename():
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(file=f, metadata_filename="test")
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert elements[0].metadata.filename == "test"
 
 
-def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
-    elements = partition_csv(filename=filename, include_metadata=False)
+def test_partition_csv_can_exclude_metadata():
+    elements = partition_csv(example_doc_path("stanley-cups.csv"), include_metadata=False)
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert isinstance(elements[0], Table)
@@ -108,23 +112,21 @@ def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.
     assert elements[0].metadata.filename is None
 
 
-def test_partition_csv_metadata_date(mocker, filename="example-docs/stanley-cups.csv"):
+def test_partition_csv_metadata_date(mocker: MockFixture):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.csv.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
-    elements = partition_csv(filename=filename)
+
+    elements = partition_csv(example_doc_path("stanley-cups.csv"))
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert isinstance(elements[0], Table)
     assert elements[0].metadata.last_modified == mocked_last_modification_date
 
 
-def test_partition_csv_custom_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_custom_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     expected_last_modification_date = "2020-07-05T09:24:28"
 
@@ -134,7 +136,7 @@ def test_partition_csv_custom_metadata_date(
     )
 
     elements = partition_csv(
-        filename=filename,
+        example_doc_path("stanley-cups.csv"),
         metadata_last_modified=expected_last_modification_date,
         include_header=False,
     )
@@ -144,10 +146,7 @@ def test_partition_csv_custom_metadata_date(
     assert elements[0].metadata.last_modified == expected_last_modification_date
 
 
-def test_partition_csv_from_file_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
 
     mocker.patch(
@@ -155,7 +154,7 @@ def test_partition_csv_from_file_metadata_date(
         return_value=mocked_last_modification_date,
     )
 
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(file=f, include_header=False)
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
@@ -163,10 +162,7 @@ def test_partition_csv_from_file_metadata_date(
     assert elements[0].metadata.last_modified is None
 
 
-def test_partition_csv_from_file_explicit_get_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_explicit_get_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
 
     mocker.patch(
@@ -174,7 +170,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date(
         return_value=mocked_last_modification_date,
    )
 
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(file=f, include_header=False, date_from_file_object=True)
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
@@ -182,10 +178,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date(
     assert elements[0].metadata.last_modified == mocked_last_modification_date
 
 
-def test_partition_csv_from_file_custom_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_custom_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     expected_last_modification_date = "2020-07-05T09:24:28"
 
@@ -194,7 +187,7 @@ def test_partition_csv_from_file_custom_metadata_date(
         return_value=mocked_last_modification_date,
     )
 
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(
             file=f,
             metadata_last_modified=expected_last_modification_date,
@@ -207,13 +200,10 @@ def test_partition_csv_from_file_custom_metadata_date(
     assert elements[0].metadata.last_modified == expected_last_modification_date
 
 
-def test_partition_csv_from_file_without_metadata(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_without_metadata(mocker: MockFixture):
     """Test partition_csv() with a file for which no last-modified date can be determined."""
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         sf = SpooledTemporaryFile()
         sf.write(f.read())
         sf.seek(0)
@@ -263,15 +253,158 @@ def test_partition_csv_respects_languages_arg():
 
 
 def test_partition_csv_header():
-    filename = "example-docs/stanley-cups.csv"
-    elements = partition_csv(filename=filename, strategy="fast", include_header=True)
-    assert (
-        clean_extra_whitespace(elements[0].text)
-        == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    elements = partition_csv(
+        example_doc_path("stanley-cups.csv"), strategy="fast", include_header=True
     )
-    assert "<thead>" in elements[0].metadata.text_as_html
+
+    table = elements[0]
+    assert clean_extra_whitespace(table.text) == (
+        "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    )
+    assert table.metadata.text_as_html is not None
+    assert "<thead>" in table.metadata.text_as_html
 
 
-def test_partition_csv_detects_the_right_csv_delimiter():
-    # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file --
-    assert get_delimiter("example-docs/csv-with-long-lines.csv") == ","
+# ================================================================================================
+# UNIT-TESTS
+# ================================================================================================
+
+
+class Describe_CsvPartitioningContext:
+    """Unit-test suite for `unstructured.partition.csv._CsvPartitioningContext`."""
+
+    # -- .load() ------------------------------------------------
+
+    def it_provides_a_validating_alternate_constructor(self):
+        ctx = _CsvPartitioningContext.load(
+            file_path=example_doc_path("stanley-cups.csv"),
+            file=None,
+            metadata_file_path=None,
+            metadata_last_modified=None,
+            include_header=True,
+            infer_table_structure=True,
+            date_from_file_object=False,
+        )
+        assert isinstance(ctx, _CsvPartitioningContext)
+
+    def and_the_validating_constructor_raises_on_an_invalid_context(self):
+        with pytest.raises(ValueError, match="either file-path or file-like object must be prov"):
+            _CsvPartitioningContext.load(
+                file_path=None,
+                file=None,
+                metadata_file_path=None,
+                metadata_last_modified=None,
+                include_header=True,
+                infer_table_structure=True,
+                date_from_file_object=False,
+            )
+
+    # -- .delimiter ---------------------------------------------
+
+    @pytest.mark.parametrize(
+        "file_name",
+        [
+            "stanley-cups.csv",
+            # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on
+            # -- this file
+            "csv-with-long-lines.csv",
+        ],
+    )
+    def it_auto_detects_the_delimiter_for_a_comma_delimited_CSV_file(self, file_name: str):
+        ctx = _CsvPartitioningContext(example_doc_path(file_name))
+        assert ctx.delimiter == ","
+
+    def and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file(self):
+        ctx = _CsvPartitioningContext(example_doc_path("semicolon-delimited.csv"))
+        assert ctx.delimiter == ";"
+
+    def but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file(self):
+        ctx = _CsvPartitioningContext(example_doc_path("single-column.csv"))
+        assert ctx.delimiter is None
+
+    # -- .header ------------------------------------------------
+
+    @pytest.mark.parametrize(("include_header", "expected_value"), [(False, None), (True, 0)])
+    def it_identifies_the_header_row_based_on_include_header_arg(
+        self, include_header: bool, expected_value: int | None
+    ):
+        assert _CsvPartitioningContext(include_header=include_header).header == expected_value
+
+    # -- .last_modified --------------------------
+
+    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(self):
+        ctx = _CsvPartitioningContext(metadata_last_modified="2024-08-04T13:12:35")
+        assert ctx.last_modified == "2024-08-04T13:12:35"
+
+    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
+        self, get_last_modified_date_: Mock
+    ):
+        get_last_modified_date_.return_value = "2024-08-04T02:23:53"
+        ctx = _CsvPartitioningContext(file_path="a/b/document.csv")
+
+        last_modified = ctx.last_modified
+
+        get_last_modified_date_.assert_called_once_with("a/b/document.csv")
+        assert last_modified == "2024-08-04T02:23:53"
+
+    def and_it_falls_back_to_last_modified_date_of_file_when_a_file_like_object_is_provided(
+        self, get_last_modified_date_from_file_: Mock
+    ):
+        get_last_modified_date_from_file_.return_value = "2024-08-04T13:17:47"
+        file = io.BytesIO(b"abcdefg")
+        ctx = _CsvPartitioningContext(file=file, date_from_file_object=True)
+
+        last_modified = ctx.last_modified
+
+        get_last_modified_date_from_file_.assert_called_once_with(file)
+        assert last_modified == "2024-08-04T13:17:47"
+
+    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
+        self, get_last_modified_date_from_file_: Mock
+    ):
+        get_last_modified_date_from_file_.return_value = "2024-08-04T13:18:57"
+        file = io.BytesIO(b"abcdefg")
+        ctx = _CsvPartitioningContext(file=file, date_from_file_object=False)
+
+        last_modified = ctx.last_modified
+
+        get_last_modified_date_from_file_.assert_not_called()
+        assert last_modified is None
+
+    # -- .open() ------------------------------------------------
+
+    def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object(self):
+        with open(example_doc_path("stanley-cups.csv"), "rb") as f:
+            # -- read so file cursor is at end of file --
+            f.read()
+            ctx = _CsvPartitioningContext(file=f)
+            with ctx.open() as file:
+                assert file is f
+                # -- read cursor is reset to 0 on .open() context entry --
+                assert f.tell() == 0
+                assert file.read(14) == b"Stanley Cups,,"
+                assert f.tell() == 14
+
+            # -- and read cursor is reset to 0 on .open() context exit --
+            assert f.tell() == 0
+
+    def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path(self):
+        ctx = _CsvPartitioningContext(example_doc_path("stanley-cups.csv"))
+        with ctx.open() as file:
+            assert file.read(14) == b"Stanley Cups,,"
+
+    # -- .validate() --------------------------------------------
+
+    def it_raises_when_neither_file_path_nor_file_is_provided(self):
+        with pytest.raises(ValueError, match="either file-path or file-like object must be prov"):
+            _CsvPartitioningContext()._validate()
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture()
+    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
+        return function_mock(request, "unstructured.partition.csv.get_last_modified_date")
+
+    @pytest.fixture()
+    def get_last_modified_date_from_file_(self, request: FixtureRequest):
+        return function_mock(request, "unstructured.partition.csv.get_last_modified_date_from_file")
@@ -15,6 +15,8 @@ _VT_co = TypeVar("_VT_co", covariant=True)
 
 _AttrName: TypeAlias = str
 
+_AttrVal: TypeAlias = _TextArg
+
 _ElemPathArg: TypeAlias = str | QName
 
 _ElementOrTree: TypeAlias = _ET | _ElementTree[_ET]
@@ -23,6 +25,9 @@ _TagName: TypeAlias = str
 
 _TagSelector: TypeAlias = _TagName | Callable[..., _Element]
 
+# String arguments also support QName in various places
+_TextArg: TypeAlias = str | bytes | QName
+
 _XPathObject = Any
 
 class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from ._element import _Element
+from ._element import _Attrib, _Element
 
 class ElementBase(_Element):
     """The public Element class
@@ -49,6 +49,8 @@ class ElementBase(_Element):
         **_extra: str,
     ) -> None: ...
     def _init(self) -> None: ...
+    @property
+    def attrib(self) -> _Attrib: ...
 
 class ElementClassLookup:
     """Superclass of Element class lookups"""
@@ -48,3 +48,24 @@ class _Element:
     ) -> _t._XPathObject: ...
 
 class _ElementTree(Generic[_t._ET_co]): ...
+
+# Behaves like MutableMapping but deviates a lot in details
+class _Attrib:
+    def __bool__(self) -> bool: ...
+    def __contains__(self, __o: object) -> bool: ...
+    def __delitem__(self, __k: _t._AttrName) -> None: ...
+    def __getitem__(self, __k: _t._AttrName) -> str: ...
+    def __iter__(self) -> Iterator[str]: ...
+    def __len__(self) -> int: ...
+    def __setitem__(self, __k: _t._AttrName, __v: _t._AttrVal) -> None: ...
+    @property
+    def _element(self) -> _Element: ...
+    def clear(self) -> None: ...
+    def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
+    def has_key(self, key: _t._AttrName) -> bool: ...
+    def items(self) -> list[tuple[str, str]]: ...
+    def iteritems(self) -> Iterator[tuple[str, str]]: ...
+    def iterkeys(self) -> Iterator[str]: ...
+    def itervalues(self) -> Iterator[str]: ...
+    def keys(self) -> list[str]: ...
+    def values(self) -> list[str]: ...
typings/lxml/html/_element.pyi (new file, 6 lines)
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .. import etree
+
+class HtmlElement(etree.ElementBase):
+    def text_content(self) -> str: ...

typings/lxml/html/soupparser.pyi (new file, 9 lines)
@@ -0,0 +1,9 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+from lxml.html._element import HtmlElement
+
+def fromstring(
+    data: str,
+) -> HtmlElement: ...
typings/pandas/__init__.pyi (new file, 8 lines)
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from pandas.core.api import (
+    DataFrame as DataFrame,
+)
+from pandas.io.api import (
+    read_csv as read_csv,
+)
typings/pandas/core/api.pyi (new file, 3 lines)
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from pandas.core.frame import DataFrame as DataFrame

typings/pandas/core/frame.pyi (new file, 9 lines)
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+class DataFrame:
+    def to_html(
+        self,
+        index: bool = ...,
+        header: bool = ...,
+        na_rep: str = ...,
+    ) -> str: ...
typings/pandas/io/api.pyi (new file, 5 lines)
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from pandas.io.parsers import (
+    read_csv as read_csv,
+)

typings/pandas/io/parsers/__init__.pyi (new file, 5 lines)
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from pandas.io.parsers.readers import (
+    read_csv as read_csv,
+)
typings/pandas/io/parsers/readers.pyi (new file, 12 lines)
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from typing import IO, Literal
+
+from pandas.core.frame import DataFrame
+
+def read_csv(
+    filepath_or_buffer: str | IO[bytes],
+    *,
+    sep: str | None = ...,
+    header: int | None | Literal["infer"] = ...,
+) -> DataFrame: ...
unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.2-dev0"  # pragma: no cover
+__version__ = "0.15.2-dev1"  # pragma: no cover
unstructured/partition/csv.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
+import contextlib
 import csv
-from typing import IO, Any, Optional, cast
+from typing import IO, Any, Iterator
 
 import pandas as pd
 from lxml.html.soupparser import fromstring as soupparser_fromstring
@@ -15,13 +16,9 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
-    exactly_one,
-    get_last_modified_date,
-    get_last_modified_date_from_file,
-    spooled_to_bytes_io_if_needed,
-)
+from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
 from unstructured.partition.lang import apply_lang_metadata
+from unstructured.utils import is_temp_file_path, lazyproperty
 
 DETECTION_ORIGIN: str = "csv"
@@ -30,16 +27,15 @@ DETECTION_ORIGIN: str = "csv"
 @add_metadata_with_filetype(FileType.CSV)
 @add_chunking_strategy
 def partition_csv(
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    metadata_filename: Optional[str] = None,
-    metadata_last_modified: Optional[str] = None,
+    filename: str | None = None,
+    file: IO[bytes] | None = None,
+    metadata_filename: str | None = None,
+    metadata_last_modified: str | None = None,
     include_header: bool = False,
     include_metadata: bool = True,
     infer_table_structure: bool = True,
-    languages: Optional[list[str]] = ["auto"],
-    # NOTE (jennings) partition_csv generates a single TableElement
-    # so detect_language_per_element is not included as a param
+    languages: list[str] | None = ["auto"],
+    # NOTE (jennings) partition_csv generates a single TableElement so detect_language_per_element
+    # is not included as a param
     date_from_file_object: bool = False,
     **kwargs: Any,
 ) -> list[Element]:
@@ -73,62 +69,156 @@ def partition_csv(
         Applies only when providing file via `file` parameter. If this option is True, attempt
         to infer last_modified metadata from bytes, otherwise set it to None.
     """
-    exactly_one(filename=filename, file=file)
-
-    header = 0 if include_header else None
-
-    if filename:
-        delimiter = get_delimiter(file_path=filename)
-        table = pd.read_csv(filename, header=header, sep=delimiter)
-        last_modification_date = get_last_modified_date(filename)
-
-    elif file:
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
-        )
-        f = spooled_to_bytes_io_if_needed(file)
-        delimiter = get_delimiter(file=f)
-        table = pd.read_csv(f, header=header, sep=delimiter)
-
-    html_text = table.to_html(index=False, header=include_header, na_rep="")
-    text = cast(str, soupparser_fromstring(html_text).text_content())
-
-    if include_metadata:
-        metadata = ElementMetadata(
-            filename=metadata_filename or filename,
-            last_modified=metadata_last_modified or last_modification_date,
-            languages=languages,
-        )
-        if infer_table_structure:
-            metadata.text_as_html = html_text
-    else:
-        metadata = ElementMetadata()
-
-    elements = apply_lang_metadata(
-        [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)],
-        languages=languages,
+    ctx = _CsvPartitioningContext(
+        file_path=filename,
+        file=file,
+        metadata_file_path=metadata_filename,
+        metadata_last_modified=metadata_last_modified,
+        include_header=include_header,
+        infer_table_structure=infer_table_structure,
+        date_from_file_object=date_from_file_object,
     )
 
-    return list(elements)
+    with ctx.open() as file:
+        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
+
+    html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
+    text = soupparser_fromstring(html_text).text_content()
+
+    metadata = ElementMetadata(
+        filename=metadata_filename or filename,
+        last_modified=ctx.last_modified,
+        languages=languages,
+        text_as_html=html_text if infer_table_structure else None,
+    )
+
+    # -- a CSV file becomes a single `Table` element --
+    elements = [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
+
+    return list(apply_lang_metadata(elements, languages=languages))
 
 
-def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
-    """Use the standard csv sniffer to determine the delimiter.
-
-    Reads just a small portion in case the file is large.
-    """
-    sniffer = csv.Sniffer()
-    num_bytes = 65536
-
-    # -- read whole lines, sniffer can be confused by a trailing partial line --
-    if file:
-        lines = file.readlines(num_bytes)
-        file.seek(0)
-        data = "\n".join(ln.decode("utf-8") for ln in lines)
-    elif file_path is not None:
-        with open(file_path) as f:
-            data = "\n".join(f.readlines(num_bytes))
-    else:
-        raise ValueError("either `file_path` or `file` argument must be provided")
-
-    return sniffer.sniff(data, delimiters=",;").delimiter
+class _CsvPartitioningContext:
+    """Encapsulates the partitioning-run details.
+
+    Provides access to argument values and especially encapsulates computation of values derived
+    from those values so they don't obscure the core partitioning logic.
+    """
+
+    def __init__(
+        self,
+        file_path: str | None = None,
+        file: IO[bytes] | None = None,
+        metadata_file_path: str | None = None,
+        metadata_last_modified: str | None = None,
+        include_header: bool = False,
+        infer_table_structure: bool = True,
+        date_from_file_object: bool = False,
+    ):
+        self._file_path = file_path
+        self._file = file
+        self._metadata_file_path = metadata_file_path
+        self._metadata_last_modified = metadata_last_modified
+        self._include_header = include_header
+        self._infer_table_structure = infer_table_structure
+        self._date_from_file_object = date_from_file_object
+
+    @classmethod
+    def load(
+        cls,
+        file_path: str | None,
+        file: IO[bytes] | None,
+        metadata_file_path: str | None,
+        metadata_last_modified: str | None,
+        include_header: bool,
+        infer_table_structure: bool,
+        date_from_file_object: bool = False,
+    ) -> _CsvPartitioningContext:
+        return cls(
+            file_path=file_path,
+            file=file,
+            metadata_file_path=metadata_file_path,
+            metadata_last_modified=metadata_last_modified,
+            include_header=include_header,
+            infer_table_structure=infer_table_structure,
+            date_from_file_object=date_from_file_object,
+        )._validate()
+
+    @lazyproperty
+    def delimiter(self) -> str | None:
+        """The CSV delimiter, nominally a comma ",".
+
+        `None` for a single-column CSV file which naturally has no delimiter.
+        """
+        sniffer = csv.Sniffer()
+        num_bytes = 65536
+
+        with self.open() as file:
+            # -- read whole lines, sniffer can be confused by a trailing partial line --
+            data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
+
+        try:
+            return sniffer.sniff(data, delimiters=",;").delimiter
+        except csv.Error:
+            # -- sniffing will fail on single-column csv as no default can be assumed --
+            return None
+
+    @lazyproperty
+    def header(self) -> int | None:
+        """Identifies the header row, if any, to Pandas, by idx."""
+        return 0 if self._include_header else None
+
+    @lazyproperty
+    def last_modified(self) -> str | None:
+        """The best last-modified date available, None if no sources are available."""
+        # -- Value explicitly specified by caller takes precedence. This is used for example when
+        # -- this file was converted from another format.
+        if self._metadata_last_modified:
+            return self._metadata_last_modified
+
+        if self._file_path:
+            return (
+                None
+                if is_temp_file_path(self._file_path)
+                else get_last_modified_date(self._file_path)
+            )
+
+        if self._file:
+            return (
+                get_last_modified_date_from_file(self._file)
+                if self._date_from_file_object
+                else None
+            )
+
+        return None
+
+    @contextlib.contextmanager
+    def open(self) -> Iterator[IO[bytes]]:
+        """Encapsulates complexity of dealing with file-path or file-like-object.
+
+        Provides an `IO[bytes]` object as the "common-denominator" document source.
+
+        Must be used as a context manager using a `with` statement:
+
+            with self.open() as file:
+                do things with file
+
+        File is guaranteed to be at read position 0 when called.
+        """
+        if self._file_path:
+            with open(self._file_path, "rb") as f:
+                yield f
+        else:
+            file = self._file
+            assert file is not None  # -- guaranteed by `._validate()` --
+            # -- Be polite on principle. Reset file-pointer both before and after use --
+            file.seek(0)
+            yield file
+            file.seek(0)
+
    def _validate(self) -> _CsvPartitioningContext:
        """Raise on invalid argument values."""
        if self._file_path is None and self._file is None:
            raise ValueError("either file-path or file-like object must be provided")
        return self
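For reference, a short usage sketch of the behavior this commit enables, assuming the `example-docs/single-column.csv` file added above is on disk; the single-`Table` result is confirmed by the diff, while the `assert` and `print` lines are illustrative:

```python
from unstructured.partition.csv import partition_csv

# Before this commit, delimiter detection raised `_csv.Error: Could not
# determine delimiter` for a file with no delimiters (Issue #2616).
elements = partition_csv("example-docs/single-column.csv")

# partition_csv() emits exactly one Table element per CSV file.
assert len(elements) == 1
print(elements[0].text)
```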