From a468b2de3bd51b322ec3b3e333b121aaed8d26d3 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 5 Aug 2024 17:48:37 -0700 Subject: [PATCH] rfctr(csv): accommodate single column CSV files (#3483) **Summary** Improve factoring, type-annotation, and tests for `partition_csv()` and accommodate single-column CSV files. Fixes: #2616 --- CHANGELOG.md | 4 +- example-docs/semicolon-delimited.csv | 5 + example-docs/single-column.csv | 9 + example-docs/stanley-cups.csv | 2 +- test_unstructured/partition/test_csv.py | 245 ++++++++++++++++++------ typings/lxml/_types.pyi | 5 + typings/lxml/etree/_classlookup.pyi | 4 +- typings/lxml/etree/_element.pyi | 21 ++ typings/lxml/html/_element.pyi | 6 + typings/lxml/html/soupparser.pyi | 9 + typings/pandas/__init__.pyi | 8 + typings/pandas/core/api.pyi | 3 + typings/pandas/core/frame.pyi | 9 + typings/pandas/io/api.pyi | 5 + typings/pandas/io/parsers/__init__.pyi | 5 + typings/pandas/io/parsers/readers.pyi | 12 ++ unstructured/__version__.py | 2 +- unstructured/partition/csv.py | 220 ++++++++++++++------- 18 files changed, 448 insertions(+), 126 deletions(-) create mode 100644 example-docs/semicolon-delimited.csv create mode 100644 example-docs/single-column.csv create mode 100644 typings/lxml/html/_element.pyi create mode 100644 typings/lxml/html/soupparser.pyi create mode 100644 typings/pandas/__init__.pyi create mode 100644 typings/pandas/core/api.pyi create mode 100644 typings/pandas/core/frame.pyi create mode 100644 typings/pandas/io/api.pyi create mode 100644 typings/pandas/io/parsers/__init__.pyi create mode 100644 typings/pandas/io/parsers/readers.pyi diff --git a/CHANGELOG.md b/CHANGELOG.md index 31cf4c457..c08843cb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,13 @@ -## 0.15.2-dev0 +## 0.15.2-dev1 ### Enhancements ### Features - ### Fixes * **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions. 
+* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters). ## 0.15.1 diff --git a/example-docs/semicolon-delimited.csv b/example-docs/semicolon-delimited.csv new file mode 100644 index 000000000..5b3d9cf16 --- /dev/null +++ b/example-docs/semicolon-delimited.csv @@ -0,0 +1,5 @@ +Lorem, ipsum; dolor sit; amet +consectetur; adipiscing; elit +sed, do; eiusmod; tempor incididunt +ut labore; et, dolore; magna aliqua +Ut enim; ad minim; veniam, quis diff --git a/example-docs/single-column.csv b/example-docs/single-column.csv new file mode 100644 index 000000000..ba928a49e --- /dev/null +++ b/example-docs/single-column.csv @@ -0,0 +1,9 @@ +Lorem, ipsum +dolor sit +amet consectetur +adipiscing, elit +sed, do eiusmod +tempor incididunt +ut labore et +dolore; magna aliqua +Ut enim, ad minim, veniam diff --git a/example-docs/stanley-cups.csv b/example-docs/stanley-cups.csv index 4414023f0..ab6de8893 100644 --- a/example-docs/stanley-cups.csv +++ b/example-docs/stanley-cups.csv @@ -2,4 +2,4 @@ Stanley Cups,, Team,Location,Stanley Cups Blues,STL,1 Flyers,PHI,2 -Maple Leafs,TOR,13 \ No newline at end of file +Maple Leafs,TOR,13 diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py index 466d8f0a8..4360b4771 100644 --- a/test_unstructured/partition/test_csv.py +++ b/test_unstructured/partition/test_csv.py @@ -1,6 +1,12 @@ +# pyright: reportPrivateUsage=false + +from __future__ import annotations + +import io from tempfile import SpooledTemporaryFile import pytest +from pytest_mock import MockFixture from test_unstructured.partition.test_constants import ( EXPECTED_TABLE, @@ -11,11 +17,17 @@ from test_unstructured.partition.test_constants import ( EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TEXT_XLSX, ) -from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path +from 
test_unstructured.unit_utils import ( + FixtureRequest, + Mock, + assert_round_trips_through_JSON, + example_doc_path, + function_mock, +) from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table -from unstructured.partition.csv import get_delimiter, partition_csv +from unstructured.partition.csv import _CsvPartitioningContext, partition_csv from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA EXPECTED_FILETYPE = "text/csv" @@ -33,7 +45,7 @@ EXPECTED_FILETYPE = "text/csv" ), ], ) -def test_partition_csv_from_filename(filename, expected_text, expected_table): +def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str): f_path = f"example-docs/{filename}" elements = partition_csv(filename=f_path) @@ -43,14 +55,8 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table): assert elements[0].metadata.filename == filename -@pytest.mark.parametrize( - "infer_table_structure", - [ - True, - False, - ], -) -def test_partition_csv_from_filename_infer_table_structure(infer_table_structure): +@pytest.mark.parametrize("infer_table_structure", [True, False]) +def test_partition_csv_from_filename_infer_table_structure(infer_table_structure: bool): f_path = "example-docs/stanley-cups.csv" elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure) @@ -61,10 +67,8 @@ def test_partition_csv_from_filename_infer_table_structure(infer_table_structure assert table_element_has_text_as_html_field == infer_table_structure -def test_partition_csv_from_filename_with_metadata_filename( - filename="example-docs/stanley-cups.csv", -): - elements = partition_csv(filename=filename, metadata_filename="test") +def test_partition_csv_from_filename_with_metadata_filename(): + elements = partition_csv(example_doc_path("stanley-cups.csv"), metadata_filename="test") assert 
clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert elements[0].metadata.filename == "test" @@ -77,7 +81,7 @@ def test_partition_csv_from_filename_with_metadata_filename( ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), ], ) -def test_partition_csv_from_file(filename, expected_text, expected_table): +def test_partition_csv_from_file(filename: str, expected_text: str, expected_table: str): f_path = f"example-docs/{filename}" with open(f_path, "rb") as f: elements = partition_csv(file=f) @@ -90,16 +94,16 @@ def test_partition_csv_from_file(filename, expected_text, expected_table): assert {element.metadata.detection_origin for element in elements} == {"csv"} -def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"): - with open(filename, "rb") as f: +def test_partition_csv_from_file_with_metadata_filename(): + with open(example_doc_path("stanley-cups.csv"), "rb") as f: elements = partition_csv(file=f, metadata_filename="test") assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert elements[0].metadata.filename == "test" -def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"): - elements = partition_csv(filename=filename, include_metadata=False) +def test_partition_csv_can_exclude_metadata(): + elements = partition_csv(example_doc_path("stanley-cups.csv"), include_metadata=False) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert isinstance(elements[0], Table) @@ -108,23 +112,21 @@ def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups. 
assert elements[0].metadata.filename is None -def test_partition_csv_metadata_date(mocker, filename="example-docs/stanley-cups.csv"): +def test_partition_csv_metadata_date(mocker: MockFixture): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( "unstructured.partition.csv.get_last_modified_date", return_value=mocked_last_modification_date, ) - elements = partition_csv(filename=filename) + + elements = partition_csv(example_doc_path("stanley-cups.csv")) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert isinstance(elements[0], Table) assert elements[0].metadata.last_modified == mocked_last_modification_date -def test_partition_csv_custom_metadata_date( - mocker, - filename="example-docs/stanley-cups.csv", -): +def test_partition_csv_custom_metadata_date(mocker: MockFixture): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = "2020-07-05T09:24:28" @@ -134,7 +136,7 @@ def test_partition_csv_custom_metadata_date( ) elements = partition_csv( - filename=filename, + example_doc_path("stanley-cups.csv"), metadata_last_modified=expected_last_modification_date, include_header=False, ) @@ -144,10 +146,7 @@ def test_partition_csv_custom_metadata_date( assert elements[0].metadata.last_modified == expected_last_modification_date -def test_partition_csv_from_file_metadata_date( - mocker, - filename="example-docs/stanley-cups.csv", -): +def test_partition_csv_from_file_metadata_date(mocker: MockFixture): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -155,7 +154,7 @@ def test_partition_csv_from_file_metadata_date( return_value=mocked_last_modification_date, ) - with open(filename, "rb") as f: + with open(example_doc_path("stanley-cups.csv"), "rb") as f: elements = partition_csv(file=f, include_header=False) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT @@ -163,10 +162,7 @@ def test_partition_csv_from_file_metadata_date( assert elements[0].metadata.last_modified is 
None -def test_partition_csv_from_file_explicit_get_metadata_date( - mocker, - filename="example-docs/stanley-cups.csv", -): +def test_partition_csv_from_file_explicit_get_metadata_date(mocker: MockFixture): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -174,7 +170,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date( return_value=mocked_last_modification_date, ) - with open(filename, "rb") as f: + with open(example_doc_path("stanley-cups.csv"), "rb") as f: elements = partition_csv(file=f, include_header=False, date_from_file_object=True) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT @@ -182,10 +178,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date( assert elements[0].metadata.last_modified == mocked_last_modification_date -def test_partition_csv_from_file_custom_metadata_date( - mocker, - filename="example-docs/stanley-cups.csv", -): +def test_partition_csv_from_file_custom_metadata_date(mocker: MockFixture): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = "2020-07-05T09:24:28" @@ -194,7 +187,7 @@ def test_partition_csv_from_file_custom_metadata_date( return_value=mocked_last_modification_date, ) - with open(filename, "rb") as f: + with open(example_doc_path("stanley-cups.csv"), "rb") as f: elements = partition_csv( file=f, metadata_last_modified=expected_last_modification_date, @@ -207,13 +200,10 @@ def test_partition_csv_from_file_custom_metadata_date( assert elements[0].metadata.last_modified == expected_last_modification_date -def test_partition_csv_from_file_without_metadata( - mocker, - filename="example-docs/stanley-cups.csv", -): +def test_partition_csv_from_file_without_metadata(mocker: MockFixture): """Test partition_csv() with file that are not possible to get last modified date""" - with open(filename, "rb") as f: + with open(example_doc_path("stanley-cups.csv"), "rb") as f: sf = SpooledTemporaryFile() sf.write(f.read()) sf.seek(0) @@ -263,15 
+253,158 @@ def test_partition_csv_respects_languages_arg(): def test_partition_csv_header(): - filename = "example-docs/stanley-cups.csv" - elements = partition_csv(filename=filename, strategy="fast", include_header=True) - assert ( - clean_extra_whitespace(elements[0].text) - == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX + elements = partition_csv( + example_doc_path("stanley-cups.csv"), strategy="fast", include_header=True ) - assert "" in elements[0].metadata.text_as_html + + table = elements[0] + assert clean_extra_whitespace(table.text) == ( + "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX + ) + assert table.metadata.text_as_html is not None + assert "" in table.metadata.text_as_html -def test_partition_csv_detects_the_right_csv_delimiter(): - # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file -- - assert get_delimiter("example-docs/csv-with-long-lines.csv") == "," +# ================================================================================================ +# UNIT-TESTS +# ================================================================================================ + + +class Describe_CsvPartitioningContext: + """Unit-test suite for `unstructured.partition.csv._CsvPartitioningContext`.""" + + # -- .load() ------------------------------------------------ + + def it_provides_a_validating_alternate_constructor(self): + ctx = _CsvPartitioningContext.load( + file_path=example_doc_path("stanley-cups.csv"), + file=None, + metadata_file_path=None, + metadata_last_modified=None, + include_header=True, + infer_table_structure=True, + date_from_file_object=False, + ) + assert isinstance(ctx, _CsvPartitioningContext) + + def and_the_validating_constructor_raises_on_an_invalid_context(self): + with pytest.raises(ValueError, match="either file-path or file-like object must be prov"): + _CsvPartitioningContext.load( + file_path=None, + file=None, + metadata_file_path=None, + 
metadata_last_modified=None, + include_header=True, + infer_table_structure=True, + date_from_file_object=False, + ) + + # -- .delimiter --------------------------------------------- + + @pytest.mark.parametrize( + "file_name", + [ + "stanley-cups.csv", + # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on + # -- this file + "csv-with-long-lines.csv", + ], + ) + def it_auto_detects_the_delimiter_for_a_comma_delimited_CSV_file(self, file_name: str): + ctx = _CsvPartitioningContext(example_doc_path(file_name)) + assert ctx.delimiter == "," + + def and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file(self): + ctx = _CsvPartitioningContext(example_doc_path("semicolon-delimited.csv")) + assert ctx.delimiter == ";" + + def but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file(self): + ctx = _CsvPartitioningContext(example_doc_path("single-column.csv")) + assert ctx.delimiter is None + + # -- .header ------------------------------------------------ + + @pytest.mark.parametrize(("include_header", "expected_value"), [(False, None), (True, 0)]) + def it_identifies_the_header_row_based_on_include_header_arg( + self, include_header: bool, expected_value: int | None + ): + assert _CsvPartitioningContext(include_header=include_header).header == expected_value + + # -- .last_modified -------------------------- + + def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(self): + ctx = _CsvPartitioningContext(metadata_last_modified="2024-08-04T13:12:35") + assert ctx.last_modified == "2024-08-04T13:12:35" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + self, get_last_modified_date_: Mock + ): + get_last_modified_date_.return_value = "2024-08-04T02:23:53" + ctx = _CsvPartitioningContext(file_path="a/b/document.csv") + + last_modified = ctx.last_modified + + get_last_modified_date_.assert_called_once_with("a/b/document.csv") + assert last_modified == 
"2024-08-04T02:23:53" + + def and_it_falls_back_to_last_modified_date_of_file_when_a_file_like_object_is_provided( + self, get_last_modified_date_from_file_: Mock + ): + get_last_modified_date_from_file_.return_value = "2024-08-04T13:17:47" + file = io.BytesIO(b"abcdefg") + ctx = _CsvPartitioningContext(file=file, date_from_file_object=True) + + last_modified = ctx.last_modified + + get_last_modified_date_from_file_.assert_called_once_with(file) + assert last_modified == "2024-08-04T13:17:47" + + def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False( + self, get_last_modified_date_from_file_: Mock + ): + get_last_modified_date_from_file_.return_value = "2024-08-04T13:18:57" + file = io.BytesIO(b"abcdefg") + ctx = _CsvPartitioningContext(file=file, date_from_file_object=False) + + last_modified = ctx.last_modified + + get_last_modified_date_from_file_.assert_not_called() + assert last_modified is None + + # -- .open() ------------------------------------------------ + + def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object(self): + with open(example_doc_path("stanley-cups.csv"), "rb") as f: + # -- read so file cursor is at end of file -- + f.read() + ctx = _CsvPartitioningContext(file=f) + with ctx.open() as file: + assert file is f + # -- read cursor is reset to 0 on .open() context entry -- + assert f.tell() == 0 + assert file.read(14) == b"Stanley Cups,," + assert f.tell() == 14 + + # -- and read cursor is reset to 0 on .open() context exit -- + assert f.tell() == 0 + + def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path(self): + ctx = _CsvPartitioningContext(example_doc_path("stanley-cups.csv")) + with ctx.open() as file: + assert file.read(14) == b"Stanley Cups,," + + # -- .validate() -------------------------------------------- + + def it_raises_when_neither_file_path_nor_file_is_provided(self): + with pytest.raises(ValueError, match="either file-path or file-like 
object must be prov"): + _CsvPartitioningContext()._validate() + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def get_last_modified_date_(self, request: FixtureRequest) -> Mock: + return function_mock(request, "unstructured.partition.csv.get_last_modified_date") + + @pytest.fixture() + def get_last_modified_date_from_file_(self, request: FixtureRequest): + return function_mock(request, "unstructured.partition.csv.get_last_modified_date_from_file") diff --git a/typings/lxml/_types.pyi b/typings/lxml/_types.pyi index 53fd97523..615c09e5c 100644 --- a/typings/lxml/_types.pyi +++ b/typings/lxml/_types.pyi @@ -15,6 +15,8 @@ _VT_co = TypeVar("_VT_co", covariant=True) _AttrName: TypeAlias = str +_AttrVal: TypeAlias = _TextArg + _ElemPathArg: TypeAlias = str | QName _ElementOrTree: TypeAlias = _ET | _ElementTree[_ET] @@ -23,6 +25,9 @@ _TagName: TypeAlias = str _TagSelector: TypeAlias = _TagName | Callable[..., _Element] +# String argument also support QName in various places +_TextArg: TypeAlias = str | bytes | QName + _XPathObject = Any class SupportsLaxedItems(Protocol[_KT_co, _VT_co]): diff --git a/typings/lxml/etree/_classlookup.pyi b/typings/lxml/etree/_classlookup.pyi index 03313c3c4..6d4446304 100644 --- a/typings/lxml/etree/_classlookup.pyi +++ b/typings/lxml/etree/_classlookup.pyi @@ -2,7 +2,7 @@ from __future__ import annotations -from ._element import _Element +from ._element import _Attrib, _Element class ElementBase(_Element): """The public Element class @@ -49,6 +49,8 @@ class ElementBase(_Element): **_extra: str, ) -> None: ... def _init(self) -> None: ... + @property + def attrib(self) -> _Attrib: ... 
class ElementClassLookup: """Superclass of Element class lookups""" diff --git a/typings/lxml/etree/_element.pyi b/typings/lxml/etree/_element.pyi index 65fa4e0f3..7afd99601 100644 --- a/typings/lxml/etree/_element.pyi +++ b/typings/lxml/etree/_element.pyi @@ -48,3 +48,24 @@ class _Element: ) -> _t._XPathObject: ... class _ElementTree(Generic[_t._ET_co]): ... + +# Behaves like MutableMapping but deviates a lot in details +class _Attrib: + def __bool__(self) -> bool: ... + def __contains__(self, __o: object) -> bool: ... + def __delitem__(self, __k: _t._AttrName) -> None: ... + def __getitem__(self, __k: _t._AttrName) -> str: ... + def __iter__(self) -> Iterator[str]: ... + def __len__(self) -> int: ... + def __setitem__(self, __k: _t._AttrName, __v: _t._AttrVal) -> None: ... + @property + def _element(self) -> _Element: ... + def clear(self) -> None: ... + def get(self, key: _t._AttrName, default: _T) -> str | _T: ... + def has_key(self, key: _t._AttrName) -> bool: ... + def items(self) -> list[tuple[str, str]]: ... + def iteritems(self) -> Iterator[tuple[str, str]]: ... + def iterkeys(self) -> Iterator[str]: ... + def itervalues(self) -> Iterator[str]: ... + def keys(self) -> list[str]: ... + def values(self) -> list[str]: ... diff --git a/typings/lxml/html/_element.pyi b/typings/lxml/html/_element.pyi new file mode 100644 index 000000000..582977663 --- /dev/null +++ b/typings/lxml/html/_element.pyi @@ -0,0 +1,6 @@ +from __future__ import annotations + +from .. import etree + +class HtmlElement(etree.ElementBase): + def text_content(self) -> str: ... diff --git a/typings/lxml/html/soupparser.pyi b/typings/lxml/html/soupparser.pyi new file mode 100644 index 000000000..45a7a31fd --- /dev/null +++ b/typings/lxml/html/soupparser.pyi @@ -0,0 +1,9 @@ +# pyright: reportPrivateUsage=false + +from __future__ import annotations + +from lxml.html._element import HtmlElement + +def fromstring( + data: str, +) -> HtmlElement: ... 
diff --git a/typings/pandas/__init__.pyi b/typings/pandas/__init__.pyi new file mode 100644 index 000000000..cc25d44ab --- /dev/null +++ b/typings/pandas/__init__.pyi @@ -0,0 +1,8 @@ +from __future__ import annotations + +from pandas.core.api import ( + DataFrame as DataFrame, +) +from pandas.io.api import ( + read_csv as read_csv, +) diff --git a/typings/pandas/core/api.pyi b/typings/pandas/core/api.pyi new file mode 100644 index 000000000..5d05ba59b --- /dev/null +++ b/typings/pandas/core/api.pyi @@ -0,0 +1,3 @@ +from __future__ import annotations + +from pandas.core.frame import DataFrame as DataFrame diff --git a/typings/pandas/core/frame.pyi b/typings/pandas/core/frame.pyi new file mode 100644 index 000000000..ea4c9f5f2 --- /dev/null +++ b/typings/pandas/core/frame.pyi @@ -0,0 +1,9 @@ +from __future__ import annotations + +class DataFrame: + def to_html( + self, + index: bool = ..., + header: bool = ..., + na_rep: str = ..., + ) -> str: ... diff --git a/typings/pandas/io/api.pyi b/typings/pandas/io/api.pyi new file mode 100644 index 000000000..0c267998d --- /dev/null +++ b/typings/pandas/io/api.pyi @@ -0,0 +1,5 @@ +from __future__ import annotations + +from pandas.io.parsers import ( + read_csv as read_csv, +) diff --git a/typings/pandas/io/parsers/__init__.pyi b/typings/pandas/io/parsers/__init__.pyi new file mode 100644 index 000000000..05f87ed57 --- /dev/null +++ b/typings/pandas/io/parsers/__init__.pyi @@ -0,0 +1,5 @@ +from __future__ import annotations + +from pandas.io.parsers.readers import ( + read_csv as read_csv, +) diff --git a/typings/pandas/io/parsers/readers.pyi b/typings/pandas/io/parsers/readers.pyi new file mode 100644 index 000000000..eb79991c3 --- /dev/null +++ b/typings/pandas/io/parsers/readers.pyi @@ -0,0 +1,12 @@ +from __future__ import annotations + +from typing import IO, Literal + +from pandas.core.frame import DataFrame + +def read_csv( + filepath_or_buffer: str | IO[bytes], + *, + sep: str | None = ..., + header: int | None | 
Literal["infer"] = ..., +) -> DataFrame: ... diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d583bfccb..44fea565b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.2-dev0" # pragma: no cover +__version__ = "0.15.2-dev1" # pragma: no cover diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 5a6171a47..e3c177bb1 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -1,7 +1,8 @@ from __future__ import annotations +import contextlib import csv -from typing import IO, Any, Optional, cast +from typing import IO, Any, Iterator import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring @@ -15,13 +16,9 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common import ( - exactly_one, - get_last_modified_date, - get_last_modified_date_from_file, - spooled_to_bytes_io_if_needed, -) +from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file from unstructured.partition.lang import apply_lang_metadata +from unstructured.utils import is_temp_file_path, lazyproperty DETECTION_ORIGIN: str = "csv" @@ -30,16 +27,15 @@ DETECTION_ORIGIN: str = "csv" @add_metadata_with_filetype(FileType.CSV) @add_chunking_strategy def partition_csv( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, + filename: str | None = None, + file: IO[bytes] | None = None, + metadata_filename: str | None = None, + metadata_last_modified: str | None = None, include_header: bool = False, - include_metadata: bool = True, infer_table_structure: bool = True, - languages: Optional[list[str]] = ["auto"], - # NOTE (jennings) partition_csv generates a 
single TableElement - # so detect_language_per_element is not included as a param + languages: list[str] | None = ["auto"], + # NOTE (jennings) partition_csv generates a single TableElement so detect_language_per_element + # is not included as a param date_from_file_object: bool = False, **kwargs: Any, ) -> list[Element]: @@ -73,62 +69,156 @@ def partition_csv( Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. """ - exactly_one(filename=filename, file=file) - header = 0 if include_header else None - - if filename: - delimiter = get_delimiter(file_path=filename) - table = pd.read_csv(filename, header=header, sep=delimiter) - last_modification_date = get_last_modified_date(filename) - - elif file: - last_modification_date = ( - get_last_modified_date_from_file(file) if date_from_file_object else None - ) - f = spooled_to_bytes_io_if_needed(file) - delimiter = get_delimiter(file=f) - table = pd.read_csv(f, header=header, sep=delimiter) - - html_text = table.to_html(index=False, header=include_header, na_rep="") - text = cast(str, soupparser_fromstring(html_text).text_content()) - - if include_metadata: - metadata = ElementMetadata( - filename=metadata_filename or filename, - last_modified=metadata_last_modified or last_modification_date, - languages=languages, - ) - if infer_table_structure: - metadata.text_as_html = html_text - else: - metadata = ElementMetadata() - - elements = apply_lang_metadata( - [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)], - languages=languages, + ctx = _CsvPartitioningContext( + file_path=filename, + file=file, + metadata_file_path=metadata_filename, + metadata_last_modified=metadata_last_modified, + include_header=include_header, + infer_table_structure=infer_table_structure, + date_from_file_object=date_from_file_object, ) - return list(elements) + with ctx.open() as file: + dataframe = pd.read_csv(file, 
header=ctx.header, sep=ctx.delimiter) + + html_text = dataframe.to_html(index=False, header=include_header, na_rep="") + text = soupparser_fromstring(html_text).text_content() + + metadata = ElementMetadata( + filename=metadata_filename or filename, + last_modified=ctx.last_modified, + languages=languages, + text_as_html=html_text if infer_table_structure else None, + ) + + # -- a CSV file becomes a single `Table` element -- + elements = [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)] + + return list(apply_lang_metadata(elements, languages=languages)) -def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None): - """Use the standard csv sniffer to determine the delimiter. +class _CsvPartitioningContext: + """Encapsulates the partitioning-run details. - Reads just a small portion in case the file is large. + Provides access to argument values and especially encapsulates computation of values derived + from those values so they don't obscure the core partitioning logic. 
""" - sniffer = csv.Sniffer() - num_bytes = 65536 - # -- read whole lines, sniffer can be confused by a trailing partial line -- - if file: - lines = file.readlines(num_bytes) - file.seek(0) - data = "\n".join(ln.decode("utf-8") for ln in lines) - elif file_path is not None: - with open(file_path) as f: - data = "\n".join(f.readlines(num_bytes)) - else: - raise ValueError("either `file_path` or `file` argument must be provided") + def __init__( + self, + file_path: str | None = None, + file: IO[bytes] | None = None, + metadata_file_path: str | None = None, + metadata_last_modified: str | None = None, + include_header: bool = False, + infer_table_structure: bool = True, + date_from_file_object: bool = False, + ): + self._file_path = file_path + self._file = file + self._metadata_file_path = metadata_file_path + self._metadata_last_modified = metadata_last_modified + self._include_header = include_header + self._infer_table_structure = infer_table_structure + self._date_from_file_object = date_from_file_object - return sniffer.sniff(data, delimiters=",;").delimiter + @classmethod + def load( + cls, + file_path: str | None, + file: IO[bytes] | None, + metadata_file_path: str | None, + metadata_last_modified: str | None, + include_header: bool, + infer_table_structure: bool, + date_from_file_object: bool = False, + ) -> _CsvPartitioningContext: + return cls( + file_path=file_path, + file=file, + metadata_file_path=metadata_file_path, + metadata_last_modified=metadata_last_modified, + include_header=include_header, + infer_table_structure=infer_table_structure, + date_from_file_object=date_from_file_object, + )._validate() + + @lazyproperty + def delimiter(self) -> str | None: + """The CSV delimiter, nominally a comma ",". + + `None` for a single-column CSV file which naturally has no delimiter. 
+ """ + sniffer = csv.Sniffer() + num_bytes = 65536 + + with self.open() as file: + # -- read whole lines, sniffer can be confused by a trailing partial line -- + data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes)) + + try: + return sniffer.sniff(data, delimiters=",;").delimiter + except csv.Error: + # -- sniffing will fail on single-column csv as no default can be assumed -- + return None + + @lazyproperty + def header(self) -> int | None: + """Identifies the header row, if any, to Pandas, by idx.""" + return 0 if self._include_header else None + + @lazyproperty + def last_modified(self) -> str | None: + """The best last-modified date available, None if no sources are available.""" + # -- Value explicitly specified by caller takes precedence. This is used for example when + # -- this file was converted from another format. + if self._metadata_last_modified: + return self._metadata_last_modified + + if self._file_path: + return ( + None + if is_temp_file_path(self._file_path) + else get_last_modified_date(self._file_path) + ) + + if self._file: + return ( + get_last_modified_date_from_file(self._file) + if self._date_from_file_object + else None + ) + + return None + + @contextlib.contextmanager + def open(self) -> Iterator[IO[bytes]]: + """Encapsulates complexity of dealing with file-path or file-like-object. + + Provides an `IO[bytes]` object as the "common-denominator" document source. + + Must be used as a context manager using a `with` statement: + + with self._file as file: + do things with file + + File is guaranteed to be at read position 0 when called. + """ + if self._file_path: + with open(self._file_path, "rb") as f: + yield f + else: + file = self._file + assert file is not None # -- guaranteed by `._validate()` -- + # -- Be polite on principle. 
Reset file-pointer both before and after use -- + file.seek(0) + yield file + file.seek(0) + + def _validate(self) -> _CsvPartitioningContext: + """Raise on invalid argument values.""" + if self._file_path is None and self._file is None: + raise ValueError("either file-path or file-like object must be provided") + return self