Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)
rfctr(csv): accommodate single column CSV files (#3483)
**Summary** Improve factoring, type-annotation, and tests for `partition_csv()` and accommodate single-column CSV files. Fixes: #2616
parent 59ec64235b
commit a468b2de3b
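The crux of the fix appears in the new `_CsvPartitioningContext.delimiter` property in the diff below: `csv.Sniffer().sniff()` raises `csv.Error` when it cannot determine a delimiter, which is the expected outcome for a single-column file, so the sniff is wrapped in a `try` and `None` is returned, letting `pd.read_csv()` fall back to its own handling via `sep=None`. A minimal standalone sketch of that pattern, using only the standard library; the `detect_delimiter` helper name is illustrative, not part of the library API:

```python
from __future__ import annotations

import csv


def detect_delimiter(data: str) -> str | None:
    """Return the sniffed delimiter, or None when `data` has a single column."""
    try:
        return csv.Sniffer().sniff(data, delimiters=",;").delimiter
    except csv.Error:
        # -- sniffing raises "Could not determine delimiter" on a single-column
        # -- CSV; return None so the caller can fall back to pandas' detection --
        return None


assert detect_delimiter("a;b;c\nd;e;f\n") == ";"
assert detect_delimiter("a,b,c\nd,e,f\n") == ","
assert detect_delimiter("alpha\nbeta\ngamma\n") is None
```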
CHANGELOG.md
@@ -1,13 +1,13 @@
-## 0.15.2-dev0
+## 0.15.2-dev1
 
 ### Enhancements
 
 ### Features
 
 ### Fixes
 
 * **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
+* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
 
 ## 0.15.1
example-docs/semicolon-delimited.csv (new file, 5 lines)
@@ -0,0 +1,5 @@
+Lorem, ipsum; dolor sit; amet
+consectetur; adipiscing; elit
+sed, do; eiusmod; tempor incididunt
+ut labore; et, dolore; magna aliqua
+Ut enim; ad minim; veniam, quis
example-docs/single-column.csv (new file, 9 lines)
@@ -0,0 +1,9 @@
+Lorem, ipsum
+dolor sit
+amet consectetur
+adipiscing, elit
+sed, do eiusmod
+tempor incididunt
+ut labore et
+dolore; magna aliqua
+Ut enim, ad minim, veniam
example-docs/stanley-cups.csv
@@ -2,4 +2,4 @@ Stanley Cups,,
 Team,Location,Stanley Cups
 Blues,STL,1
 Flyers,PHI,2
-Maple Leafs,TOR,13
+Maple Leafs,TOR,13
test_unstructured/partition/test_csv.py
@@ -1,6 +1,12 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+import io
 from tempfile import SpooledTemporaryFile
 
 import pytest
+from pytest_mock import MockFixture
 
 from test_unstructured.partition.test_constants import (
     EXPECTED_TABLE,
@@ -11,11 +17,17 @@ from test_unstructured.partition.test_constants import (
     EXPECTED_TEXT_WITH_EMOJI,
     EXPECTED_TEXT_XLSX,
 )
-from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from test_unstructured.unit_utils import (
+    FixtureRequest,
+    Mock,
+    assert_round_trips_through_JSON,
+    example_doc_path,
+    function_mock,
+)
 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
-from unstructured.partition.csv import get_delimiter, partition_csv
+from unstructured.partition.csv import _CsvPartitioningContext, partition_csv
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 
 EXPECTED_FILETYPE = "text/csv"
@@ -33,7 +45,7 @@ EXPECTED_FILETYPE = "text/csv"
         ),
     ],
 )
-def test_partition_csv_from_filename(filename, expected_text, expected_table):
+def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str):
     f_path = f"example-docs/{filename}"
     elements = partition_csv(filename=f_path)
 
@@ -43,14 +55,8 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
     assert elements[0].metadata.filename == filename
 
 
-@pytest.mark.parametrize(
-    "infer_table_structure",
-    [
-        True,
-        False,
-    ],
-)
-def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
+@pytest.mark.parametrize("infer_table_structure", [True, False])
+def test_partition_csv_from_filename_infer_table_structure(infer_table_structure: bool):
     f_path = "example-docs/stanley-cups.csv"
     elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure)
 
@@ -61,10 +67,8 @@ def test_partition_csv_from_filename_infer_table_structure(infer_table_structure
     assert table_element_has_text_as_html_field == infer_table_structure
 
 
-def test_partition_csv_from_filename_with_metadata_filename(
-    filename="example-docs/stanley-cups.csv",
-):
-    elements = partition_csv(filename=filename, metadata_filename="test")
+def test_partition_csv_from_filename_with_metadata_filename():
+    elements = partition_csv(example_doc_path("stanley-cups.csv"), metadata_filename="test")
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert elements[0].metadata.filename == "test"
@@ -77,7 +81,7 @@ def test_partition_csv_from_filename_with_metadata_filename(
         ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
     ],
 )
-def test_partition_csv_from_file(filename, expected_text, expected_table):
+def test_partition_csv_from_file(filename: str, expected_text: str, expected_table: str):
     f_path = f"example-docs/{filename}"
     with open(f_path, "rb") as f:
         elements = partition_csv(file=f)
@@ -90,16 +94,16 @@ def test_partition_csv_from_file(filename, expected_text, expected_table):
         assert {element.metadata.detection_origin for element in elements} == {"csv"}
 
 
-def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"):
-    with open(filename, "rb") as f:
+def test_partition_csv_from_file_with_metadata_filename():
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(file=f, metadata_filename="test")
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert elements[0].metadata.filename == "test"
 
 
-def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
-    elements = partition_csv(filename=filename, include_metadata=False)
+def test_partition_csv_can_exclude_metadata():
+    elements = partition_csv(example_doc_path("stanley-cups.csv"), include_metadata=False)
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert isinstance(elements[0], Table)
@@ -108,23 +112,21 @@ def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.
     assert elements[0].metadata.filename is None
 
 
-def test_partition_csv_metadata_date(mocker, filename="example-docs/stanley-cups.csv"):
+def test_partition_csv_metadata_date(mocker: MockFixture):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.csv.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
-    elements = partition_csv(filename=filename)
+
+    elements = partition_csv(example_doc_path("stanley-cups.csv"))
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
     assert isinstance(elements[0], Table)
     assert elements[0].metadata.last_modified == mocked_last_modification_date
 
 
-def test_partition_csv_custom_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_custom_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     expected_last_modification_date = "2020-07-05T09:24:28"
 
@@ -134,7 +136,7 @@ def test_partition_csv_custom_metadata_date(
     )
 
     elements = partition_csv(
-        filename=filename,
+        example_doc_path("stanley-cups.csv"),
         metadata_last_modified=expected_last_modification_date,
         include_header=False,
     )
@@ -144,10 +146,7 @@ def test_partition_csv_custom_metadata_date(
     assert elements[0].metadata.last_modified == expected_last_modification_date
 
 
-def test_partition_csv_from_file_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
 
     mocker.patch(
@@ -155,7 +154,7 @@ def test_partition_csv_from_file_metadata_date(
         return_value=mocked_last_modification_date,
     )
 
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(file=f, include_header=False)
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
@@ -163,10 +162,7 @@ def test_partition_csv_from_file_metadata_date(
     assert elements[0].metadata.last_modified is None
 
 
-def test_partition_csv_from_file_explicit_get_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_explicit_get_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
 
     mocker.patch(
@@ -174,7 +170,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date(
         return_value=mocked_last_modification_date,
    )
 
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(file=f, include_header=False, date_from_file_object=True)
 
     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
@@ -182,10 +178,7 @@ def test_partition_csv_from_file_explicit_get_metadata_date(
     assert elements[0].metadata.last_modified == mocked_last_modification_date
 
 
-def test_partition_csv_from_file_custom_metadata_date(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_custom_metadata_date(mocker: MockFixture):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     expected_last_modification_date = "2020-07-05T09:24:28"
 
@@ -194,7 +187,7 @@ def test_partition_csv_from_file_custom_metadata_date(
         return_value=mocked_last_modification_date,
     )
 
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         elements = partition_csv(
             file=f,
             metadata_last_modified=expected_last_modification_date,
@@ -207,13 +200,10 @@ def test_partition_csv_from_file_custom_metadata_date(
     assert elements[0].metadata.last_modified == expected_last_modification_date
 
 
-def test_partition_csv_from_file_without_metadata(
-    mocker,
-    filename="example-docs/stanley-cups.csv",
-):
+def test_partition_csv_from_file_without_metadata(mocker: MockFixture):
     """Test partition_csv() with a file for which no last-modified date can be determined."""
-    with open(filename, "rb") as f:
+    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
         sf = SpooledTemporaryFile()
         sf.write(f.read())
         sf.seek(0)
@@ -263,15 +253,158 @@ def test_partition_csv_respects_languages_arg():
 
 
 def test_partition_csv_header():
-    filename = "example-docs/stanley-cups.csv"
-    elements = partition_csv(filename=filename, strategy="fast", include_header=True)
-    assert (
-        clean_extra_whitespace(elements[0].text)
-        == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    elements = partition_csv(
+        example_doc_path("stanley-cups.csv"), strategy="fast", include_header=True
     )
-    assert "<thead>" in elements[0].metadata.text_as_html
+
+    table = elements[0]
+    assert clean_extra_whitespace(table.text) == (
+        "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    )
+    assert table.metadata.text_as_html is not None
+    assert "<thead>" in table.metadata.text_as_html
 
 
-def test_partition_csv_detects_the_right_csv_delimiter():
-    # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file --
-    assert get_delimiter("example-docs/csv-with-long-lines.csv") == ","
+# ================================================================================================
+# UNIT-TESTS
+# ================================================================================================
+
+
+class Describe_CsvPartitioningContext:
+    """Unit-test suite for `unstructured.partition.csv._CsvPartitioningContext`."""
+
+    # -- .load() ------------------------------------------------
+
+    def it_provides_a_validating_alternate_constructor(self):
+        ctx = _CsvPartitioningContext.load(
+            file_path=example_doc_path("stanley-cups.csv"),
+            file=None,
+            metadata_file_path=None,
+            metadata_last_modified=None,
+            include_header=True,
+            infer_table_structure=True,
+            date_from_file_object=False,
+        )
+        assert isinstance(ctx, _CsvPartitioningContext)
+
+    def and_the_validating_constructor_raises_on_an_invalid_context(self):
+        with pytest.raises(ValueError, match="either file-path or file-like object must be prov"):
+            _CsvPartitioningContext.load(
+                file_path=None,
+                file=None,
+                metadata_file_path=None,
+                metadata_last_modified=None,
+                include_header=True,
+                infer_table_structure=True,
+                date_from_file_object=False,
+            )
+
+    # -- .delimiter ---------------------------------------------
+
+    @pytest.mark.parametrize(
+        "file_name",
+        [
+            "stanley-cups.csv",
+            # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on
+            # -- this file
+            "csv-with-long-lines.csv",
+        ],
+    )
+    def it_auto_detects_the_delimiter_for_a_comma_delimited_CSV_file(self, file_name: str):
+        ctx = _CsvPartitioningContext(example_doc_path(file_name))
+        assert ctx.delimiter == ","
+
+    def and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file(self):
+        ctx = _CsvPartitioningContext(example_doc_path("semicolon-delimited.csv"))
+        assert ctx.delimiter == ";"
+
+    def but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file(self):
+        ctx = _CsvPartitioningContext(example_doc_path("single-column.csv"))
+        assert ctx.delimiter is None
+
+    # -- .header ------------------------------------------------
+
+    @pytest.mark.parametrize(("include_header", "expected_value"), [(False, None), (True, 0)])
+    def it_identifies_the_header_row_based_on_include_header_arg(
+        self, include_header: bool, expected_value: int | None
+    ):
+        assert _CsvPartitioningContext(include_header=include_header).header == expected_value
+
+    # -- .last_modified --------------------------
+
+    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(self):
+        ctx = _CsvPartitioningContext(metadata_last_modified="2024-08-04T13:12:35")
+        assert ctx.last_modified == "2024-08-04T13:12:35"
+
+    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
+        self, get_last_modified_date_: Mock
+    ):
+        get_last_modified_date_.return_value = "2024-08-04T02:23:53"
+        ctx = _CsvPartitioningContext(file_path="a/b/document.csv")
+
+        last_modified = ctx.last_modified
+
+        get_last_modified_date_.assert_called_once_with("a/b/document.csv")
+        assert last_modified == "2024-08-04T02:23:53"
+
+    def and_it_falls_back_to_last_modified_date_of_file_when_a_file_like_object_is_provided(
+        self, get_last_modified_date_from_file_: Mock
+    ):
+        get_last_modified_date_from_file_.return_value = "2024-08-04T13:17:47"
+        file = io.BytesIO(b"abcdefg")
+        ctx = _CsvPartitioningContext(file=file, date_from_file_object=True)
+
+        last_modified = ctx.last_modified
+
+        get_last_modified_date_from_file_.assert_called_once_with(file)
+        assert last_modified == "2024-08-04T13:17:47"
+
+    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
+        self, get_last_modified_date_from_file_: Mock
+    ):
+        get_last_modified_date_from_file_.return_value = "2024-08-04T13:18:57"
+        file = io.BytesIO(b"abcdefg")
+        ctx = _CsvPartitioningContext(file=file, date_from_file_object=False)
+
+        last_modified = ctx.last_modified
+
+        get_last_modified_date_from_file_.assert_not_called()
+        assert last_modified is None
+
+    # -- .open() ------------------------------------------------
+
+    def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object(self):
+        with open(example_doc_path("stanley-cups.csv"), "rb") as f:
+            # -- read so file cursor is at end of file --
+            f.read()
+            ctx = _CsvPartitioningContext(file=f)
+            with ctx.open() as file:
+                assert file is f
+                # -- read cursor is reset to 0 on .open() context entry --
+                assert f.tell() == 0
+                assert file.read(14) == b"Stanley Cups,,"
+                assert f.tell() == 14
+
+            # -- and read cursor is reset to 0 on .open() context exit --
+            assert f.tell() == 0
+
+    def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path(self):
+        ctx = _CsvPartitioningContext(example_doc_path("stanley-cups.csv"))
+        with ctx.open() as file:
+            assert file.read(14) == b"Stanley Cups,,"
+
+    # -- .validate() --------------------------------------------
+
+    def it_raises_when_neither_file_path_nor_file_is_provided(self):
+        with pytest.raises(ValueError, match="either file-path or file-like object must be prov"):
+            _CsvPartitioningContext()._validate()
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture()
+    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
+        return function_mock(request, "unstructured.partition.csv.get_last_modified_date")
+
+    @pytest.fixture()
+    def get_last_modified_date_from_file_(self, request: FixtureRequest):
+        return function_mock(request, "unstructured.partition.csv.get_last_modified_date_from_file")
@@ -15,6 +15,8 @@ _VT_co = TypeVar("_VT_co", covariant=True)
 
 _AttrName: TypeAlias = str
 
+_AttrVal: TypeAlias = _TextArg
+
 _ElemPathArg: TypeAlias = str | QName
 
 _ElementOrTree: TypeAlias = _ET | _ElementTree[_ET]
@@ -23,6 +25,9 @@ _TagName: TypeAlias = str
 
 _TagSelector: TypeAlias = _TagName | Callable[..., _Element]
 
+# String arguments also support QName in various places
+_TextArg: TypeAlias = str | bytes | QName
+
 _XPathObject = Any
 
 class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from ._element import _Element
+from ._element import _Attrib, _Element
 
 class ElementBase(_Element):
     """The public Element class
@@ -49,6 +49,8 @@ class ElementBase(_Element):
         **_extra: str,
     ) -> None: ...
     def _init(self) -> None: ...
+    @property
+    def attrib(self) -> _Attrib: ...
 
 class ElementClassLookup:
     """Superclass of Element class lookups"""
@@ -48,3 +48,24 @@ class _Element:
     ) -> _t._XPathObject: ...
 
 class _ElementTree(Generic[_t._ET_co]): ...
+
+# Behaves like MutableMapping but deviates a lot in details
+class _Attrib:
+    def __bool__(self) -> bool: ...
+    def __contains__(self, __o: object) -> bool: ...
+    def __delitem__(self, __k: _t._AttrName) -> None: ...
+    def __getitem__(self, __k: _t._AttrName) -> str: ...
+    def __iter__(self) -> Iterator[str]: ...
+    def __len__(self) -> int: ...
+    def __setitem__(self, __k: _t._AttrName, __v: _t._AttrVal) -> None: ...
+    @property
+    def _element(self) -> _Element: ...
+    def clear(self) -> None: ...
+    def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
+    def has_key(self, key: _t._AttrName) -> bool: ...
+    def items(self) -> list[tuple[str, str]]: ...
+    def iteritems(self) -> Iterator[tuple[str, str]]: ...
+    def iterkeys(self) -> Iterator[str]: ...
+    def itervalues(self) -> Iterator[str]: ...
+    def keys(self) -> list[str]: ...
+    def values(self) -> list[str]: ...
typings/lxml/html/_element.pyi (new file, 6 lines)
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .. import etree
+
+class HtmlElement(etree.ElementBase):
+    def text_content(self) -> str: ...

typings/lxml/html/soupparser.pyi (new file, 9 lines)
@@ -0,0 +1,9 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+from lxml.html._element import HtmlElement
+
+def fromstring(
+    data: str,
+) -> HtmlElement: ...
typings/pandas/__init__.pyi (new file, 8 lines)
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from pandas.core.api import (
+    DataFrame as DataFrame,
+)
+from pandas.io.api import (
+    read_csv as read_csv,
+)
typings/pandas/core/api.pyi (new file, 3 lines)
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from pandas.core.frame import DataFrame as DataFrame

typings/pandas/core/frame.pyi (new file, 9 lines)
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+class DataFrame:
+    def to_html(
+        self,
+        index: bool = ...,
+        header: bool = ...,
+        na_rep: str = ...,
+    ) -> str: ...
typings/pandas/io/api.pyi (new file, 5 lines)
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from pandas.io.parsers import (
+    read_csv as read_csv,
+)

typings/pandas/io/parsers/__init__.pyi (new file, 5 lines)
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from pandas.io.parsers.readers import (
+    read_csv as read_csv,
+)
typings/pandas/io/parsers/readers.pyi (new file, 12 lines)
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from typing import IO, Literal
+
+from pandas.core.frame import DataFrame
+
+def read_csv(
+    filepath_or_buffer: str | IO[bytes],
+    *,
+    sep: str | None = ...,
+    header: int | None | Literal["infer"] = ...,
+) -> DataFrame: ...
unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.2-dev0"  # pragma: no cover
+__version__ = "0.15.2-dev1"  # pragma: no cover
unstructured/partition/csv.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
+import contextlib
 import csv
-from typing import IO, Any, Optional, cast
+from typing import IO, Any, Iterator
 
 import pandas as pd
 from lxml.html.soupparser import fromstring as soupparser_fromstring
@@ -15,13 +16,9 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
-    exactly_one,
-    get_last_modified_date,
-    get_last_modified_date_from_file,
-    spooled_to_bytes_io_if_needed,
-)
+from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
 from unstructured.partition.lang import apply_lang_metadata
+from unstructured.utils import is_temp_file_path, lazyproperty
 
 DETECTION_ORIGIN: str = "csv"
@@ -30,16 +27,15 @@ DETECTION_ORIGIN: str = "csv"
 @add_metadata_with_filetype(FileType.CSV)
 @add_chunking_strategy
 def partition_csv(
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    metadata_filename: Optional[str] = None,
-    metadata_last_modified: Optional[str] = None,
+    filename: str | None = None,
+    file: IO[bytes] | None = None,
+    metadata_filename: str | None = None,
+    metadata_last_modified: str | None = None,
     include_header: bool = False,
     include_metadata: bool = True,
     infer_table_structure: bool = True,
-    languages: Optional[list[str]] = ["auto"],
-    # NOTE (jennings) partition_csv generates a single TableElement
-    # so detect_language_per_element is not included as a param
+    languages: list[str] | None = ["auto"],
+    # NOTE (jennings) partition_csv generates a single TableElement so detect_language_per_element
+    # is not included as a param
     date_from_file_object: bool = False,
     **kwargs: Any,
 ) -> list[Element]:
@@ -73,62 +69,156 @@ def partition_csv(
         Applies only when providing file via `file` parameter. If this option is True, attempt
         to infer last_modified metadata from bytes, otherwise set it to None.
     """
-    exactly_one(filename=filename, file=file)
-
-    header = 0 if include_header else None
-
-    if filename:
-        delimiter = get_delimiter(file_path=filename)
-        table = pd.read_csv(filename, header=header, sep=delimiter)
-        last_modification_date = get_last_modified_date(filename)
-
-    elif file:
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
-        )
-        f = spooled_to_bytes_io_if_needed(file)
-        delimiter = get_delimiter(file=f)
-        table = pd.read_csv(f, header=header, sep=delimiter)
-
-    html_text = table.to_html(index=False, header=include_header, na_rep="")
-    text = cast(str, soupparser_fromstring(html_text).text_content())
-
-    if include_metadata:
-        metadata = ElementMetadata(
-            filename=metadata_filename or filename,
-            last_modified=metadata_last_modified or last_modification_date,
-            languages=languages,
-        )
-        if infer_table_structure:
-            metadata.text_as_html = html_text
-    else:
-        metadata = ElementMetadata()
-
-    elements = apply_lang_metadata(
-        [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)],
-        languages=languages,
+    ctx = _CsvPartitioningContext(
+        file_path=filename,
+        file=file,
+        metadata_file_path=metadata_filename,
+        metadata_last_modified=metadata_last_modified,
+        include_header=include_header,
+        infer_table_structure=infer_table_structure,
+        date_from_file_object=date_from_file_object,
     )
 
-    return list(elements)
+    with ctx.open() as file:
+        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
+
+    html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
+    text = soupparser_fromstring(html_text).text_content()
+
+    metadata = ElementMetadata(
+        filename=metadata_filename or filename,
+        last_modified=ctx.last_modified,
+        languages=languages,
+        text_as_html=html_text if infer_table_structure else None,
+    )
+
+    # -- a CSV file becomes a single `Table` element --
+    elements = [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
+
+    return list(apply_lang_metadata(elements, languages=languages))
 
 
-def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
-    """Use the standard csv sniffer to determine the delimiter.
-
-    Reads just a small portion in case the file is large.
-    """
-    sniffer = csv.Sniffer()
-    num_bytes = 65536
-
-    # -- read whole lines, sniffer can be confused by a trailing partial line --
-    if file:
-        lines = file.readlines(num_bytes)
-        file.seek(0)
-        data = "\n".join(ln.decode("utf-8") for ln in lines)
-    elif file_path is not None:
-        with open(file_path) as f:
-            data = "\n".join(f.readlines(num_bytes))
-    else:
-        raise ValueError("either `file_path` or `file` argument must be provided")
-
-    return sniffer.sniff(data, delimiters=",;").delimiter
+class _CsvPartitioningContext:
+    """Encapsulates the partitioning-run details.
+
+    Provides access to argument values and especially encapsulates computation of values derived
+    from those values so they don't obscure the core partitioning logic.
+    """
+
+    def __init__(
+        self,
+        file_path: str | None = None,
+        file: IO[bytes] | None = None,
+        metadata_file_path: str | None = None,
+        metadata_last_modified: str | None = None,
+        include_header: bool = False,
+        infer_table_structure: bool = True,
+        date_from_file_object: bool = False,
+    ):
+        self._file_path = file_path
+        self._file = file
+        self._metadata_file_path = metadata_file_path
+        self._metadata_last_modified = metadata_last_modified
+        self._include_header = include_header
+        self._infer_table_structure = infer_table_structure
+        self._date_from_file_object = date_from_file_object
+
+    @classmethod
+    def load(
+        cls,
+        file_path: str | None,
+        file: IO[bytes] | None,
+        metadata_file_path: str | None,
+        metadata_last_modified: str | None,
+        include_header: bool,
+        infer_table_structure: bool,
+        date_from_file_object: bool = False,
+    ) -> _CsvPartitioningContext:
+        return cls(
+            file_path=file_path,
+            file=file,
+            metadata_file_path=metadata_file_path,
+            metadata_last_modified=metadata_last_modified,
+            include_header=include_header,
+            infer_table_structure=infer_table_structure,
+            date_from_file_object=date_from_file_object,
+        )._validate()
+
+    @lazyproperty
+    def delimiter(self) -> str | None:
+        """The CSV delimiter, nominally a comma ",".
+
+        `None` for a single-column CSV file which naturally has no delimiter.
+        """
+        sniffer = csv.Sniffer()
+        num_bytes = 65536
+
+        with self.open() as file:
+            # -- read whole lines, sniffer can be confused by a trailing partial line --
+            data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
+
+        try:
+            return sniffer.sniff(data, delimiters=",;").delimiter
+        except csv.Error:
+            # -- sniffing will fail on single-column csv as no default can be assumed --
+            return None
+
+    @lazyproperty
+    def header(self) -> int | None:
+        """Identifies the header row, if any, to Pandas, by idx."""
+        return 0 if self._include_header else None
+
+    @lazyproperty
+    def last_modified(self) -> str | None:
+        """The best last-modified date available, None if no sources are available."""
+        # -- Value explicitly specified by caller takes precedence. This is used for example when
+        # -- this file was converted from another format.
+        if self._metadata_last_modified:
+            return self._metadata_last_modified
+
+        if self._file_path:
+            return (
+                None
+                if is_temp_file_path(self._file_path)
+                else get_last_modified_date(self._file_path)
+            )
+
+        if self._file:
+            return (
+                get_last_modified_date_from_file(self._file)
+                if self._date_from_file_object
+                else None
+            )
+
+        return None
+
+    @contextlib.contextmanager
+    def open(self) -> Iterator[IO[bytes]]:
+        """Encapsulates complexity of dealing with file-path or file-like-object.
+
+        Provides an `IO[bytes]` object as the "common-denominator" document source.
+
+        Must be used as a context manager using a `with` statement:
+
+            with self.open() as file:
+                do things with file
+
+        File is guaranteed to be at read position 0 when called.
+        """
+        if self._file_path:
+            with open(self._file_path, "rb") as f:
+                yield f
+        else:
+            file = self._file
+            assert file is not None  # -- guaranteed by `._validate()` --
+            # -- Be polite on principle. Reset file-pointer both before and after use --
+            file.seek(0)
+            yield file
+            file.seek(0)
+
    def _validate(self) -> _CsvPartitioningContext:
        """Raise on invalid argument values."""
        if self._file_path is None and self._file is None:
            raise ValueError("either file-path or file-like object must be provided")
        return self
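For reference, a short usage sketch of the behavior this commit enables, assuming the `example-docs/single-column.csv` file added above is on disk; the single-`Table` result is confirmed by the diff, while the `assert` and `print` lines are illustrative:

```python
from unstructured.partition.csv import partition_csv

# Before this commit, delimiter detection raised `_csv.Error: Could not
# determine delimiter` for a file with no delimiters (Issue #2616).
elements = partition_csv("example-docs/single-column.csv")

# partition_csv() emits exactly one Table element per CSV file.
assert len(elements) == 1
print(elements[0].text)
```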