diff --git a/haystack/components/preprocessors/csv_document_splitter.py b/haystack/components/preprocessors/csv_document_splitter.py
index 4809bf838..780e0cb51 100644
--- a/haystack/components/preprocessors/csv_document_splitter.py
+++ b/haystack/components/preprocessors/csv_document_splitter.py
@@ -195,7 +195,7 @@ class CSVDocumentSplitter:
         df_length = df.shape[0] if axis == "row" else df.shape[1]
         for empty_start_idx, empty_end_idx in split_indices + [(df_length, df_length)]:
             # Avoid empty splits
-            if empty_start_idx - table_start_idx > 1:
+            if empty_start_idx - table_start_idx >= 1:
                 if axis == "row":
                     sub_table = df.iloc[table_start_idx:empty_start_idx]
                 else:
diff --git a/test/components/preprocessors/test_csv_document_splitter.py b/test/components/preprocessors/test_csv_document_splitter.py
index e94efd349..f39dd231b 100644
--- a/test/components/preprocessors/test_csv_document_splitter.py
+++ b/test/components/preprocessors/test_csv_document_splitter.py
@@ -227,6 +227,13 @@ E,F,,,G,H
             assert table.content == expected_tables[i]
             assert table.meta == expected_meta[i]
 
+    def test_sub_table_with_one_row(self) -> None:
+        # Regression: the trailing "4,5,6" table is a single row, which the old "> 1" length check discarded.
+        splitter = CSVDocumentSplitter(row_split_threshold=1)
+        doc = Document(content="""A,B,C\n1,2,3\n,,\n4,5,6""")
+        split_result = splitter.run([doc])
+        assert len(split_result["documents"]) == 2
+
     def test_threshold_no_effect(self, two_tables_sep_by_two_empty_rows: str) -> None:
         splitter = CSVDocumentSplitter(row_split_threshold=3)
         doc = Document(content=two_tables_sep_by_two_empty_rows)