From 35788a2d060d58b901b8d10237f23273399998e5 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl@users.noreply.github.com>
Date: Fri, 7 Feb 2025 05:29:53 -0800
Subject: [PATCH] feat: Update csv cleaner (#8828)

* More refactoring

* Add more new options and more tests

* Improve docstrings

* Add release notes

* Fix pylint
---
 .../preprocessors/csv_document_cleaner.py     | 130 +++++++++++++-----
 ...csv-document-cleaner-7faed3788e9bfea4.yaml |   5 +
 .../test_csv_document_cleaner.py              |  75 ++++++++++
 3 files changed, 176 insertions(+), 34 deletions(-)
 create mode 100644 releasenotes/notes/update-csv-document-cleaner-7faed3788e9bfea4.yaml

diff --git a/haystack/components/preprocessors/csv_document_cleaner.py b/haystack/components/preprocessors/csv_document_cleaner.py
index 4dfb7e9c9..3ad54af9c 100644
--- a/haystack/components/preprocessors/csv_document_cleaner.py
+++ b/haystack/components/preprocessors/csv_document_cleaner.py
@@ -2,8 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from copy import deepcopy
 from io import StringIO
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from haystack import Document, component, logging
 from haystack.lazy_imports import LazyImport
@@ -21,21 +22,36 @@ class CSVDocumentCleaner:
 
     This component processes CSV content stored in Documents, allowing
     for the optional ignoring of a specified number of rows and columns before performing
-    the cleaning operation.
+    the cleaning operation. Additionally, it provides options to keep document IDs and
+    control whether empty rows and columns should be removed.
     """
 
-    def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
+    def __init__(
+        self,
+        *,  # every parameter below is keyword-only
+        ignore_rows: int = 0,
+        ignore_columns: int = 0,
+        remove_empty_rows: bool = True,
+        remove_empty_columns: bool = True,
+        keep_id: bool = False,
+    ) -> None:
         """
         Initializes the CSVDocumentCleaner component.
 
         :param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
         :param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
+        :param remove_empty_rows: Whether to remove rows that are entirely empty.
+        :param remove_empty_columns: Whether to remove columns that are entirely empty.
+        :param keep_id: If True, reuse the input document's ID; otherwise an empty-string ID is passed to `Document`.
 
         Rows and columns ignored using these parameters are preserved in the final output, meaning
         they are not considered when removing empty rows and columns.
         """
         self.ignore_rows = ignore_rows
         self.ignore_columns = ignore_columns
+        self.remove_empty_rows = remove_empty_rows
+        self.remove_empty_columns = remove_empty_columns
+        self.keep_id = keep_id
         pandas_import.check()
 
     @component.output_types(documents=List[Document])
@@ -44,14 +60,20 @@ class CSVDocumentCleaner:
         Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
 
         :param documents: List of Documents containing CSV-formatted content.
+        :return: A dictionary with a list of cleaned Documents under the key "documents".
 
         Processing steps:
         1. Reads each document's content as a CSV table.
         2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
-        3. Drops any rows and columns that are entirely empty (all NaN values).
+        3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
+            `remove_empty_columns`).
         4. Reattaches the ignored rows and columns to maintain their original positions.
-        5. Returns the cleaned CSV content as a new `Document` object.
+        5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
+            document ID.
         """
+        if len(documents) == 0:
+            return {"documents": []}
+
         ignore_rows = self.ignore_rows
         ignore_columns = self.ignore_columns
 
@@ -82,35 +104,75 @@ class CSVDocumentCleaner:
                 cleaned_documents.append(document)
                 continue
 
-            # Save ignored rows
-            ignored_rows = None
-            if ignore_rows > 0:
-                ignored_rows = df.iloc[:ignore_rows, :]
+            final_df = self._clean_df(df=df, ignore_rows=ignore_rows, ignore_columns=ignore_columns)
 
-            # Save ignored columns
-            ignored_columns = None
-            if ignore_columns > 0:
-                ignored_columns = df.iloc[:, :ignore_columns]
-
-            # Drop rows and columns that are entirely empty
-            remaining_df = df.iloc[ignore_rows:, ignore_columns:]
-            final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")
-
-            # Reattach ignored rows
-            if ignore_rows > 0 and ignored_rows is not None:
-                # Keep only relevant columns
-                ignored_rows = ignored_rows.loc[:, final_df.columns]
-                final_df = pd.concat([ignored_rows, final_df], axis=0)
-
-            # Reattach ignored columns
-            if ignore_columns > 0 and ignored_columns is not None:
-                # Keep only relevant rows
-                ignored_columns = ignored_columns.loc[final_df.index, :]
-                final_df = pd.concat([ignored_columns, final_df], axis=1)
-
-            cleaned_documents.append(
-                Document(
-                    content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
-                )
+            clean_doc = Document(
+                id=document.id if self.keep_id else "",
+                content=final_df.to_csv(index=False, header=False, lineterminator="\n"),
+                blob=document.blob,
+                meta=deepcopy(document.meta),
+                score=document.score,
+                embedding=document.embedding,
+                sparse_embedding=document.sparse_embedding,
             )
+            cleaned_documents.append(clean_doc)
         return {"documents": cleaned_documents}
+
+    def _clean_df(self, df: "pd.DataFrame", ignore_rows: int, ignore_columns: int) -> "pd.DataFrame":
+        """
+        Drops empty rows/columns outside the ignored sections (if enabled) and returns the reassembled DataFrame.
+
+        :param df: The input DataFrame representing the CSV data.
+        :param ignore_rows: Number of top rows set aside before the emptiness checks and reattached afterwards.
+        :param ignore_columns: Number of left columns set aside before the emptiness checks and reattached afterwards.
+        """
+        # Set aside the ignored sections (None when the count is not positive) and slice out the rest
+        ignored_rows = self._get_ignored_rows(df=df, ignore_rows=ignore_rows)
+        ignored_columns = self._get_ignored_columns(df=df, ignore_columns=ignore_columns)
+        final_df = df.iloc[ignore_rows:, ignore_columns:]
+
+        # Drop rows in which every remaining cell is missing
+        if self.remove_empty_rows:
+            final_df = final_df.dropna(axis=0, how="all")
+
+        # Drop columns in which every remaining cell is missing
+        if self.remove_empty_columns:
+            final_df = final_df.dropna(axis=1, how="all")
+
+        # Reattach ignored rows on top
+        if ignore_rows > 0 and ignored_rows is not None:
+            # Restrict the ignored rows to the columns that are still present in final_df
+            ignored_rows = ignored_rows.loc[:, final_df.columns]
+            final_df = pd.concat([ignored_rows, final_df], axis=0)
+
+        # Reattach ignored columns on the left
+        if ignore_columns > 0 and ignored_columns is not None:
+            # Select by final_df.index, which by now also contains any reattached ignored rows
+            ignored_columns = ignored_columns.loc[final_df.index, :]
+            final_df = pd.concat([ignored_columns, final_df], axis=1)
+
+        return final_df
+
+    @staticmethod
+    def _get_ignored_rows(df: "pd.DataFrame", ignore_rows: int) -> Optional["pd.DataFrame"]:
+        """
+        Returns the first `ignore_rows` rows of the DataFrame, or None when `ignore_rows` is not positive.
+
+        :param df: The input DataFrame.
+        :param ignore_rows: Number of rows to extract from the top.
+        """
+        if ignore_rows > 0:
+            return df.iloc[:ignore_rows, :]
+        return None
+
+    @staticmethod
+    def _get_ignored_columns(df: "pd.DataFrame", ignore_columns: int) -> Optional["pd.DataFrame"]:
+        """
+        Returns the first `ignore_columns` columns of the DataFrame, or None when `ignore_columns` is not positive.
+
+        :param df: The input DataFrame.
+        :param ignore_columns: Number of columns to extract from the left.
+        """
+        if ignore_columns > 0:
+            return df.iloc[:, :ignore_columns]
+        return None
diff --git a/releasenotes/notes/update-csv-document-cleaner-7faed3788e9bfea4.yaml b/releasenotes/notes/update-csv-document-cleaner-7faed3788e9bfea4.yaml
new file mode 100644
index 000000000..cb94c057b
--- /dev/null
+++ b/releasenotes/notes/update-csv-document-cleaner-7faed3788e9bfea4.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    For the CSVDocumentCleaner, added `remove_empty_rows` and `remove_empty_columns` to control whether empty rows and columns are removed.
+    Also added `keep_id` to optionally keep the original document ID. The init parameters are now keyword-only.
diff --git a/test/components/preprocessors/test_csv_document_cleaner.py b/test/components/preprocessors/test_csv_document_cleaner.py
index ca53f1c17..e51306adf 100644
--- a/test/components/preprocessors/test_csv_document_cleaner.py
+++ b/test/components/preprocessors/test_csv_document_cleaner.py
@@ -144,3 +144,78 @@ def test_zero_ignore_rows_and_columns() -> None:
     result = csv_document_cleaner.run([csv_document])
     cleaned_document = result["documents"][0]
     assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"
+
+
+def test_empty_document() -> None:
+    csv_document = Document(content="")  # document whose CSV content is the empty string
+    csv_document_cleaner = CSVDocumentCleaner()
+    result = csv_document_cleaner.run([csv_document])
+    cleaned_document = result["documents"][0]
+    assert cleaned_document.content == ""  # empty content must stay empty
+    assert cleaned_document.meta == {}
+
+
+def test_empty_documents() -> None:
+    csv_document_cleaner = CSVDocumentCleaner()
+    result = csv_document_cleaner.run([])  # empty input list
+    assert result["documents"] == []  # an empty input list yields an empty output list
+
+
+def test_keep_id() -> None:
+    csv_content = """,A,B,C
+1,item,s,
+"""
+    csv_document = Document(id="123", content=csv_content)
+    csv_document_cleaner = CSVDocumentCleaner(keep_id=True)
+    result = csv_document_cleaner.run([csv_document])
+    cleaned_document = result["documents"][0]
+    assert cleaned_document.id == "123"  # ID is carried over from the input document
+    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"
+
+
+def test_id_not_none() -> None:
+    csv_content = """,A,B,C
+1,item,s,
+"""
+    csv_document = Document(content=csv_content)
+    csv_document_cleaner = CSVDocumentCleaner()
+    result = csv_document_cleaner.run([csv_document])
+    cleaned_document = result["documents"][0]
+    assert cleaned_document.id != ""  # with keep_id left at its default, the output still has a non-empty ID
+    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"
+
+
+def test_remove_empty_rows_false() -> None:
+    csv_content = """,B,C
+,,
+,5,6
+"""
+    csv_document = Document(content=csv_content)
+    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False)
+    result = csv_document_cleaner.run([csv_document])
+    cleaned_document = result["documents"][0]
+    assert cleaned_document.content == "B,C\n,\n5,6\n"  # empty first column is dropped; the all-empty row is kept
+
+
+def test_remove_empty_columns_false() -> None:
+    csv_content = """,B,C
+,,
+,,4
+"""
+    csv_document = Document(content=csv_content)
+    csv_document_cleaner = CSVDocumentCleaner(remove_empty_columns=False)
+    result = csv_document_cleaner.run([csv_document])
+    cleaned_document = result["documents"][0]
+    assert cleaned_document.content == ",B,C\n,,4\n"  # all-empty row is dropped; the empty first column is kept
+
+
+def test_remove_empty_rows_and_columns_false() -> None:
+    csv_content = """,B,C
+,,4
+,,
+"""
+    csv_document = Document(content=csv_content)
+    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
+    result = csv_document_cleaner.run([csv_document])
+    cleaned_document = result["documents"][0]
+    assert cleaned_document.content == ",B,C\n,,4\n,,\n"  # nothing is removed, so the content is unchanged