mirror of https://github.com/deepset-ai/haystack.git (synced 2025-07-05 16:10:31 +00:00)

feat: Update csv cleaner (#8828)

* More refactoring
* Add more new options and more tests
* Improve docstrings
* Add release notes
* Fix pylint

This commit is contained in:
parent 1785ea622e
commit 35788a2d06
@@ -2,8 +2,9 @@
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from io import StringIO
from typing import Dict, List
from typing import Dict, List, Optional

from haystack import Document, component, logging
from haystack.lazy_imports import LazyImport
@@ -21,21 +22,36 @@ class CSVDocumentCleaner:

    This component processes CSV content stored in Documents, allowing
    for the optional ignoring of a specified number of rows and columns before performing
    the cleaning operation.
    the cleaning operation. Additionally, it provides options to keep document IDs and
    control whether empty rows and columns should be removed.
    """

    def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
    def __init__(
        self,
        *,
        ignore_rows: int = 0,
        ignore_columns: int = 0,
        remove_empty_rows: bool = True,
        remove_empty_columns: bool = True,
        keep_id: bool = False,
    ) -> None:
        """
        Initializes the CSVDocumentCleaner component.

        :param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
        :param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
        :param remove_empty_rows: Whether to remove rows that are entirely empty.
        :param remove_empty_columns: Whether to remove columns that are entirely empty.
        :param keep_id: Whether to retain the original document ID in the output document.

        Rows and columns ignored using these parameters are preserved in the final output, meaning
        they are not considered when removing empty rows and columns.
        """
        self.ignore_rows = ignore_rows
        self.ignore_columns = ignore_columns
        self.remove_empty_rows = remove_empty_rows
        self.remove_empty_columns = remove_empty_columns
        self.keep_id = keep_id
        pandas_import.check()

    @component.output_types(documents=List[Document])
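
A minimal usage sketch of the new constructor options may help here. The import path for CSVDocumentCleaner is an assumption on my part; the keyword-only arguments and the keep_id behaviour mirror the signature above and the test_keep_id test further down.

from haystack import Document
from haystack.components.preprocessors import CSVDocumentCleaner  # assumed import path

# All options are keyword-only, matching the new signature.
cleaner = CSVDocumentCleaner(
    ignore_rows=0,
    ignore_columns=0,
    remove_empty_rows=True,
    remove_empty_columns=True,
    keep_id=True,  # keep the incoming Document's id; by default a fresh id is generated
)

doc = Document(id="123", content=",A,B,C\n1,item,s,\n")
result = cleaner.run([doc])
print(result["documents"][0].id)       # "123", because keep_id=True
print(result["documents"][0].content)  # cleaned CSV text
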
@@ -44,14 +60,20 @@ class CSVDocumentCleaner:
        Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.

        :param documents: List of Documents containing CSV-formatted content.
        :return: A dictionary with a list of cleaned Documents under the key "documents".

        Processing steps:
        1. Reads each document's content as a CSV table.
        2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
        3. Drops any rows and columns that are entirely empty (all NaN values).
        3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
           `remove_empty_columns`).
        4. Reattaches the ignored rows and columns to maintain their original positions.
        5. Returns the cleaned CSV content as a new `Document` object.
        5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
           document ID.
        """
        if len(documents) == 0:
            return {"documents": []}

        ignore_rows = self.ignore_rows
        ignore_columns = self.ignore_columns

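To make these steps concrete, here is a small worked example with ignore_rows=1; the expected output string is my own reading of steps 2-4 (empty row and empty column dropped, ignored header row reattached with only the surviving columns), so treat it as illustrative rather than authoritative:

from haystack import Document
from haystack.components.preprocessors import CSVDocumentCleaner  # assumed import path

csv_content = "header1,header2,header3\na,,c\n,,\n"

# Keep the first row aside, clean the rest, then reattach it.
cleaner = CSVDocumentCleaner(ignore_rows=1)
result = cleaner.run([Document(content=csv_content)])

# Expected content: "header1,header3\na,c\n"
# (the all-empty last row and the all-empty middle column are removed;
#  the ignored header row keeps only the columns that survived cleaning)
print(result["documents"][0].content)
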
@@ -82,35 +104,75 @@ class CSVDocumentCleaner:
                cleaned_documents.append(document)
                continue

            # Save ignored rows
            ignored_rows = None
            if ignore_rows > 0:
                ignored_rows = df.iloc[:ignore_rows, :]
            final_df = self._clean_df(df=df, ignore_rows=ignore_rows, ignore_columns=ignore_columns)

            # Save ignored columns
            ignored_columns = None
            if ignore_columns > 0:
                ignored_columns = df.iloc[:, :ignore_columns]

            # Drop rows and columns that are entirely empty
            remaining_df = df.iloc[ignore_rows:, ignore_columns:]
            final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")

            # Reattach ignored rows
            if ignore_rows > 0 and ignored_rows is not None:
                # Keep only relevant columns
                ignored_rows = ignored_rows.loc[:, final_df.columns]
                final_df = pd.concat([ignored_rows, final_df], axis=0)

            # Reattach ignored columns
            if ignore_columns > 0 and ignored_columns is not None:
                # Keep only relevant rows
                ignored_columns = ignored_columns.loc[final_df.index, :]
                final_df = pd.concat([ignored_columns, final_df], axis=1)

            cleaned_documents.append(
                Document(
                    content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
                )
            clean_doc = Document(
                id=document.id if self.keep_id else "",
                content=final_df.to_csv(index=False, header=False, lineterminator="\n"),
                blob=document.blob,
                meta=deepcopy(document.meta),
                score=document.score,
                embedding=document.embedding,
                sparse_embedding=document.sparse_embedding,
            )
            cleaned_documents.append(clean_doc)
        return {"documents": cleaned_documents}

    def _clean_df(self, df: "pd.DataFrame", ignore_rows: int, ignore_columns: int) -> "pd.DataFrame":
        """
        Cleans a DataFrame by removing empty rows and columns while preserving ignored sections.

        :param df: The input DataFrame representing the CSV data.
        :param ignore_rows: Number of top rows to ignore.
        :param ignore_columns: Number of left columns to ignore.
        """
        # Get ignored rows and columns
        ignored_rows = self._get_ignored_rows(df=df, ignore_rows=ignore_rows)
        ignored_columns = self._get_ignored_columns(df=df, ignore_columns=ignore_columns)
        final_df = df.iloc[ignore_rows:, ignore_columns:]

        # Drop rows that are entirely empty
        if self.remove_empty_rows:
            final_df = final_df.dropna(axis=0, how="all")

        # Drop columns that are entirely empty
        if self.remove_empty_columns:
            final_df = final_df.dropna(axis=1, how="all")

        # Reattach ignored rows
        if ignore_rows > 0 and ignored_rows is not None:
            # Keep only relevant columns
            ignored_rows = ignored_rows.loc[:, final_df.columns]
            final_df = pd.concat([ignored_rows, final_df], axis=0)

        # Reattach ignored columns
        if ignore_columns > 0 and ignored_columns is not None:
            # Keep only relevant rows
            ignored_columns = ignored_columns.loc[final_df.index, :]
            final_df = pd.concat([ignored_columns, final_df], axis=1)

        return final_df

    @staticmethod
    def _get_ignored_rows(df: "pd.DataFrame", ignore_rows: int) -> Optional["pd.DataFrame"]:
        """
        Extracts the rows to be ignored from the DataFrame.

        :param df: The input DataFrame.
        :param ignore_rows: Number of rows to extract from the top.
        """
        if ignore_rows > 0:
            return df.iloc[:ignore_rows, :]
        return None

    @staticmethod
    def _get_ignored_columns(df: "pd.DataFrame", ignore_columns: int) -> Optional["pd.DataFrame"]:
        """
        Extracts the columns to be ignored from the DataFrame.

        :param df: The input DataFrame.
        :param ignore_columns: Number of columns to extract from the left.
        """
        if ignore_columns > 0:
            return df.iloc[:, :ignore_columns]
        return None
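
The helper methods above reduce to a small pandas pattern: slice off the ignored block, drop fully empty rows and columns from the remainder, then concatenate the ignored block back, trimmed to the surviving labels. A standalone sketch of that pattern in plain pandas (illustrative only, not the component's code):

from io import StringIO

import pandas as pd

csv_text = "id,name,notes\n1,item,\n,,\n"
df = pd.read_csv(StringIO(csv_text), header=None, dtype=object)

ignore_rows, ignore_columns = 1, 0

# Set aside the ignored block, then clean the remainder.
ignored = df.iloc[:ignore_rows, :]
rest = df.iloc[ignore_rows:, ignore_columns:]
rest = rest.dropna(axis=0, how="all").dropna(axis=1, how="all")

# Reattach the ignored rows, keeping only the columns that survived cleaning.
cleaned = pd.concat([ignored.loc[:, rest.columns], rest], axis=0)
print(cleaned.to_csv(index=False, header=False, lineterminator="\n"))  # -> "id,name\n1,item\n"
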
@@ -0,0 +1,5 @@
---
enhancements:
  - |
    For the CSVDocumentCleaner, added `remove_empty_rows` & `remove_empty_columns` to optionally remove rows and columns.
    Also added `keep_id` to optionally allow for keeping the original document ID.
@@ -144,3 +144,78 @@ def test_zero_ignore_rows_and_columns() -> None:
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"


def test_empty_document() -> None:
    csv_document = Document(content="")
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ""
    assert cleaned_document.meta == {}


def test_empty_documents() -> None:
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([])
    assert result["documents"] == []


def test_keep_id() -> None:
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(id="123", content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(keep_id=True)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.id == "123"
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"


def test_id_not_none() -> None:
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.id != ""
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"


def test_remove_empty_rows_false() -> None:
    csv_content = """,B,C
,,
,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "B,C\n,\n5,6\n"


def test_remove_empty_columns_false() -> None:
    csv_content = """,B,C
,,
,,4
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_columns=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",B,C\n,,4\n"


def test_remove_empty_rows_and_columns_false() -> None:
    csv_content = """,B,C
,,4
,,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",B,C\n,,4\n,,\n"