mirror of https://github.com/deepset-ai/haystack.git (synced 2025-07-05 16:10:31 +00:00)

feat: Update csv cleaner (#8828)

* More refactoring
* Add more new options and more tests
* Improve docstrings
* Add release notes
* Fix pylint

This commit is contained in:
parent 1785ea622e
commit 35788a2d06
@@ -2,8 +2,9 @@
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from io import StringIO
from typing import Dict, List
from typing import Dict, List, Optional

from haystack import Document, component, logging
from haystack.lazy_imports import LazyImport
@@ -21,21 +22,36 @@ class CSVDocumentCleaner:

    This component processes CSV content stored in Documents, allowing
    for the optional ignoring of a specified number of rows and columns before performing
    the cleaning operation.
    the cleaning operation. Additionally, it provides options to keep document IDs and
    control whether empty rows and columns should be removed.
    """

    def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
    def __init__(
        self,
        *,
        ignore_rows: int = 0,
        ignore_columns: int = 0,
        remove_empty_rows: bool = True,
        remove_empty_columns: bool = True,
        keep_id: bool = False,
    ) -> None:
        """
        Initializes the CSVDocumentCleaner component.

        :param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
        :param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
        :param remove_empty_rows: Whether to remove rows that are entirely empty.
        :param remove_empty_columns: Whether to remove columns that are entirely empty.
        :param keep_id: Whether to retain the original document ID in the output document.

        Rows and columns ignored using these parameters are preserved in the final output, meaning
        they are not considered when removing empty rows and columns.
        """
        self.ignore_rows = ignore_rows
        self.ignore_columns = ignore_columns
        self.remove_empty_rows = remove_empty_rows
        self.remove_empty_columns = remove_empty_columns
        self.keep_id = keep_id
        pandas_import.check()

    @component.output_types(documents=List[Document])
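
A minimal usage sketch of the new constructor options may help here. The import path for CSVDocumentCleaner is an assumption on my part; the keyword-only arguments and the keep_id behaviour mirror the signature above and the test_keep_id test further down.

from haystack import Document
from haystack.components.preprocessors import CSVDocumentCleaner  # assumed import path

# All options are keyword-only, matching the new signature.
cleaner = CSVDocumentCleaner(
    ignore_rows=0,
    ignore_columns=0,
    remove_empty_rows=True,
    remove_empty_columns=True,
    keep_id=True,  # keep the incoming Document's id; by default a fresh id is generated
)

doc = Document(id="123", content=",A,B,C\n1,item,s,\n")
result = cleaner.run([doc])
print(result["documents"][0].id)       # "123", because keep_id=True
print(result["documents"][0].content)  # cleaned CSV text
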
@@ -44,14 +60,20 @@ class CSVDocumentCleaner:
        Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.

        :param documents: List of Documents containing CSV-formatted content.
        :return: A dictionary with a list of cleaned Documents under the key "documents".

        Processing steps:
        1. Reads each document's content as a CSV table.
        2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
        3. Drops any rows and columns that are entirely empty (all NaN values).
        3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
           `remove_empty_columns`).
        4. Reattaches the ignored rows and columns to maintain their original positions.
        5. Returns the cleaned CSV content as a new `Document` object.
        5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
           document ID.
        """
        if len(documents) == 0:
            return {"documents": []}

        ignore_rows = self.ignore_rows
        ignore_columns = self.ignore_columns

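To make these steps concrete, here is a small worked example with ignore_rows=1; the expected output string is my own reading of steps 2-4 (empty row and empty column dropped, ignored header row reattached with only the surviving columns), so treat it as illustrative rather than authoritative:

from haystack import Document
from haystack.components.preprocessors import CSVDocumentCleaner  # assumed import path

csv_content = "header1,header2,header3\na,,c\n,,\n"

# Keep the first row aside, clean the rest, then reattach it.
cleaner = CSVDocumentCleaner(ignore_rows=1)
result = cleaner.run([Document(content=csv_content)])

# Expected content: "header1,header3\na,c\n"
# (the all-empty last row and the all-empty middle column are removed;
#  the ignored header row keeps only the columns that survived cleaning)
print(result["documents"][0].content)
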
@@ -82,35 +104,75 @@ class CSVDocumentCleaner:
                cleaned_documents.append(document)
                continue

            # Save ignored rows
            ignored_rows = None
            if ignore_rows > 0:
                ignored_rows = df.iloc[:ignore_rows, :]
            final_df = self._clean_df(df=df, ignore_rows=ignore_rows, ignore_columns=ignore_columns)

            # Save ignored columns
            ignored_columns = None
            if ignore_columns > 0:
                ignored_columns = df.iloc[:, :ignore_columns]

            # Drop rows and columns that are entirely empty
            remaining_df = df.iloc[ignore_rows:, ignore_columns:]
            final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")

            # Reattach ignored rows
            if ignore_rows > 0 and ignored_rows is not None:
                # Keep only relevant columns
                ignored_rows = ignored_rows.loc[:, final_df.columns]
                final_df = pd.concat([ignored_rows, final_df], axis=0)

            # Reattach ignored columns
            if ignore_columns > 0 and ignored_columns is not None:
                # Keep only relevant rows
                ignored_columns = ignored_columns.loc[final_df.index, :]
                final_df = pd.concat([ignored_columns, final_df], axis=1)

            cleaned_documents.append(
                Document(
                    content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
                )
            clean_doc = Document(
                id=document.id if self.keep_id else "",
                content=final_df.to_csv(index=False, header=False, lineterminator="\n"),
                blob=document.blob,
                meta=deepcopy(document.meta),
                score=document.score,
                embedding=document.embedding,
                sparse_embedding=document.sparse_embedding,
            )
            cleaned_documents.append(clean_doc)
        return {"documents": cleaned_documents}

    def _clean_df(self, df: "pd.DataFrame", ignore_rows: int, ignore_columns: int) -> "pd.DataFrame":
        """
        Cleans a DataFrame by removing empty rows and columns while preserving ignored sections.

        :param df: The input DataFrame representing the CSV data.
        :param ignore_rows: Number of top rows to ignore.
        :param ignore_columns: Number of left columns to ignore.
        """
        # Get ignored rows and columns
        ignored_rows = self._get_ignored_rows(df=df, ignore_rows=ignore_rows)
        ignored_columns = self._get_ignored_columns(df=df, ignore_columns=ignore_columns)
        final_df = df.iloc[ignore_rows:, ignore_columns:]

        # Drop rows that are entirely empty
        if self.remove_empty_rows:
            final_df = final_df.dropna(axis=0, how="all")

        # Drop columns that are entirely empty
        if self.remove_empty_columns:
            final_df = final_df.dropna(axis=1, how="all")

        # Reattach ignored rows
        if ignore_rows > 0 and ignored_rows is not None:
            # Keep only relevant columns
            ignored_rows = ignored_rows.loc[:, final_df.columns]
            final_df = pd.concat([ignored_rows, final_df], axis=0)

        # Reattach ignored columns
        if ignore_columns > 0 and ignored_columns is not None:
            # Keep only relevant rows
            ignored_columns = ignored_columns.loc[final_df.index, :]
            final_df = pd.concat([ignored_columns, final_df], axis=1)

        return final_df

    @staticmethod
    def _get_ignored_rows(df: "pd.DataFrame", ignore_rows: int) -> Optional["pd.DataFrame"]:
        """
        Extracts the rows to be ignored from the DataFrame.

        :param df: The input DataFrame.
        :param ignore_rows: Number of rows to extract from the top.
        """
        if ignore_rows > 0:
            return df.iloc[:ignore_rows, :]
        return None

    @staticmethod
    def _get_ignored_columns(df: "pd.DataFrame", ignore_columns: int) -> Optional["pd.DataFrame"]:
        """
        Extracts the columns to be ignored from the DataFrame.

        :param df: The input DataFrame.
        :param ignore_columns: Number of columns to extract from the left.
        """
        if ignore_columns > 0:
            return df.iloc[:, :ignore_columns]
        return None
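
The helper methods above reduce to a small pandas pattern: slice off the ignored block, drop fully empty rows and columns from the remainder, then concatenate the ignored block back, trimmed to the surviving labels. A standalone sketch of that pattern in plain pandas (illustrative only, not the component's code):

from io import StringIO

import pandas as pd

csv_text = "id,name,notes\n1,item,\n,,\n"
df = pd.read_csv(StringIO(csv_text), header=None, dtype=object)

ignore_rows, ignore_columns = 1, 0

# Set aside the ignored block, then clean the remainder.
ignored = df.iloc[:ignore_rows, :]
rest = df.iloc[ignore_rows:, ignore_columns:]
rest = rest.dropna(axis=0, how="all").dropna(axis=1, how="all")

# Reattach the ignored rows, keeping only the columns that survived cleaning.
cleaned = pd.concat([ignored.loc[:, rest.columns], rest], axis=0)
print(cleaned.to_csv(index=False, header=False, lineterminator="\n"))  # -> "id,name\n1,item\n"
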
@@ -0,0 +1,5 @@
---
enhancements:
  - |
    For the CSVDocumentCleaner, added `remove_empty_rows` & `remove_empty_columns` to optionally remove rows and columns.
    Also added `keep_id` to optionally allow for keeping the original document ID.
@@ -144,3 +144,78 @@ def test_zero_ignore_rows_and_columns() -> None:
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"


def test_empty_document() -> None:
    csv_document = Document(content="")
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ""
    assert cleaned_document.meta == {}


def test_empty_documents() -> None:
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([])
    assert result["documents"] == []


def test_keep_id() -> None:
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(id="123", content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(keep_id=True)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.id == "123"
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"


def test_id_not_none() -> None:
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.id != ""
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"


def test_remove_empty_rows_false() -> None:
    csv_content = """,B,C
,,
,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "B,C\n,\n5,6\n"


def test_remove_empty_columns_false() -> None:
    csv_content = """,B,C
,,
,,4
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_columns=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",B,C\n,,4\n"


def test_remove_empty_rows_and_columns_false() -> None:
    csv_content = """,B,C
,,4
,,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",B,C\n,,4\n,,\n"