mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-05 16:10:31 +00:00
feat: Update csv cleaner (#8828)

* More refactoring
* Add more new options and more tests
* Improve docstrings
* Add release notes
* Fix pylint
This commit is contained in:
parent
1785ea622e
commit
35788a2d06
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from io import StringIO
from typing import Dict, List, Optional

from haystack import Document, component, logging
from haystack.lazy_imports import LazyImport
@ -21,21 +22,36 @@ class CSVDocumentCleaner:
|
|||||||
|
|
||||||
This component processes CSV content stored in Documents, allowing
|
This component processes CSV content stored in Documents, allowing
|
||||||
for the optional ignoring of a specified number of rows and columns before performing
|
for the optional ignoring of a specified number of rows and columns before performing
|
||||||
the cleaning operation.
|
the cleaning operation. Additionally, it provides options to keep document IDs and
|
||||||
|
control whether empty rows and columns should be removed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
ignore_rows: int = 0,
|
||||||
|
ignore_columns: int = 0,
|
||||||
|
remove_empty_rows: bool = True,
|
||||||
|
remove_empty_columns: bool = True,
|
||||||
|
keep_id: bool = False,
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Initializes the CSVDocumentCleaner component.
|
Initializes the CSVDocumentCleaner component.
|
||||||
|
|
||||||
:param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
|
:param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
|
||||||
:param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
|
:param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
|
||||||
|
:param remove_empty_rows: Whether to remove rows that are entirely empty.
|
||||||
|
:param remove_empty_columns: Whether to remove columns that are entirely empty.
|
||||||
|
:param keep_id: Whether to retain the original document ID in the output document.
|
||||||
|
|
||||||
Rows and columns ignored using these parameters are preserved in the final output, meaning
|
Rows and columns ignored using these parameters are preserved in the final output, meaning
|
||||||
they are not considered when removing empty rows and columns.
|
they are not considered when removing empty rows and columns.
|
||||||
"""
|
"""
|
||||||
self.ignore_rows = ignore_rows
|
self.ignore_rows = ignore_rows
|
||||||
self.ignore_columns = ignore_columns
|
self.ignore_columns = ignore_columns
|
||||||
|
self.remove_empty_rows = remove_empty_rows
|
||||||
|
self.remove_empty_columns = remove_empty_columns
|
||||||
|
self.keep_id = keep_id
|
||||||
pandas_import.check()
|
pandas_import.check()
|
||||||
|
|
||||||
@component.output_types(documents=List[Document])
|
@component.output_types(documents=List[Document])
|
||||||
@ -44,14 +60,20 @@ class CSVDocumentCleaner:
|
|||||||
Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
|
Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
|
||||||
|
|
||||||
:param documents: List of Documents containing CSV-formatted content.
|
:param documents: List of Documents containing CSV-formatted content.
|
||||||
|
:return: A dictionary with a list of cleaned Documents under the key "documents".
|
||||||
|
|
||||||
Processing steps:
|
Processing steps:
|
||||||
1. Reads each document's content as a CSV table.
|
1. Reads each document's content as a CSV table.
|
||||||
2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
|
2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
|
||||||
3. Drops any rows and columns that are entirely empty (all NaN values).
|
3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
|
||||||
|
`remove_empty_columns`).
|
||||||
4. Reattaches the ignored rows and columns to maintain their original positions.
|
4. Reattaches the ignored rows and columns to maintain their original positions.
|
||||||
5. Returns the cleaned CSV content as a new `Document` object.
|
5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
|
||||||
|
document ID.
|
||||||
"""
|
"""
|
||||||
|
if len(documents) == 0:
|
||||||
|
return {"documents": []}
|
||||||
|
|
||||||
ignore_rows = self.ignore_rows
|
ignore_rows = self.ignore_rows
|
||||||
ignore_columns = self.ignore_columns
|
ignore_columns = self.ignore_columns
|
||||||
|
|
||||||
@ -82,35 +104,75 @@ class CSVDocumentCleaner:
|
|||||||
cleaned_documents.append(document)
|
cleaned_documents.append(document)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Save ignored rows
|
final_df = self._clean_df(df=df, ignore_rows=ignore_rows, ignore_columns=ignore_columns)
|
||||||
ignored_rows = None
|
|
||||||
if ignore_rows > 0:
|
|
||||||
ignored_rows = df.iloc[:ignore_rows, :]
|
|
||||||
|
|
||||||
# Save ignored columns
|
clean_doc = Document(
|
||||||
ignored_columns = None
|
id=document.id if self.keep_id else "",
|
||||||
if ignore_columns > 0:
|
content=final_df.to_csv(index=False, header=False, lineterminator="\n"),
|
||||||
ignored_columns = df.iloc[:, :ignore_columns]
|
blob=document.blob,
|
||||||
|
meta=deepcopy(document.meta),
|
||||||
# Drop rows and columns that are entirely empty
|
score=document.score,
|
||||||
remaining_df = df.iloc[ignore_rows:, ignore_columns:]
|
embedding=document.embedding,
|
||||||
final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")
|
sparse_embedding=document.sparse_embedding,
|
||||||
|
|
||||||
# Reattach ignored rows
|
|
||||||
if ignore_rows > 0 and ignored_rows is not None:
|
|
||||||
# Keep only relevant columns
|
|
||||||
ignored_rows = ignored_rows.loc[:, final_df.columns]
|
|
||||||
final_df = pd.concat([ignored_rows, final_df], axis=0)
|
|
||||||
|
|
||||||
# Reattach ignored columns
|
|
||||||
if ignore_columns > 0 and ignored_columns is not None:
|
|
||||||
# Keep only relevant rows
|
|
||||||
ignored_columns = ignored_columns.loc[final_df.index, :]
|
|
||||||
final_df = pd.concat([ignored_columns, final_df], axis=1)
|
|
||||||
|
|
||||||
cleaned_documents.append(
|
|
||||||
Document(
|
|
||||||
content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
cleaned_documents.append(clean_doc)
|
||||||
return {"documents": cleaned_documents}
|
return {"documents": cleaned_documents}
|
||||||
|
|
||||||
|
def _clean_df(self, df: "pd.DataFrame", ignore_rows: int, ignore_columns: int) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Cleans a DataFrame by removing empty rows and columns while preserving ignored sections.
|
||||||
|
|
||||||
|
:param df: The input DataFrame representing the CSV data.
|
||||||
|
:param ignore_rows: Number of top rows to ignore.
|
||||||
|
:param ignore_columns: Number of left columns to ignore.
|
||||||
|
"""
|
||||||
|
# Get ignored rows and columns
|
||||||
|
ignored_rows = self._get_ignored_rows(df=df, ignore_rows=ignore_rows)
|
||||||
|
ignored_columns = self._get_ignored_columns(df=df, ignore_columns=ignore_columns)
|
||||||
|
final_df = df.iloc[ignore_rows:, ignore_columns:]
|
||||||
|
|
||||||
|
# Drop rows that are entirely empty
|
||||||
|
if self.remove_empty_rows:
|
||||||
|
final_df = final_df.dropna(axis=0, how="all")
|
||||||
|
|
||||||
|
# Drop columns that are entirely empty
|
||||||
|
if self.remove_empty_columns:
|
||||||
|
final_df = final_df.dropna(axis=1, how="all")
|
||||||
|
|
||||||
|
# Reattach ignored rows
|
||||||
|
if ignore_rows > 0 and ignored_rows is not None:
|
||||||
|
# Keep only relevant columns
|
||||||
|
ignored_rows = ignored_rows.loc[:, final_df.columns]
|
||||||
|
final_df = pd.concat([ignored_rows, final_df], axis=0)
|
||||||
|
|
||||||
|
# Reattach ignored columns
|
||||||
|
if ignore_columns > 0 and ignored_columns is not None:
|
||||||
|
# Keep only relevant rows
|
||||||
|
ignored_columns = ignored_columns.loc[final_df.index, :]
|
||||||
|
final_df = pd.concat([ignored_columns, final_df], axis=1)
|
||||||
|
|
||||||
|
return final_df
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_ignored_rows(df: "pd.DataFrame", ignore_rows: int) -> Optional["pd.DataFrame"]:
|
||||||
|
"""
|
||||||
|
Extracts the rows to be ignored from the DataFrame.
|
||||||
|
|
||||||
|
:param df: The input DataFrame.
|
||||||
|
:param ignore_rows: Number of rows to extract from the top.
|
||||||
|
"""
|
||||||
|
if ignore_rows > 0:
|
||||||
|
return df.iloc[:ignore_rows, :]
|
||||||
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_ignored_columns(df: "pd.DataFrame", ignore_columns: int) -> Optional["pd.DataFrame"]:
|
||||||
|
"""
|
||||||
|
Extracts the columns to be ignored from the DataFrame.
|
||||||
|
|
||||||
|
:param df: The input DataFrame.
|
||||||
|
:param ignore_columns: Number of columns to extract from the left.
|
||||||
|
"""
|
||||||
|
if ignore_columns > 0:
|
||||||
|
return df.iloc[:, :ignore_columns]
|
||||||
|
return None
|
||||||
|
---
enhancements:
  - |
    For the CSVDocumentCleaner, added `remove_empty_rows` & `remove_empty_columns` to optionally remove rows and columns.
    Also added `keep_id` to optionally allow for keeping the original document ID.
@ -144,3 +144,78 @@ def test_zero_ignore_rows_and_columns() -> None:
|
|||||||
result = csv_document_cleaner.run([csv_document])
|
result = csv_document_cleaner.run([csv_document])
|
||||||
cleaned_document = result["documents"][0]
|
cleaned_document = result["documents"][0]
|
||||||
assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"
|
assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"
|
||||||
def test_empty_document() -> None:
    """An empty document passes through unchanged with empty meta."""
    csv_document = Document(content="")
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ""
    assert cleaned_document.meta == {}


def test_empty_documents() -> None:
    """An empty input list yields an empty output list."""
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([])
    assert result["documents"] == []


def test_keep_id() -> None:
    """With keep_id=True the original document ID is retained."""
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(id="123", content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(keep_id=True)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.id == "123"
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"


def test_id_not_none() -> None:
    """By default (keep_id=False) the output document gets a fresh, non-empty ID."""
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.id != ""
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"


def test_remove_empty_rows_false() -> None:
    """Empty rows are kept when remove_empty_rows=False; empty columns are still dropped."""
    csv_content = """,B,C
,,
,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "B,C\n,\n5,6\n"


def test_remove_empty_columns_false() -> None:
    """Empty columns are kept when remove_empty_columns=False; empty rows are still dropped."""
    csv_content = """,B,C
,,
,,4
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_columns=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",B,C\n,,4\n"


def test_remove_empty_rows_and_columns_false() -> None:
    """With both flags disabled the table content is preserved verbatim."""
    csv_content = """,B,C
,,4
,,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",B,C\n,,4\n,,\n"
|
Loading…
x
Reference in New Issue
Block a user