mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 12:37:27 +00:00
Remove substrings basic implementation (#2152)
* Remove substrings basic implementation
* Update Documentation & Code Style
* Remove substrings basic tests
* Simplify test
This commit is contained in:
parent
6c0094b5ad
commit
ecec9b4e2c
@ -37,7 +37,7 @@ class PreProcessor(BasePreProcessor)
|
||||
#### process
|
||||
|
||||
```python
|
||||
def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
|
||||
def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
|
||||
```
|
||||
|
||||
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
|
||||
@ -47,7 +47,7 @@ Perform document cleaning and splitting. Can take a single document or a list of
|
||||
#### clean
|
||||
|
||||
```python
|
||||
def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict
|
||||
def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str]) -> dict
|
||||
```
|
||||
|
||||
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
|
||||
|
||||
@ -43,6 +43,7 @@ class PreProcessor(BasePreProcessor):
|
||||
clean_whitespace: bool = True,
|
||||
clean_header_footer: bool = False,
|
||||
clean_empty_lines: bool = True,
|
||||
remove_substrings: List[str] = [],
|
||||
split_by: str = "word",
|
||||
split_length: int = 200,
|
||||
split_overlap: int = 0,
|
||||
@ -56,6 +57,7 @@ class PreProcessor(BasePreProcessor):
|
||||
or similar.
|
||||
:param clean_whitespace: Strip whitespaces before or after each line in the text.
|
||||
:param clean_empty_lines: Remove more than two empty lines in the text.
|
||||
:param remove_substrings: Remove specified substrings from the text.
|
||||
:param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
|
||||
:param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
|
||||
"sentence", then each output document will have 10 sentences.
|
||||
@ -76,6 +78,7 @@ class PreProcessor(BasePreProcessor):
|
||||
clean_whitespace=clean_whitespace,
|
||||
clean_header_footer=clean_header_footer,
|
||||
clean_empty_lines=clean_empty_lines,
|
||||
remove_substrings=remove_substrings,
|
||||
split_by=split_by,
|
||||
split_length=split_length,
|
||||
split_overlap=split_overlap,
|
||||
@ -90,6 +93,7 @@ class PreProcessor(BasePreProcessor):
|
||||
self.clean_whitespace = clean_whitespace
|
||||
self.clean_header_footer = clean_header_footer
|
||||
self.clean_empty_lines = clean_empty_lines
|
||||
self.remove_substrings = remove_substrings
|
||||
self.split_by = split_by
|
||||
self.split_length = split_length
|
||||
self.split_overlap = split_overlap
|
||||
@ -103,6 +107,7 @@ class PreProcessor(BasePreProcessor):
|
||||
clean_whitespace: Optional[bool] = None,
|
||||
clean_header_footer: Optional[bool] = None,
|
||||
clean_empty_lines: Optional[bool] = None,
|
||||
remove_substrings: List[str] = [],
|
||||
split_by: Optional[str] = None,
|
||||
split_length: Optional[int] = None,
|
||||
split_overlap: Optional[int] = None,
|
||||
@ -117,6 +122,7 @@ class PreProcessor(BasePreProcessor):
|
||||
"clean_whitespace": clean_whitespace,
|
||||
"clean_header_footer": clean_header_footer,
|
||||
"clean_empty_lines": clean_empty_lines,
|
||||
"remove_substrings": remove_substrings,
|
||||
"split_by": split_by,
|
||||
"split_length": split_length,
|
||||
"split_overlap": split_overlap,
|
||||
@ -141,6 +147,7 @@ class PreProcessor(BasePreProcessor):
|
||||
clean_whitespace: Optional[bool] = None,
|
||||
clean_header_footer: Optional[bool] = None,
|
||||
clean_empty_lines: Optional[bool] = None,
|
||||
remove_substrings: List[str] = [],
|
||||
split_by: Optional[str] = None,
|
||||
split_length: Optional[int] = None,
|
||||
split_overlap: Optional[int] = None,
|
||||
@ -153,6 +160,8 @@ class PreProcessor(BasePreProcessor):
|
||||
clean_header_footer = self.clean_header_footer
|
||||
if clean_empty_lines is None:
|
||||
clean_empty_lines = self.clean_empty_lines
|
||||
if not remove_substrings:
|
||||
remove_substrings = self.remove_substrings
|
||||
if split_by is None:
|
||||
split_by = self.split_by
|
||||
if split_length is None:
|
||||
@ -167,6 +176,7 @@ class PreProcessor(BasePreProcessor):
|
||||
clean_whitespace=clean_whitespace,
|
||||
clean_header_footer=clean_header_footer,
|
||||
clean_empty_lines=clean_empty_lines,
|
||||
remove_substrings=remove_substrings,
|
||||
)
|
||||
split_documents = self.split(
|
||||
document=cleaned_document,
|
||||
@ -181,7 +191,14 @@ class PreProcessor(BasePreProcessor):
|
||||
nested_docs = [self._process_single(d, **kwargs) for d in tqdm(documents, unit="docs")]
|
||||
return [d for x in nested_docs for d in x]
|
||||
|
||||
def clean(self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict:
|
||||
def clean(
|
||||
self,
|
||||
document: dict,
|
||||
clean_whitespace: bool,
|
||||
clean_header_footer: bool,
|
||||
clean_empty_lines: bool,
|
||||
remove_substrings: List[str],
|
||||
) -> dict:
|
||||
"""
|
||||
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
|
||||
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
|
||||
@ -204,6 +221,9 @@ class PreProcessor(BasePreProcessor):
|
||||
if clean_empty_lines:
|
||||
text = re.sub(r"\n\n+", "\n\n", text)
|
||||
|
||||
for substring in remove_substrings:
|
||||
text = text.replace(substring, "")
|
||||
|
||||
document["content"] = text
|
||||
return document
|
||||
|
||||
|
||||
@ -1381,6 +1381,14 @@
|
||||
"default": true,
|
||||
"type": "boolean"
|
||||
},
|
||||
"remove_substrings": {
|
||||
"title": "Remove Substrings",
|
||||
"default": [],
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"split_by": {
|
||||
"title": "Split By",
|
||||
"default": "word",
|
||||
|
||||
@ -1432,6 +1432,14 @@
|
||||
"default": true,
|
||||
"type": "boolean"
|
||||
},
|
||||
"remove_substrings": {
|
||||
"title": "Remove Substrings",
|
||||
"default": [],
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"split_by": {
|
||||
"title": "Split By",
|
||||
"default": "word",
|
||||
|
||||
@ -88,3 +88,23 @@ def test_clean_header_footer():
|
||||
|
||||
assert "This is a header." not in documents[0]["content"]
|
||||
assert "footer" not in documents[0]["content"]
|
||||
|
||||
|
||||
def test_remove_substrings():
    """
    Check that `PreProcessor(remove_substrings=...)` strips exactly the listed
    substrings (plain text and emoji alike) and leaves all other content alone.
    """
    document = Document("This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")

    # check that the file contains the substrings we are about to remove
    assert "This is a header." in document["content"]
    assert "wiki" in document["content"]
    assert "🪲" in document["content"]
    assert "whitespace" in document["content"]
    assert "✨" in document["content"]

    preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"])
    documents = preprocessor.process(document)

    # Assert on the document returned by process(), as the other tests in this
    # file do, rather than on the input object: checking the input only passes
    # as a side effect of clean() mutating it in place.
    cleaned_content = documents[0]["content"]

    # the listed substrings are gone ...
    assert "This is a header." not in cleaned_content
    assert "wiki" not in cleaned_content
    assert "🪲" not in cleaned_content
    # ... while text that was not listed is untouched
    assert "whitespace" in cleaned_content
    assert "✨" in cleaned_content
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user