Remove substrings basic implementation (#2152)

* Remove substrings basic implementation

* Update Documentation & Code Style

* Remove substrings basic tests

* Simplify test
This commit is contained in:
Dmitry Goryunov 2022-03-08 15:49:56 +01:00 committed by GitHub
parent 6c0094b5ad
commit ecec9b4e2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 59 additions and 3 deletions

View File

@ -37,7 +37,7 @@ class PreProcessor(BasePreProcessor)
#### process
```python
def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
```
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
@ -47,7 +47,7 @@ Perform document cleaning and splitting. Can take a single document or a list of
#### clean
```python
def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict
def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str]) -> dict
```
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers

View File

@ -43,6 +43,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: bool = True,
clean_header_footer: bool = False,
clean_empty_lines: bool = True,
remove_substrings: List[str] = [],
split_by: str = "word",
split_length: int = 200,
split_overlap: int = 0,
@ -56,6 +57,7 @@ class PreProcessor(BasePreProcessor):
or similar.
:param clean_whitespace: Strip whitespaces before or after each line in the text.
:param clean_empty_lines: Remove more than two empty lines in the text.
:param remove_substrings: Remove specified substrings from the text.
:param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
:param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
"sentence", then each output document will have 10 sentences.
@ -76,6 +78,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace=clean_whitespace,
clean_header_footer=clean_header_footer,
clean_empty_lines=clean_empty_lines,
remove_substrings=remove_substrings,
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
@ -90,6 +93,7 @@ class PreProcessor(BasePreProcessor):
self.clean_whitespace = clean_whitespace
self.clean_header_footer = clean_header_footer
self.clean_empty_lines = clean_empty_lines
self.remove_substrings = remove_substrings
self.split_by = split_by
self.split_length = split_length
self.split_overlap = split_overlap
@ -103,6 +107,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
remove_substrings: List[str] = [],
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
@ -117,6 +122,7 @@ class PreProcessor(BasePreProcessor):
"clean_whitespace": clean_whitespace,
"clean_header_footer": clean_header_footer,
"clean_empty_lines": clean_empty_lines,
"remove_substrings": remove_substrings,
"split_by": split_by,
"split_length": split_length,
"split_overlap": split_overlap,
@ -141,6 +147,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
remove_substrings: List[str] = [],
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
@ -153,6 +160,8 @@ class PreProcessor(BasePreProcessor):
clean_header_footer = self.clean_header_footer
if clean_empty_lines is None:
clean_empty_lines = self.clean_empty_lines
if not remove_substrings:
remove_substrings = self.remove_substrings
if split_by is None:
split_by = self.split_by
if split_length is None:
@ -167,6 +176,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace=clean_whitespace,
clean_header_footer=clean_header_footer,
clean_empty_lines=clean_empty_lines,
remove_substrings=remove_substrings,
)
split_documents = self.split(
document=cleaned_document,
@ -181,7 +191,14 @@ class PreProcessor(BasePreProcessor):
nested_docs = [self._process_single(d, **kwargs) for d in tqdm(documents, unit="docs")]
return [d for x in nested_docs for d in x]
def clean(self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict:
def clean(
self,
document: dict,
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
remove_substrings: List[str],
) -> dict:
"""
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
@ -204,6 +221,9 @@ class PreProcessor(BasePreProcessor):
if clean_empty_lines:
text = re.sub(r"\n\n+", "\n\n", text)
for substring in remove_substrings:
text = text.replace(substring, "")
document["content"] = text
return document

View File

@ -1381,6 +1381,14 @@
"default": true,
"type": "boolean"
},
"remove_substrings": {
"title": "Remove Substrings",
"default": [],
"type": "array",
"items": {
"type": "string"
}
},
"split_by": {
"title": "Split By",
"default": "word",

View File

@ -1432,6 +1432,14 @@
"default": true,
"type": "boolean"
},
"remove_substrings": {
"title": "Remove Substrings",
"default": [],
"type": "array",
"items": {
"type": "string"
}
},
"split_by": {
"title": "Split By",
"default": "word",

View File

@ -88,3 +88,23 @@ def test_clean_header_footer():
assert "This is a header." not in documents[0]["content"]
assert "footer" not in documents[0]["content"]
def test_remove_substrings():
    """Verify that PreProcessor removes exactly the configured substrings.

    The fixture text contains three substrings that are listed in
    ``remove_substrings`` and other content (a plain word and a second emoji)
    that is not listed and must therefore survive processing.
    """
    document = Document("This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")

    # check that the file contains the substrings we are about to remove
    assert "This is a header." in document["content"]
    assert "wiki" in document["content"]
    assert "🪲" in document["content"]
    assert "whitespace" in document["content"]
    # "✨" is not in the removal list; it serves as the "must be kept" marker.
    # (An `assert "" in ...` check would be vacuous: "" is in every string.)
    assert "✨" in document["content"]

    preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"])
    documents = preprocessor.process(document)

    # Assert on the *returned* documents rather than on the input object, so
    # the test does not depend on process() mutating its argument in place.
    cleaned_content = documents[0]["content"]

    assert "This is a header." not in cleaned_content
    assert "wiki" not in cleaned_content
    assert "🪲" not in cleaned_content
    assert "whitespace" in cleaned_content
    assert "✨" in cleaned_content