diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
index 11f64e3ae..4a0009082 100644
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@@ -103,7 +103,10 @@ class PreProcessor(BasePreProcessor):
                                 field `"page"`. Page boundaries are determined by `"\f"` character which is added in between pages by
                                 `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and `AzureConverter`.
-        :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
+        :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than
+            max_chars_check in characters after pre-processing will raise a warning and will be split at the
+            `max_chars_check`-th character, regardless of any other constraint. If the resulting documents are still
+            too long, they are cut again until all fragments are below the maximum allowed length.
         """
         if remove_substrings is None:
             remove_substrings = []
@@ -186,7 +189,9 @@ class PreProcessor(BasePreProcessor):
 
     def _long_documents(self, documents: List[Document], max_chars_check=10_000):
         """
-        Function that tries to detect unusually long documents.
+        Function that tries to detect unusually long documents. When detected, such documents are split at the
+        `max_chars_check`-th character, regardless of any other constraint. If the resulting documents are still
+        too long, they are cut again until all fragments are below the maximum allowed length.
         NOTE: this function is a heuristic that is in place only because a proper fix that prevents such documents
         from forming would imply a complete revamp of this class, including better definitions of what the various
         units (word, sentence, passage) mean exactly.
@@ -195,11 +200,23 @@ class PreProcessor(BasePreProcessor):
             if len(document.content) > max_chars_check:
                 logger.warning(
                     "Document %s is %s characters long after preprocessing, where the maximum length should be %s. "
-                    "Something might be wrong with the splitting, check the document affected to prevent issues at query time.",
+                    "Something might be wrong with the splitting, check the document affected to prevent issues at "
+                    "query time. This document will now be hard-split at %s chars recursively.",
                     document.id,
                     len(document.content),
                     max_chars_check,
+                    max_chars_check,
                 )
+                fields = document.to_dict()
+                document.content = document.content[:max_chars_check]
+                fields.pop("id")
+                fields["content"] = fields["content"][max_chars_check:]
+                # Recursively check whether the tail document is still too long
+                tail_documents = self._long_documents(
+                    documents=[Document.from_dict(fields)], max_chars_check=max_chars_check
+                )
+                documents += tail_documents
+        return documents
 
     def _process_single(
         self,
@@ -250,7 +267,7 @@ class PreProcessor(BasePreProcessor):
                 id_hash_keys=id_hash_keys,
             )
 
-        self._long_documents(split_documents, max_chars_check=self.max_chars_check)
+        split_documents = self._long_documents(split_documents, max_chars_check=self.max_chars_check)
 
         return split_documents
 
diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py
index 8fdef509f..8bdd2d526 100644
--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@@ -528,13 +528,12 @@ def test_preprocessor_very_long_document(caplog):
     preproc = PreProcessor(
         clean_empty_lines=False, clean_header_footer=False, clean_whitespace=False, split_by=None, max_chars_check=10
     )
-    documents = [
-        Document(content=f"this is a test document with more than max_char characters: {'1'*i}") for i in range(9)
-    ]
+    documents = [Document(content=str(i) + ("." * i)) for i in range(0, 30, 3)]
     results = preproc.process(documents)
-    assert results == documents
-    for i in range(5):
-        assert f"is 6{i} characters long after preprocessing, where the maximum length should be 10." in caplog.text
+    assert len(results) == 19
+    assert any(d.content.startswith(".") for d in results)
+    assert any(not d.content.startswith(".") for d in results)
+    assert "characters long after preprocessing, where the maximum length should be 10." in caplog.text
 
 
 @pytest.mark.unit
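
A minimal end-to-end sketch of the new hard-split behaviour (illustration only, not part of the patch; it assumes a Haystack 1.x environment where PreProcessor and Document are importable as shown, and the expected fragment lengths follow from cutting recursively at max_chars_check):

from haystack import Document
from haystack.nodes import PreProcessor

# Disable all cleaning and splitting so only the max_chars_check safety net applies.
preprocessor = PreProcessor(
    clean_empty_lines=False,
    clean_header_footer=False,
    clean_whitespace=False,
    split_by=None,
    max_chars_check=10,
)

# One 26-character document, well above the 10-character limit.
docs = [Document(content="0123456789abcdefghijklmnop")]
results = preprocessor.process(docs)

# The document is hard-split at every 10th character: 10 + 10 + 6 chars.
print([len(d.content) for d in results])  # expected: [10, 10, 6]

The new test exercises the same path with documents between 1 and 29 characters long, which is why it expects 19 fragments in total.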