feat: hard document length limit at max_chars_check (#5191)

* implement hard cut at max_chars_check * regenerate ids * black * docstring * black
2025-12-29 07:59:27 +00:00 · 2023-06-23 12:34:19 +02:00 · 2023-06-23 12:34:19 +02:00 · 31664627eb
commit 31664627eb
parent 36192eca72
2 changed files with 26 additions and 10 deletions
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@ -103,7 +103,10 @@ class PreProcessor(BasePreProcessor):
                                field `"page"`. Page boundaries are determined by `"\f"` character which is added
                                in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
                                `AzureConverter`.
-        :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
+        :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than
+            max_chars_check in characters after pre-processing will raise a warning and is going to be split at the
+            `max_char_check`-th char, regardless of any other constraint. If the resulting documents are still too long,
+            they'll be cut again until all fragments are below the maximum allowed length.
        """
        if remove_substrings is None:
            remove_substrings = []
@ -186,7 +189,9 @@ class PreProcessor(BasePreProcessor):

    def _long_documents(self, documents: List[Document], max_chars_check=10_000):
        """
-        Function that tries to detect unusually long documents.
+        Function that tries to detect unusually long documents. When detected, such documents are going to be
+        split at the `max_char_check`-th char, regardless of any other constraint. If the resulting documents
+        are still too long, they'll be cut again until all fragments are below the maximum allowed length.

        NOTE: this function is a heuristic that is in place only because a proper fix that prevents such documents from forming
        would imply a complete revamp of this class, including better definitions of what the various units (word, sentence, passage) mean exactly.
@ -195,11 +200,23 @@ class PreProcessor(BasePreProcessor):
            if len(document.content) > max_chars_check:
                logger.warning(
                    "Document %s is %s characters long after preprocessing, where the maximum length should be %s. "
-                    "Something might be wrong with the splitting, check the document affected to prevent issues at query time.",
+                    "Something might be wrong with the splitting, check the document affected to prevent issues at "
+                    "query time. This document will be now hard-split at %s chars recursively.",
                    document.id,
                    len(document.content),
                    max_chars_check,
+                    max_chars_check,
                )
+                fields = document.to_dict()
+                document.content = document.content[:max_chars_check]
+                fields.pop("id")
+                fields["content"] = fields["content"][max_chars_check:]
+                # recursively check if tail_document is still too long
+                tail_documents = self._long_documents(
+                    documents=[Document.from_dict(fields)], max_chars_check=max_chars_check
+                )
+                documents += tail_documents
+        return documents

    def _process_single(
        self,
@ -250,7 +267,7 @@ class PreProcessor(BasePreProcessor):
            id_hash_keys=id_hash_keys,
        )

-        self._long_documents(split_documents, max_chars_check=self.max_chars_check)
+        split_documents = self._long_documents(split_documents, max_chars_check=self.max_chars_check)

        return split_documents

--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@ -528,13 +528,12 @@ def test_preprocessor_very_long_document(caplog):
    preproc = PreProcessor(
        clean_empty_lines=False, clean_header_footer=False, clean_whitespace=False, split_by=None, max_chars_check=10
    )
-    documents = [
-        Document(content=f"this is a test document with more than max_char characters: {'1'*i}") for i in range(9)
-    ]
+    documents = [Document(content=str(i) + (f"." * i)) for i in range(0, 30, 3)]
    results = preproc.process(documents)
-    assert results == documents
-    for i in range(5):
-        assert f"is 6{i} characters long after preprocessing, where the maximum length should be 10." in caplog.text
+    assert len(results) == 19
+    assert any(d.content.startswith(".") for d in results)
+    assert any(not d.content.startswith(".") for d in results)
+    assert f"characters long after preprocessing, where the maximum length should be 10." in caplog.text


@pytest.mark.unit