fix: Fix the error of wrong page numbers when documents contain empty pages. (#3330)

* Fix the error of wrong page numbers when documents contain empty pages. * Reformat using git hooks. * Use a more descriptive placeholder
2026-01-08 04:56:45 +00:00 · 2022-10-18 17:51:02 +02:00 · 2022-10-18 17:51:02 +02:00 · 5fedfb03b0
commit 5fedfb03b0
parent 51d4fe01c3
2 changed files with 42 additions and 9 deletions
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@ -42,6 +42,8 @@ iso639_to_nltk = {
    "ml": "malayalam",
 }

+EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
+

 class PreProcessor(BasePreProcessor):
    def __init__(
@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor):
            cleaned_pages = []
            for page in pages:
                if not page:
-                    continue
-                lines = page.splitlines()
-                cleaned_lines = []
-                for line in lines:
-                    line = line.strip()
-                    cleaned_lines.append(line)
-                cleaned_page = "\n".join(cleaned_lines)
+                    # Some documents (for example marketing material) contain many pages without any text, such as the cover page. If we simply dropped them,
+                    # the page numbers would no longer match the original document, which causes problems later on. Therefore, we replace each empty page with a placeholder text that will not be found by any query.
+                    cleaned_page = EMPTY_PAGE_PLACEHOLDER
+                else:
+                    lines = page.splitlines()
+                    cleaned_lines = []
+                    for line in lines:
+                        line = line.strip()
+                        cleaned_lines.append(line)
+                    cleaned_page = "\n".join(cleaned_lines)
+
                cleaned_pages.append(cleaned_page)

            text = "\f".join(cleaned_pages)
@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor):
            list_splits = []
            current_slice: List[str] = []
            for sen in sentences:
-                if self.add_page_number and sen.startswith("[NEW_PAGE]"):
+                if self.add_page_number and "[NEW_PAGE]" in sen:
                    sen = sen.replace("[NEW_PAGE]", "\f")

                word_count_sen = len(sen.split(" "))
@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor):
        # create new document dicts for each text split
        documents = []
        for i, txt in enumerate(text_splits):
-            doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
+            # Remove the empty-page placeholder and skip the split if nothing else is left.
+            txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
+            if not txt_clean.strip():
+                continue
+
+            doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
            doc.meta["_split_id"] = i
            if self.add_page_number:
                doc.meta["page"] = splits_pages[i]
--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@ -222,6 +222,28 @@ def test_page_number_extraction(test_input):
            assert doc.meta["page"] == 2


+def test_page_number_extraction_on_empty_pages():
+    """
+    Often "marketing" documents contain pages without text (visuals only). When extracting page numbers, these pages should be counted as well to avoid
+    issues when mapping results back to the original document.
+    """
+    preprocessor = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
+    text_page_one = "This is a text on page one."
+    text_page_three = "This is a text on page three."
+    # this is what we get from PDFToTextConverter in case of an "empty" page
+    document_with_empty_pages = f"{text_page_one}\f\f{text_page_three}"
+    document = Document(content=document_with_empty_pages)
+
+    documents = preprocessor.process(document)
+
+    assert documents[0].meta["page"] == 1
+    assert documents[1].meta["page"] == 3
+
+    # verify the placeholder for the empty page has been removed
+    assert documents[0].content.strip() == text_page_one
+    assert documents[1].content.strip() == text_page_three
+
+
 def test_substitute_page_break():
    # Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in the middle of
    # sentences should not be replaced.