fix: Fix the error of wrong page numbers when documents contain empty pages. (#3330)

* Fix the error of wrong page numbers when documents contain empty pages.
* Reformat using git hooks.
* Use a more descriptive placeholder.
2025-11-12 08:03:50 +00:00 · 2022-10-18 17:51:02 +02:00 · 2022-10-18 17:51:02 +02:00 · 5fedfb03b0
commit 5fedfb03b0
parent 51d4fe01c3
2 changed files with 42 additions and 9 deletions
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@ -42,6 +42,8 @@ iso639_to_nltk = {
    "ml": "malayalam",
 }
 EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
 class PreProcessor(BasePreProcessor):
    def __init__(
@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor):
            cleaned_pages = []
            for page in pages:
                if not page:
-                    continue
+                    # Marketing documents often contain pages without any text, for example the cover page. If we simply dropped them, we would have a mismatch
-                lines = page.splitlines()
+                    # with page numbers which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query.
-                cleaned_lines = []
+                    cleaned_page = EMPTY_PAGE_PLACEHOLDER
-                for line in lines:
+                else:
-                    line = line.strip()
+                    lines = page.splitlines()
-                    cleaned_lines.append(line)
+                    cleaned_lines = []
-                cleaned_page = "\n".join(cleaned_lines)
+                    for line in lines:
                        line = line.strip()
                        cleaned_lines.append(line)
                    cleaned_page = "\n".join(cleaned_lines)
                cleaned_pages.append(cleaned_page)
            text = "\f".join(cleaned_pages)
@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor):
            list_splits = []
            current_slice: List[str] = []
            for sen in sentences:
-                if self.add_page_number and sen.startswith("[NEW_PAGE]"):
+                if self.add_page_number and "[NEW_PAGE]" in sen:
                    sen = sen.replace("[NEW_PAGE]", "\f")
                word_count_sen = len(sen.split(" "))
@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor):
        # create new document dicts for each text split
        documents = []
        for i, txt in enumerate(text_splits):
-            doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
+            # now we want to get rid of the empty page placeholder and skip the split if there's nothing left
            txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
            if not txt_clean.strip():
                continue
            doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
            doc.meta["_split_id"] = i
            if self.add_page_number:
                doc.meta["page"] = splits_pages[i]
--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@ -222,6 +222,28 @@ def test_page_number_extraction(test_input):
            assert doc.meta["page"] == 2
 def test_page_number_extraction_on_empty_pages():
    """
    Page numbers must keep counting pages that carry no text.

    "Marketing" documents often contain visuals-only pages. Such pages have to be counted when page numbers are
    extracted; otherwise results cannot be mapped back to the right page of the original document.
    """
    first_page_text = "This is a text on page one."
    third_page_text = "This is a text on page three."
    # two consecutive form feeds: this is what we get from PDFToTextConverter in case of an "empty" page
    content_with_empty_page = f"{first_page_text}\f\f{third_page_text}"

    splitter = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
    results = splitter.process(Document(content=content_with_empty_page))

    # the split following the empty page is expected on page 3, i.e. the empty page was counted
    assert results[0].meta["page"] == 1
    assert results[1].meta["page"] == 3

    # verify the placeholder for the empty page has been removed
    assert results[0].content.strip() == first_page_text
    assert results[1].content.strip() == third_page_text
 def test_substitute_page_break():
    # Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of
    # sentences should not be replaced.