From 5fedfb03b03496d7ca25f55788e1fa576ff1b2a4 Mon Sep 17 00:00:00 2001
From: Ursin Brunner <ursin.brunner@gmail.com>
Date: Tue, 18 Oct 2022 17:51:02 +0200
Subject: [PATCH] fix: Fix the error of wrong page numbers when documents
 contain empty pages. (#3330)

* Fix the error of wrong page numbers when documents contain empty pages.

* Reformat using git hooks.

* Use a more descriptive placeholder
---
 haystack/nodes/preprocessor/preprocessor.py | 29 ++++++++++++++-------
 test/nodes/test_preprocessor.py             | 22 ++++++++++++++++
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
index cb71d1a55..488e8d071 100644
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@@ -42,6 +42,8 @@ iso639_to_nltk = {
     "ml": "malayalam",
 }
 
+EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
+
 
 class PreProcessor(BasePreProcessor):
     def __init__(
@@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor):
             cleaned_pages = []
             for page in pages:
                 if not page:
-                    continue
-                lines = page.splitlines()
-                cleaned_lines = []
-                for line in lines:
-                    line = line.strip()
-                    cleaned_lines.append(line)
-                cleaned_page = "\n".join(cleaned_lines)
+                    # Documents (e.g. marketing material) often contain pages without any text, such as the cover page. If we simply dropped these pages, the
+                    # page numbers would no longer match the original document, which causes problems later on. Therefore, we replace them with a placeholder text that no query will match.
+                    cleaned_page = EMPTY_PAGE_PLACEHOLDER
+                else:
+                    lines = page.splitlines()
+                    cleaned_lines = []
+                    for line in lines:
+                        line = line.strip()
+                        cleaned_lines.append(line)
+                    cleaned_page = "\n".join(cleaned_lines)
+
                 cleaned_pages.append(cleaned_page)
 
             text = "\f".join(cleaned_pages)
@@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor):
             list_splits = []
             current_slice: List[str] = []
             for sen in sentences:
-                if self.add_page_number and sen.startswith("[NEW_PAGE]"):
+                if self.add_page_number and "[NEW_PAGE]" in sen:
                     sen = sen.replace("[NEW_PAGE]", "\f")
 
                 word_count_sen = len(sen.split(" "))
@@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor):
         # create new document dicts for each text split
         documents = []
         for i, txt in enumerate(text_splits):
-            doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
+            # now we want to get rid of the empty page placeholder and skip the split if there's nothing left
+            txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
+            if not txt_clean.strip():
+                continue
+
+            doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
             doc.meta["_split_id"] = i
             if self.add_page_number:
                 doc.meta["page"] = splits_pages[i]
diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py
index b9a14257a..91f2e9ad6 100644
--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@@ -222,6 +222,28 @@ def test_page_number_extraction(test_input):
             assert doc.meta["page"] == 2
 
 
+def test_page_number_extraction_on_empty_pages():
+    """
+    Often "marketing" documents contain pages without text (visuals only). When extracting page numbers, these pages should be counted as well to avoid
+    issues when mapping results back to the original document.
+    """
+    preprocessor = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
+    text_page_one = "This is a text on page one."
+    text_page_three = "This is a text on page three."
+    # this is what we get from PDFToTextConverter in case of an "empty" page
+    document_with_empty_pages = f"{text_page_one}\f\f{text_page_three}"
+    document = Document(content=document_with_empty_pages)
+
+    documents = preprocessor.process(document)
+
+    assert documents[0].meta["page"] == 1
+    assert documents[1].meta["page"] == 3
+
+    # verify the placeholder for the empty page has been removed
+    assert documents[0].content.strip() == text_page_one
+    assert documents[1].content.strip() == text_page_three
+
+
 def test_substitute_page_break():
     # Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of
     # sentences should not be replaced.