From 5fedfb03b03496d7ca25f55788e1fa576ff1b2a4 Mon Sep 17 00:00:00 2001 From: Ursin Brunner Date: Tue, 18 Oct 2022 17:51:02 +0200 Subject: [PATCH] fix: Fix the error of wrong page numbers when documents contain empty pages. (#3330) * Fix the error of wrong page numbers when documents contain empty pages. * Reformat using git hooks. * Use a more descriptive placeholder --- haystack/nodes/preprocessor/preprocessor.py | 29 ++++++++++++++------- test/nodes/test_preprocessor.py | 22 ++++++++++++++++ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index cb71d1a55..488e8d071 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -42,6 +42,8 @@ iso639_to_nltk = { "ml": "malayalam", } +EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@." + class PreProcessor(BasePreProcessor): def __init__( @@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor): cleaned_pages = [] for page in pages: if not page: - continue - lines = page.splitlines() - cleaned_lines = [] - for line in lines: - line = line.strip() - cleaned_lines.append(line) - cleaned_page = "\n".join(cleaned_lines) + # there are many "empty text" pages in a marketing document, as for example the cover page. If we just forget about them, we have a mismatch + # with page numbers which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query. + cleaned_page = EMPTY_PAGE_PLACEHOLDER + else: + lines = page.splitlines() + cleaned_lines = [] + for line in lines: + line = line.strip() + cleaned_lines.append(line) + cleaned_page = "\n".join(cleaned_lines) + cleaned_pages.append(cleaned_page) text = "\f".join(cleaned_pages) @@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor): list_splits = [] current_slice: List[str] = [] for sen in sentences: - if self.add_page_number and sen.startswith("[NEW_PAGE]"): + if self.add_page_number and "[NEW_PAGE]" in sen: sen = sen.replace("[NEW_PAGE]", "\f") word_count_sen = len(sen.split(" ")) @@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor): # create new document dicts for each text split documents = [] for i, txt in enumerate(text_splits): - doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys) + # now we want to get rid of the empty page placeholder and skip the split if there's nothing left + txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "") + if not txt_clean.strip(): + continue + + doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys) doc.meta["_split_id"] = i if self.add_page_number: doc.meta["page"] = splits_pages[i] diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py index b9a14257a..91f2e9ad6 100644 --- a/test/nodes/test_preprocessor.py +++ b/test/nodes/test_preprocessor.py @@ -222,6 +222,28 @@ def test_page_number_extraction(test_input): assert doc.meta["page"] == 2 +def test_page_number_extraction_on_empty_pages(): + """ + Often "marketing" documents contain pages without text (visuals only). When extracting page numbers, these pages should be counted as well to avoid + issues when mapping results back to the original document. + """ + preprocessor = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0) + text_page_one = "This is a text on page one." + text_page_three = "This is a text on page three." + # this is what we get from PDFToTextConverter in case of an "empty" page + document_with_empty_pages = f"{text_page_one}\f\f{text_page_three}" + document = Document(content=document_with_empty_pages) + + documents = preprocessor.process(document) + + assert documents[0].meta["page"] == 1 + assert documents[1].meta["page"] == 3 + + # verify the placeholder for the empty page has been removed + assert documents[0].content.strip() == text_page_one + assert documents[1].content.strip() == text_page_three + + def test_substitute_page_break(): # Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of # sentences should not be replaced.