fix: Fix the error of wrong page numbers when documents contain empty pages. (#3330)

* Fix the error of wrong page numbers when documents contain empty pages.

* Reformat using git hooks.

* Use a more descriptive placeholder
This commit is contained in:
Ursin Brunner 2022-10-18 17:51:02 +02:00 committed by GitHub
parent 51d4fe01c3
commit 5fedfb03b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 9 deletions

View File

@ -42,6 +42,8 @@ iso639_to_nltk = {
"ml": "malayalam",
}
EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
class PreProcessor(BasePreProcessor):
def __init__(
@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor):
cleaned_pages = []
for page in pages:
if not page:
continue
lines = page.splitlines()
cleaned_lines = []
for line in lines:
line = line.strip()
cleaned_lines.append(line)
cleaned_page = "\n".join(cleaned_lines)
# Marketing documents often contain pages without any text, for example the cover page. If we simply dropped them, the page
# numbers would no longer match the original document, which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query.
cleaned_page = EMPTY_PAGE_PLACEHOLDER
else:
lines = page.splitlines()
cleaned_lines = []
for line in lines:
line = line.strip()
cleaned_lines.append(line)
cleaned_page = "\n".join(cleaned_lines)
cleaned_pages.append(cleaned_page)
text = "\f".join(cleaned_pages)
@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor):
list_splits = []
current_slice: List[str] = []
for sen in sentences:
if self.add_page_number and sen.startswith("[NEW_PAGE]"):
if self.add_page_number and "[NEW_PAGE]" in sen:
sen = sen.replace("[NEW_PAGE]", "\f")
word_count_sen = len(sen.split(" "))
@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor):
# create new document dicts for each text split
documents = []
for i, txt in enumerate(text_splits):
doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
# now we want to get rid of the empty page placeholder and skip the split if there's nothing left
txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
if not txt_clean.strip():
continue
doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
doc.meta["_split_id"] = i
if self.add_page_number:
doc.meta["page"] = splits_pages[i]

View File

@ -222,6 +222,28 @@ def test_page_number_extraction(test_input):
assert doc.meta["page"] == 2
def test_page_number_extraction_on_empty_pages():
    """
    Pages without text (visuals only) are common in "marketing" documents. They must still be counted when page
    numbers are extracted; otherwise results cannot be mapped back to the original document.
    """
    text_page_one = "This is a text on page one."
    text_page_three = "This is a text on page three."

    # Two consecutive form feeds: this is what PDFToTextConverter produces for an "empty" page.
    raw_content = text_page_one + "\f\f" + text_page_three

    splitter = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
    splits = splitter.process(Document(content=raw_content))

    first_split = splits[0]
    second_split = splits[1]

    # The empty second page still counts, so the second split is reported on page 3.
    assert first_split.meta["page"] == 1
    assert second_split.meta["page"] == 3

    # The placeholder standing in for the empty page must not leak into the content.
    assert first_split.content.strip() == text_page_one
    assert second_split.content.strip() == text_page_three
def test_substitute_page_break():
# Page breaks at the end of a sentence should be replaced by "[NEW_PAGE]", while page breaks in the middle of a
# sentence should not be replaced.