mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-01 02:09:39 +00:00
fix: Fix the error of wrong page numbers when documents contain empty pages. (#3330)
* Fix the error of wrong page numbers when documents contain empty pages.
* Reformat using git hooks.
* Use a more descriptive placeholder.
This commit is contained in:
parent
51d4fe01c3
commit
5fedfb03b0
@ -42,6 +42,8 @@ iso639_to_nltk = {
|
||||
"ml": "malayalam",
|
||||
}
|
||||
|
||||
EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
|
||||
|
||||
|
||||
class PreProcessor(BasePreProcessor):
|
||||
def __init__(
|
||||
@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor):
|
||||
cleaned_pages = []
|
||||
for page in pages:
|
||||
if not page:
|
||||
continue
|
||||
lines = page.splitlines()
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
cleaned_lines.append(line)
|
||||
cleaned_page = "\n".join(cleaned_lines)
|
||||
# Marketing documents often contain "empty text" pages, for example the cover page. If we simply dropped them, we would get a mismatch
|
||||
# with page numbers, which causes problems later on. Therefore, we replace them with dummy text that will not be found by any query.
|
||||
cleaned_page = EMPTY_PAGE_PLACEHOLDER
|
||||
else:
|
||||
lines = page.splitlines()
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
cleaned_lines.append(line)
|
||||
cleaned_page = "\n".join(cleaned_lines)
|
||||
|
||||
cleaned_pages.append(cleaned_page)
|
||||
|
||||
text = "\f".join(cleaned_pages)
|
||||
@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor):
|
||||
list_splits = []
|
||||
current_slice: List[str] = []
|
||||
for sen in sentences:
|
||||
if self.add_page_number and sen.startswith("[NEW_PAGE]"):
|
||||
if self.add_page_number and "[NEW_PAGE]" in sen:
|
||||
sen = sen.replace("[NEW_PAGE]", "\f")
|
||||
|
||||
word_count_sen = len(sen.split(" "))
|
||||
@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor):
|
||||
# create new document dicts for each text split
|
||||
documents = []
|
||||
for i, txt in enumerate(text_splits):
|
||||
doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
|
||||
# now we want to get rid of the empty page placeholder and skip the split if there's nothing left
|
||||
txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
|
||||
if not txt_clean.strip():
|
||||
continue
|
||||
|
||||
doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
|
||||
doc.meta["_split_id"] = i
|
||||
if self.add_page_number:
|
||||
doc.meta["page"] = splits_pages[i]
|
||||
|
||||
@ -222,6 +222,28 @@ def test_page_number_extraction(test_input):
|
||||
assert doc.meta["page"] == 2
|
||||
|
||||
|
||||
def test_page_number_extraction_on_empty_pages():
|
||||
"""
|
||||
Often "marketing" documents contain pages without text (visuals only). When extracting page numbers, these pages should be counted as well to avoid
|
||||
issues when mapping results back to the original document.
|
||||
"""
|
||||
preprocessor = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
|
||||
text_page_one = "This is a text on page one."
|
||||
text_page_three = "This is a text on page three."
|
||||
# this is what we get from PDFToTextConverter in case of an "empty" page
|
||||
document_with_empty_pages = f"{text_page_one}\f\f{text_page_three}"
|
||||
document = Document(content=document_with_empty_pages)
|
||||
|
||||
documents = preprocessor.process(document)
|
||||
|
||||
assert documents[0].meta["page"] == 1
|
||||
assert documents[1].meta["page"] == 3
|
||||
|
||||
# verify the placeholder for the empty page has been removed
|
||||
assert documents[0].content.strip() == text_page_one
|
||||
assert documents[1].content.strip() == text_page_three
|
||||
|
||||
|
||||
def test_substitute_page_break():
|
||||
# Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in the middle of
|
||||
# sentences should not be replaced.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user