mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-12 08:03:50 +00:00
fix: Fix the error of wrong page numbers when documents contain empty pages. (#3330)
* Fix the error of wrong page numbers when documents contain empty pages.
* Reformat using git hooks.
* Use a more descriptive placeholder.
This commit is contained in:
parent
51d4fe01c3
commit
5fedfb03b0
@ -42,6 +42,8 @@ iso639_to_nltk = {
|
|||||||
"ml": "malayalam",
|
"ml": "malayalam",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
|
||||||
|
|
||||||
|
|
||||||
class PreProcessor(BasePreProcessor):
|
class PreProcessor(BasePreProcessor):
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -259,13 +261,17 @@ class PreProcessor(BasePreProcessor):
|
|||||||
cleaned_pages = []
|
cleaned_pages = []
|
||||||
for page in pages:
|
for page in pages:
|
||||||
if not page:
|
if not page:
|
||||||
continue
|
# there are many "empty text" pages in a marketing document, as for example the cover page. If we just forget about them, we have a mismatch
|
||||||
lines = page.splitlines()
|
# with page numbers which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query.
|
||||||
cleaned_lines = []
|
cleaned_page = EMPTY_PAGE_PLACEHOLDER
|
||||||
for line in lines:
|
else:
|
||||||
line = line.strip()
|
lines = page.splitlines()
|
||||||
cleaned_lines.append(line)
|
cleaned_lines = []
|
||||||
cleaned_page = "\n".join(cleaned_lines)
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
cleaned_lines.append(line)
|
||||||
|
cleaned_page = "\n".join(cleaned_lines)
|
||||||
|
|
||||||
cleaned_pages.append(cleaned_page)
|
cleaned_pages.append(cleaned_page)
|
||||||
|
|
||||||
text = "\f".join(cleaned_pages)
|
text = "\f".join(cleaned_pages)
|
||||||
@ -332,7 +338,7 @@ class PreProcessor(BasePreProcessor):
|
|||||||
list_splits = []
|
list_splits = []
|
||||||
current_slice: List[str] = []
|
current_slice: List[str] = []
|
||||||
for sen in sentences:
|
for sen in sentences:
|
||||||
if self.add_page_number and sen.startswith("[NEW_PAGE]"):
|
if self.add_page_number and "[NEW_PAGE]" in sen:
|
||||||
sen = sen.replace("[NEW_PAGE]", "\f")
|
sen = sen.replace("[NEW_PAGE]", "\f")
|
||||||
|
|
||||||
word_count_sen = len(sen.split(" "))
|
word_count_sen = len(sen.split(" "))
|
||||||
@ -429,7 +435,12 @@ class PreProcessor(BasePreProcessor):
|
|||||||
# create new document dicts for each text split
|
# create new document dicts for each text split
|
||||||
documents = []
|
documents = []
|
||||||
for i, txt in enumerate(text_splits):
|
for i, txt in enumerate(text_splits):
|
||||||
doc = Document(content=txt, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
|
# now we want to get rid of the empty page placeholder and skip the split if there's nothing left
|
||||||
|
txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
|
||||||
|
if not txt_clean.strip():
|
||||||
|
continue
|
||||||
|
|
||||||
|
doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
|
||||||
doc.meta["_split_id"] = i
|
doc.meta["_split_id"] = i
|
||||||
if self.add_page_number:
|
if self.add_page_number:
|
||||||
doc.meta["page"] = splits_pages[i]
|
doc.meta["page"] = splits_pages[i]
|
||||||
|
|||||||
@ -222,6 +222,28 @@ def test_page_number_extraction(test_input):
|
|||||||
assert doc.meta["page"] == 2
|
assert doc.meta["page"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_number_extraction_on_empty_pages():
|
||||||
|
"""
|
||||||
|
Often "marketing" documents contain pages without text (visuals only). When extracting page numbers, these pages should be counted as well to avoid
|
||||||
|
issues when mapping results back to the original document.
|
||||||
|
"""
|
||||||
|
preprocessor = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
|
||||||
|
text_page_one = "This is a text on page one."
|
||||||
|
text_page_three = "This is a text on page three."
|
||||||
|
# this is what we get from PDFToTextConverter in case of an "empty" page
|
||||||
|
document_with_empty_pages = f"{text_page_one}\f\f{text_page_three}"
|
||||||
|
document = Document(content=document_with_empty_pages)
|
||||||
|
|
||||||
|
documents = preprocessor.process(document)
|
||||||
|
|
||||||
|
assert documents[0].meta["page"] == 1
|
||||||
|
assert documents[1].meta["page"] == 3
|
||||||
|
|
||||||
|
# verify the placeholder for the empty page has been removed
|
||||||
|
assert documents[0].content.strip() == text_page_one
|
||||||
|
assert documents[1].content.strip() == text_page_three
|
||||||
|
|
||||||
|
|
||||||
def test_substitute_page_break():
|
def test_substitute_page_break():
|
||||||
# Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of
|
# Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of
|
||||||
# sentences should not be replaced.
|
# sentences should not be replaced.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user