feat: add page_number to metadata in DocumentSplitter (#7599)

* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705.

* Add tests that reflect the desired behabiour. This behabiour is inffered from the one it had on Haystack 1.x
Solve some minor bugs spotted by tests.

* Update docstrings.

* Add reno.

* Update haystack/components/preprocessors/document_splitter.py

Update docstring from suggestion

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* solve suggestion to improve readability

* fragment tests

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* Update .gitignore

* Update .gitignore

* Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml

* blackening

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>
This commit is contained in:
Carlos Fernández 2024-04-29 12:51:18 +02:00 committed by GitHub
parent 8d04e530da
commit d2c87b2fd9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 136 additions and 7 deletions

View File

@ -1,5 +1,5 @@
from copy import deepcopy
from typing import List, Literal
from typing import Dict, List, Literal, Tuple
from more_itertools import windowed
@ -53,7 +53,7 @@ class DocumentSplitter:
:returns: A dictionary with the following key:
- `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
document to keep track of the original document that was split. Other metadata are copied from the original
document to keep track of the original document that was split. Another metadata field "page_number" is added to each number to keep track of the page it belonged to in the original document. Other metadata are copied from the original
document.
:raises TypeError: if the input is not a list of Documents.
@ -70,10 +70,12 @@ class DocumentSplitter:
f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
)
units = self._split_into_units(doc.content, self.split_by)
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
metadata = deepcopy(doc.meta)
metadata["source_id"] = doc.id
split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
split_docs += self._create_docs_from_splits(
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
)
return {"documents": split_docs}
def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
@ -95,15 +97,40 @@ class DocumentSplitter:
units[i] += split_at
return units
def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
def _concatenate_units(
self, elements: List[str], split_length: int, split_overlap: int
) -> Tuple[List[str], List[int]]:
"""
Concatenates the elements into parts of split_length units.
Concatenates the elements into parts of split_length units keeping track of the original page number that each element belongs.
"""
text_splits = []
splits_pages = []
cur_page = 1
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
for seg in segments:
current_units = [unit for unit in seg if unit is not None]
txt = "".join(current_units)
if len(txt) > 0:
text_splits.append(txt)
return text_splits
splits_pages.append(cur_page)
processed_units = current_units[: split_length - split_overlap]
if self.split_by == "page":
num_page_breaks = len(processed_units)
else:
num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
cur_page += num_page_breaks
return text_splits, splits_pages
@staticmethod
def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
"""
Creates Document objects from text splits enriching them with page number and the metadata of the original document.
"""
documents: List[Document] = []
for i, txt in enumerate(text_splits):
meta = deepcopy(meta)
doc = Document(content=txt, meta=meta)
doc.meta["page_number"] = splits_pages[i]
documents.append(doc)
return documents

View File

@ -0,0 +1,7 @@
---
highlights: >
Add the "page_number" field to the metadata of all output documents.
enhancements:
- |
Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to.

View File

@ -141,3 +141,98 @@ class TestDocumentSplitter:
for doc, split_doc in zip(documents, result["documents"]):
assert doc.meta.items() <= split_doc.meta.items()
assert split_doc.content == "Text."
def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=2)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
result = splitter.run(documents=[doc1, doc2])
expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
splitter = DocumentSplitter(split_by="sentence", split_length=1)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
result = splitter.run(documents=[doc1, doc2])
expected_pages = [1, 1, 1, 1]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
splitter = DocumentSplitter(split_by="passage", split_length=1)
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 2, 2]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
splitter = DocumentSplitter(split_by="page", split_length=1)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
splitter = DocumentSplitter(split_by="page", split_length=2)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 3]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
doc1 = Document(content="This is some text. And\f this text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
result = splitter.run(documents=[doc1, doc2])
expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
doc2 = Document(content="This content has two.\f\f page brakes. More text.")
result = splitter.run(documents=[doc1, doc2])
expected_pages = [1, 1, 1, 2, 1, 1]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_passage_split(self):
splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 2]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_page_split(self):
splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p