mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 12:07:04 +00:00
feat: add page_number to metadata in DocumentSplitter (#7599)
* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705. * Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x. Solve some minor bugs spotted by tests. * Update docstrings. * Add reno. * Update haystack/components/preprocessors/document_splitter.py Update docstring from suggestion Co-authored-by: David S. Batista <dsbatista@gmail.com> * solve suggestion to improve readability * fragment tests * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: David S. Batista <dsbatista@gmail.com> * Update .gitignore * Update .gitignore * Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml * blackening --------- Co-authored-by: David S. Batista <dsbatista@gmail.com>
This commit is contained in:
parent
8d04e530da
commit
d2c87b2fd9
@ -1,5 +1,5 @@
|
||||
from copy import deepcopy
|
||||
from typing import List, Literal
|
||||
from typing import Dict, List, Literal, Tuple
|
||||
|
||||
from more_itertools import windowed
|
||||
|
||||
@ -53,7 +53,7 @@ class DocumentSplitter:
|
||||
|
||||
:returns: A dictionary with the following key:
|
||||
- `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
|
||||
document to keep track of the original document that was split. Other metadata are copied from the original
|
||||
document to keep track of the original document that was split. Another metadata field "page_number" is added to each document to keep track of the page it belonged to in the original document. Other metadata are copied from the original
|
||||
document.
|
||||
|
||||
:raises TypeError: if the input is not a list of Documents.
|
||||
@ -70,10 +70,12 @@ class DocumentSplitter:
|
||||
f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
|
||||
)
|
||||
units = self._split_into_units(doc.content, self.split_by)
|
||||
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
|
||||
text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
|
||||
metadata = deepcopy(doc.meta)
|
||||
metadata["source_id"] = doc.id
|
||||
split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
|
||||
split_docs += self._create_docs_from_splits(
|
||||
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
|
||||
)
|
||||
return {"documents": split_docs}
|
||||
|
||||
def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
|
||||
@ -95,15 +97,40 @@ class DocumentSplitter:
|
||||
units[i] += split_at
|
||||
return units
|
||||
|
||||
def _concatenate_units(
    self, elements: List[str], split_length: int, split_overlap: int
) -> Tuple[List[str], List[int]]:
    """
    Merge ``elements`` into chunks of ``split_length`` units each, stepping by
    ``split_length - split_overlap`` so that consecutive chunks share
    ``split_overlap`` units, while recording the page number on which every
    chunk starts.

    :param elements: The individual units (words, sentences, passages, or pages) to merge.
    :param split_length: Maximum number of units per chunk.
    :param split_overlap: Number of units shared between consecutive chunks.
    :returns: A tuple of (chunk texts, 1-based starting page number per chunk).
    """
    step = split_length - split_overlap
    chunks: List[str] = []
    chunk_pages: List[int] = []
    page = 1
    for window in windowed(elements, n=split_length, step=step):
        # windowed() pads the final window with None; drop the padding.
        units = [unit for unit in window if unit is not None]
        text = "".join(units)
        if not text:
            continue
        chunks.append(text)
        chunk_pages.append(page)
        # Only the units consumed by this step advance the page counter; the
        # overlapped tail is counted again when the next window starts.
        consumed = units[:step]
        if self.split_by == "page":
            # Each unit is itself a full page.
            page += len(consumed)
        else:
            # Count form-feed page breaks embedded in the consumed units.
            page += sum(unit.count("\f") for unit in consumed)
    return chunks, chunk_pages
|
||||
|
||||
@staticmethod
def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
    """
    Create one Document per text split, enriched with its page number and a copy
    of the original document's metadata.

    :param text_splits: The split texts, in order.
    :param splits_pages: The 1-based page number each split starts on; parallel to `text_splits`.
    :param meta: Metadata of the original document, copied into every split.
    :returns: The list of Document objects, one per split.
    """
    documents: List[Document] = []

    for text, page_number in zip(text_splits, splits_pages):
        # Copy from the pristine original metadata each time, instead of
        # rebinding `meta` to a copy of the previously mutated copy.
        split_meta = deepcopy(meta)
        split_meta["page_number"] = page_number
        documents.append(Document(content=text, meta=split_meta))
    return documents
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
---
|
||||
highlights: >
|
||||
Add the "page_number" field to the metadata of all output documents.
|
||||
|
||||
enhancements:
|
||||
- |
|
||||
Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to.
|
||||
@ -141,3 +141,98 @@ class TestDocumentSplitter:
|
||||
for doc, split_doc in zip(documents, result["documents"]):
|
||||
assert doc.meta.items() <= split_doc.meta.items()
|
||||
assert split_doc.content == "Text."
|
||||
|
||||
def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
    """Page numbers follow "\f" page breaks when splitting by word without overlap."""
    splitter = DocumentSplitter(split_by="word", split_length=2)
    doc1 = Document(content="This is some text.\f This text is on another page.")
    doc2 = Document(content="This content has two.\f\f page brakes.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
    """With one sentence per split, every split keeps the page its sentence starts on."""
    splitter = DocumentSplitter(split_by="sentence", split_length=1)
    doc1 = Document(content="This is some text.\f This text is on another page.")
    doc2 = Document(content="This content has two.\f\f page brakes.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 1, 1]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
    """Page numbers follow "\f" page breaks when splitting by passage without overlap."""
    splitter = DocumentSplitter(split_by="passage", split_length=1)
    doc1 = Document(
        content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
    )
    result = splitter.run(documents=[doc1])

    expected_pages = [1, 2, 2, 2]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
    """Splitting by page advances the page counter by one page per consumed unit."""
    splitter = DocumentSplitter(split_by="page", split_length=1)
    doc1 = Document(
        content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    )
    result = splitter.run(documents=[doc1])
    expected_pages = [1, 2, 3]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page

    # With split_length=2 each split consumes two pages, so page numbers advance by two.
    splitter = DocumentSplitter(split_by="page", split_length=2)
    doc1 = Document(
        content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    )
    result = splitter.run(documents=[doc1])
    expected_pages = [1, 3]
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_overlap_word_split(self):
    """Overlapping word splits keep the page number of the split's first word."""
    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
    doc1 = Document(content="This is some text. And\f this text is on another page.")
    doc2 = Document(content="This content has two.\f\f page brakes.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        # NOTE: removed a leftover debug print() from the loop body.
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
    """Overlapping sentence splits keep the page number of the split's first sentence."""
    splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
    doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
    doc2 = Document(content="This content has two.\f\f page brakes. More text.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 1, 2, 1, 1]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        # NOTE: removed a leftover debug print() from the loop body.
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_overlap_passage_split(self):
    """Overlapping passage splits keep the page number of the split's first passage."""
    splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
    doc1 = Document(
        content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
    )
    result = splitter.run(documents=[doc1])

    expected_pages = [1, 2, 2]
    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
def test_add_page_number_to_metadata_with_overlap_page_split(self):
    """With page overlap, a split's page number is that of its first (possibly shared) page."""
    splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
    doc1 = Document(
        content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    )
    result = splitter.run(documents=[doc1])
    expected_pages = [1, 2, 3]

    # Guard against zip() silently truncating when the split count is wrong.
    assert len(result["documents"]) == len(expected_pages)
    for doc, expected_page in zip(result["documents"], expected_pages):
        assert doc.meta["page_number"] == expected_page
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user