mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-08 21:28:00 +00:00
feat: adding `split_id` and `split_overlap` to `DocumentSplitter` (#7933)
* wip: adding _split_overlapp * fixing join issue for _split_overlap * adding tests * adding release notes * cleaning and fixing tests * making mypy happy * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * adding docstrings --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
parent
569b2a87cb
commit
91f57015c0
@ -90,38 +90,38 @@ class DocumentSplitter:
|
||||
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
|
||||
)
|
||||
units = self._split_into_units(doc.content, self.split_by)
|
||||
text_splits, splits_pages = self._concatenate_units(
|
||||
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
|
||||
units, self.split_length, self.split_overlap, self.split_threshold
|
||||
)
|
||||
metadata = deepcopy(doc.meta)
|
||||
metadata["source_id"] = doc.id
|
||||
split_docs += self._create_docs_from_splits(
|
||||
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
|
||||
text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
|
||||
)
|
||||
return {"documents": split_docs}
|
||||
|
||||
def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
|
||||
if split_by == "page":
|
||||
split_at = "\f"
|
||||
self.split_at = "\f"
|
||||
elif split_by == "passage":
|
||||
split_at = "\n\n"
|
||||
self.split_at = "\n\n"
|
||||
elif split_by == "sentence":
|
||||
split_at = "."
|
||||
self.split_at = "."
|
||||
elif split_by == "word":
|
||||
split_at = " "
|
||||
self.split_at = " "
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
|
||||
)
|
||||
units = text.split(split_at)
|
||||
units = text.split(self.split_at)
|
||||
# Add the delimiter back to all units except the last one
|
||||
for i in range(len(units) - 1):
|
||||
units[i] += split_at
|
||||
units[i] += self.split_at
|
||||
return units
|
||||
|
||||
def _concatenate_units(
    self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
) -> Tuple[List[str], List[int], List[int]]:
    """
    Concatenates the elements into parts of split_length units.

    Keeps track of the page number each split starts on, as well as the start
    character index of each split within the full text (used downstream to
    compute split overlap information).

    :param elements: The units produced by `_split_into_units` (each unit except
        the last keeps its trailing delimiter).
    :param split_length: Number of units per split.
    :param split_overlap: Number of units shared between two consecutive splits.
    :param split_threshold: Minimum number of units a split must contain; a
        smaller tail split is merged into the previous one instead.
    :returns: A tuple of (text splits, page number per split, start character
        index per split).
    """
    text_splits: List[str] = []
    splits_pages = []
    splits_start_idxs = []
    split_at_len = len(self.split_at)
    cur_start_idx = 0
    cur_page = 1
    # Sliding window over the units: window size `split_length`, advancing by
    # `split_length - split_overlap` so consecutive windows share `split_overlap` units.
    segments = windowed(elements, n=split_length, step=split_length - split_overlap)

    for seg in segments:
        # `windowed` pads the last window with None; drop the padding.
        current_units = [unit for unit in seg if unit is not None]
        txt = "".join(current_units)

        # check if length of current units is below split_threshold
        if len(current_units) < split_threshold and len(text_splits) > 0:
            # concatenate the last split with the current one
            text_splits[-1] += txt

        elif len(txt) > 0:
            text_splits.append(txt)
            splits_pages.append(cur_page)
            splits_start_idxs.append(cur_start_idx)

        # Only the non-overlapping prefix of this window advances the cursor.
        processed_units = current_units[: split_length - split_overlap]
        # NOTE(review): the extra `+ split_at_len` per window assumes one delimiter is
        # missing from the joined units — confirm against `_split_into_units`, which
        # re-appends the delimiter to every unit except the last.
        cur_start_idx += len("".join(processed_units)) + split_at_len

        if self.split_by == "page":
            # When splitting by page, every processed unit is a page.
            num_page_breaks = len(processed_units)
        else:
            # Otherwise count form-feed characters embedded in the processed units.
            num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
        cur_page += num_page_breaks

    return text_splits, splits_pages, splits_start_idxs
|
||||
|
||||
def _create_docs_from_splits(
    self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict
) -> List[Document]:
    """
    Creates Document objects from splits enriching them with page number and the metadata of the original document.

    :param text_splits: The content of each split.
    :param splits_pages: The page number each split starts on.
    :param splits_start_idxs: The start character index of each split in the original text.
    :param meta: Metadata of the original document, copied into every split document.
    :returns: The list of split Documents; each carries "page_number", "split_id",
        "split_idx_start" and, when `self.split_overlap` > 0, "_split_overlap" metadata.
    """
    documents: List[Document] = []

    for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
        # Deep-copy the original meta for every split so the split documents never
        # share mutable state (and to avoid rebinding the `meta` parameter in the loop).
        split_meta = deepcopy(meta)
        doc = Document(content=txt, meta=split_meta)
        doc.meta["page_number"] = splits_pages[i]
        doc.meta["split_id"] = i
        doc.meta["split_idx_start"] = split_idx
        documents.append(doc)

        # Overlap bookkeeping is only relevant when splits actually overlap.
        if self.split_overlap <= 0:
            continue

        doc.meta["_split_overlap"] = []

        # The first split has no previous document to overlap with.
        if i == 0:
            continue

        doc_start_idx = splits_start_idxs[i]
        previous_doc = documents[i - 1]
        previous_doc_start_idx = splits_start_idxs[i - 1]
        self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)

    return documents
|
||||
|
||||
@staticmethod
|
||||
def _add_split_overlap_information(
|
||||
current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
|
||||
):
|
||||
"""
|
||||
Adds split overlap information to the current and previous Document's meta.
|
||||
|
||||
:param current_doc: The Document that is being split.
|
||||
:param current_doc_start_idx: The starting index of the current Document.
|
||||
:param previous_doc: The Document that was split before the current Document.
|
||||
:param previous_doc_start_idx: The starting index of the previous Document.
|
||||
"""
|
||||
overlapping_range = (current_doc_start_idx - previous_doc_start_idx - 1, len(previous_doc.content) - 1) # type: ignore
|
||||
|
||||
if overlapping_range[0] < overlapping_range[1]:
|
||||
overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore
|
||||
|
||||
if current_doc.content.startswith(overlapping_str): # type: ignore
|
||||
# add split overlap information to this Document regarding the previous Document
|
||||
current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})
|
||||
|
||||
# add split overlap information to previous Document regarding this Document
|
||||
overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
|
||||
previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
The `DocumentSplitter` now supports `split_id` and `split_overlap`, allowing for more control over the splitting process.
|
||||
@ -7,6 +7,28 @@ from haystack import Document
|
||||
from haystack.components.preprocessors import DocumentSplitter
|
||||
|
||||
|
||||
def merge_documents(documents):
    """Merge a list of doc chunks into a single doc by concatenating their content, eliminating overlapping content.

    Chunks are ordered by their "split_idx_start" meta field; for each chunk only
    the part not already covered by the previously appended chunk is added.
    """
    sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"])
    merged_text = ""
    last_idx_end = 0
    for doc in sorted_docs:
        start = doc.meta["split_idx_start"]  # start of the current content

        # if the start of the current content is before the end of the last appended content, adjust it
        if start < last_idx_end:
            start = last_idx_end

        # append the non-overlapping part to the merged text
        # (strip compensates for the delimiter accounted for in the start indices)
        merged_text = merged_text.strip()
        merged_text += doc.content[start - doc.meta["split_idx_start"] :]

        # update the last end index
        last_idx_end = doc.meta["split_idx_start"] + len(doc.content)

    return merged_text
|
||||
|
||||
|
||||
class TestDocumentSplitter:
|
||||
def test_non_text_document(self):
|
||||
with pytest.raises(
|
||||
@ -219,7 +241,6 @@ class TestDocumentSplitter:
|
||||
|
||||
expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
|
||||
for doc, p in zip(result["documents"], expected_pages):
|
||||
print(doc.content, doc.meta, p)
|
||||
assert doc.meta["page_number"] == p
|
||||
|
||||
def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
|
||||
@ -230,7 +251,6 @@ class TestDocumentSplitter:
|
||||
|
||||
expected_pages = [1, 1, 1, 2, 1, 1]
|
||||
for doc, p in zip(result["documents"], expected_pages):
|
||||
print(doc.content, doc.meta, p)
|
||||
assert doc.meta["page_number"] == p
|
||||
|
||||
def test_add_page_number_to_metadata_with_overlap_passage_split(self):
|
||||
@ -254,3 +274,16 @@ class TestDocumentSplitter:
|
||||
|
||||
for doc, p in zip(result["documents"], expected_pages):
|
||||
assert doc.meta["page_number"] == p
|
||||
|
||||
def test_add_split_overlap_information(self):
    """Splitting with overlap adds `_split_overlap` meta and the original text is recoverable."""
    splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
    doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
    docs = splitter.run(documents=[doc])

    # check split_overlap is added to all the documents
    assert len(docs["documents"]) == 3
    for d in docs["documents"]:
        assert "_split_overlap" in d.meta

    # reconstruct the original document content from the split documents
    assert doc.content == merge_documents(docs["documents"])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user