diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index f5e048db6..200fa8aa9 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -90,38 +90,38 @@ class DocumentSplitter: f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None." ) units = self._split_into_units(doc.content, self.split_by) - text_splits, splits_pages = self._concatenate_units( + text_splits, splits_pages, splits_start_idxs = self._concatenate_units( units, self.split_length, self.split_overlap, self.split_threshold ) metadata = deepcopy(doc.meta) metadata["source_id"] = doc.id split_docs += self._create_docs_from_splits( - text_splits=text_splits, splits_pages=splits_pages, meta=metadata + text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata ) return {"documents": split_docs} def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]: if split_by == "page": - split_at = "\f" + self.split_at = "\f" elif split_by == "passage": - split_at = "\n\n" + self.split_at = "\n\n" elif split_by == "sentence": - split_at = "." + self.split_at = "." elif split_by == "word": - split_at = " " + self.split_at = " " else: raise NotImplementedError( "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options." ) - units = text.split(split_at) + units = text.split(self.split_at) # Add the delimiter back to all units except the last one for i in range(len(units) - 1): - units[i] += split_at + units[i] += self.split_at return units def _concatenate_units( self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int - ) -> Tuple[List[str], List[int]]: + ) -> Tuple[List[str], List[int], List[int]]: """ Concatenates the elements into parts of split_length units. 
@@ -132,36 +132,90 @@ class DocumentSplitter: text_splits: List[str] = [] splits_pages = [] + splits_start_idxs = [] + split_at_len = len(self.split_at) + cur_start_idx = 0 cur_page = 1 segments = windowed(elements, n=split_length, step=split_length - split_overlap) + for seg in segments: current_units = [unit for unit in seg if unit is not None] txt = "".join(current_units) + # check if length of current units is below split_threshold if len(current_units) < split_threshold and len(text_splits) > 0: # concatenate the last split with the current one text_splits[-1] += txt + elif len(txt) > 0: text_splits.append(txt) splits_pages.append(cur_page) + splits_start_idxs.append(cur_start_idx) + processed_units = current_units[: split_length - split_overlap] + cur_start_idx += len("".join(processed_units)) + split_at_len + if self.split_by == "page": num_page_breaks = len(processed_units) else: num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) - cur_page += num_page_breaks - return text_splits, splits_pages - @staticmethod - def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]: + cur_page += num_page_breaks + + return text_splits, splits_pages, splits_start_idxs + + def _create_docs_from_splits( + self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict + ) -> List[Document]: """ Creates Document objects from splits enriching them with page number and the metadata of the original document. 
""" documents: List[Document] = [] - for i, txt in enumerate(text_splits): + for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)): meta = deepcopy(meta) doc = Document(content=txt, meta=meta) doc.meta["page_number"] = splits_pages[i] + doc.meta["split_id"] = i + doc.meta["split_idx_start"] = split_idx documents.append(doc) + + if self.split_overlap <= 0: + continue + + doc.meta["_split_overlap"] = [] + + if i == 0: + continue + + doc_start_idx = splits_start_idxs[i] + previous_doc = documents[i - 1] + previous_doc_start_idx = splits_start_idxs[i - 1] + self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx) + return documents + + @staticmethod + def _add_split_overlap_information( + current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int + ): + """ + Adds split overlap information to the current and previous Document's meta. + + :param current_doc: The split Document whose overlap with the previous split is recorded. + :param current_doc_start_idx: The start index of the current split (the value stored as its `split_idx_start`). + :param previous_doc: The split Document that immediately precedes the current one. + :param previous_doc_start_idx: The start index of the previous split (the value stored as its `split_idx_start`).
+ """ + overlapping_range = (current_doc_start_idx - previous_doc_start_idx - 1, len(previous_doc.content) - 1) # type: ignore + + if overlapping_range[0] < overlapping_range[1]: + overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore + + if current_doc.content.startswith(overlapping_str): # type: ignore + # add split overlap information to this Document regarding the previous Document + current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range}) + + # add split overlap information to previous Document regarding this Document + overlapping_range = (0, overlapping_range[1] - overlapping_range[0]) + previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range}) diff --git a/releasenotes/notes/add-split_id_and_overlap_to_DocumentSplitter-8180ad8f13495741.yaml b/releasenotes/notes/add-split_id_and_overlap_to_DocumentSplitter-8180ad8f13495741.yaml new file mode 100644 index 000000000..e3eba2d57 --- /dev/null +++ b/releasenotes/notes/add-split_id_and_overlap_to_DocumentSplitter-8180ad8f13495741.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + The `DocumentSplitter` now adds `split_id` and `split_idx_start` to the metadata of every split Document and, when `split_overlap` is greater than 0, a `_split_overlap` entry describing the content shared with neighbouring splits.
diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 4351457f7..d6fcaa9d1 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -7,6 +7,28 @@ from haystack import Document from haystack.components.preprocessors import DocumentSplitter +def merge_documents(documents): + """Merge a list of doc chunks into a single doc by concatenating their content, eliminating overlapping content.""" + sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"]) + merged_text = "" + last_idx_end = 0 + for doc in sorted_docs: + start = doc.meta["split_idx_start"] # start of the current content + + # if the start of the current content is before the end of the last appended content, adjust it + if start < last_idx_end: + start = last_idx_end + + # append the non-overlapping part to the merged text + merged_text = merged_text.strip() + merged_text += doc.content[start - doc.meta["split_idx_start"] :] + + # update the last end index + last_idx_end = doc.meta["split_idx_start"] + len(doc.content) + + return merged_text + + class TestDocumentSplitter: def test_non_text_document(self): with pytest.raises( @@ -219,7 +241,6 @@ class TestDocumentSplitter: expected_pages = [1, 1, 1, 2, 2, 1, 1, 3] for doc, p in zip(result["documents"], expected_pages): - print(doc.content, doc.meta, p) assert doc.meta["page_number"] == p def test_add_page_number_to_metadata_with_overlap_sentence_split(self): @@ -230,7 +251,6 @@ class TestDocumentSplitter: expected_pages = [1, 1, 1, 2, 1, 1] for doc, p in zip(result["documents"], expected_pages): - print(doc.content, doc.meta, p) assert doc.meta["page_number"] == p def test_add_page_number_to_metadata_with_overlap_passage_split(self): @@ -254,3 +274,16 @@ class TestDocumentSplitter: for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p + + def 
test_add_split_overlap_information(self): + splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word") + doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.") + docs = splitter.run(documents=[doc]) + + # check split_overlap is added to all the documents + assert len(docs["documents"]) == 3 + for d in docs["documents"]: + assert "_split_overlap" in d.meta + + # reconstruct the original document content from the split documents + assert doc.content == merge_documents(docs["documents"])