# haystack/test/components/preprocessors/test_recursive_splitter.py

import pytest
from pytest import LogCaptureFixture
from haystack import Document, Pipeline
from haystack.components.preprocessors.recursive_splitter import RecursiveDocumentSplitter
from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter
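
# A minimal usage sketch of the component under test (it mirrors the calls exercised in the
# tests below): the splitter tries each separator in order and recursively re-splits any
# chunk still longer than `split_length`, falling back to fixed-size chunking when needed.
#
#     splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."])
#     chunks = splitter.run([Document(content="A sentence. Another one.")])["documents"]
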
def test_get_custom_sentence_tokenizer_success():
tokenizer = RecursiveDocumentSplitter._get_custom_sentence_tokenizer({})
assert isinstance(tokenizer, SentenceSplitter)
def test_init_with_negative_overlap():
with pytest.raises(ValueError):
_ = RecursiveDocumentSplitter(split_length=20, split_overlap=-1, separators=["."])
def test_init_with_overlap_greater_than_chunk_size():
with pytest.raises(ValueError):
_ = RecursiveDocumentSplitter(split_length=10, split_overlap=15, separators=["."])
def test_init_with_invalid_separators():
with pytest.raises(ValueError):
_ = RecursiveDocumentSplitter(separators=[".", 2])
def test_init_with_negative_split_length():
with pytest.raises(ValueError):
_ = RecursiveDocumentSplitter(split_length=-1, separators=["."])
def test_apply_overlap_no_overlap():
# Test the case where there is no overlap between chunks
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char")
chunks = ["chunk1", "chunk2", "chunk3"]
result = splitter._apply_overlap(chunks)
assert result == ["chunk1", "chunk2", "chunk3"]
def test_apply_overlap_with_overlap():
# Test the case where there is overlap between chunks
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."], split_unit="char")
chunks = ["chunk1", "chunk2", "chunk3"]
result = splitter._apply_overlap(chunks)
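    # with split_overlap=4, the last four characters of each chunk ("unk1", "unk2") are
    # prepended to the following chunk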
assert result == ["chunk1", "unk1chunk2", "unk2chunk3"]
def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog):
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."], split_unit="char")
chunks = ["chunk1", "chunk2", "chunk3", "chunk4"]
_ = splitter._apply_overlap(chunks)
assert (
"Overlap is the same as the previous chunk. Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter."
in caplog.text
)
def test_apply_overlap_single_chunk():
# Test the case where there is only one chunk
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."], split_unit="char")
chunks = ["chunk1"]
result = splitter._apply_overlap(chunks)
assert result == ["chunk1"]
def test_chunk_text_smaller_than_chunk_size():
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."])
text = "small text"
chunks = splitter._chunk_text(text)
assert len(chunks) == 1
assert chunks[0] == text
def test_chunk_text_by_period():
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char")
text = "This is a test. Another sentence. And one more."
chunks = splitter._chunk_text(text)
assert len(chunks) == 3
assert chunks[0] == "This is a test."
assert chunks[1] == " Another sentence."
assert chunks[2] == " And one more."
def test_run_multiple_new_lines_unit_char():
splitter = RecursiveDocumentSplitter(split_length=18, separators=["\n\n", "\n"], split_unit="char")
text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test."
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert chunks[0].content == "This is a test.\n\n"
assert chunks[1].content == "\nAnother test.\n\n\n\n"
assert chunks[2].content == "Final test."
def test_run_empty_documents(caplog: LogCaptureFixture):
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."])
empty_doc = Document(content="")
doc_chunks = splitter.run([empty_doc])
doc_chunks = doc_chunks["documents"]
assert len(doc_chunks) == 0
assert "has an empty content. Skipping this document." in caplog.text
def test_run_using_custom_sentence_tokenizer():
"""
    This test includes abbreviations that a simple sentence tokenizer based on "." cannot handle, and therefore
    requires a more sophisticated sentence tokenizer such as the one provided by NLTK.
"""
splitter = RecursiveDocumentSplitter(
split_length=400,
split_overlap=0,
split_unit="char",
separators=["\n\n", "\n", "sentence", " "],
sentence_splitter_params={"language": "en", "use_split_rules": True, "keep_white_spaces": False},
)
splitter.warm_up()
text = """Artificial intelligence (AI) - Introduction
AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.
AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501
chunks = splitter.run([Document(content=text)])
chunks = chunks["documents"]
assert len(chunks) == 4
assert chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n"
assert (
chunks[1].content
== "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n"
) # noqa: E501
assert chunks[2].content == "AI technology is widely used throughout industry, government, and science." # noqa: E501
assert (
chunks[3].content
== "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)."
) # noqa: E501
def test_run_split_by_dot_count_page_breaks_split_unit_char() -> None:
document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=30, split_overlap=0, split_unit="char")
text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
"Sentence on page 3. Another on page 3.\f\f Sentence on page 5."
)
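    # "\f" is a page break: page_number increments per form feed, so the "\f\f" before the
    # last sentence skips page 4 and places it on page 5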
documents = document_splitter.run(documents=[Document(content=text)])["documents"]
assert len(documents) == 7
assert documents[0].content == "Sentence on page 1."
assert documents[0].meta["page_number"] == 1
assert documents[0].meta["split_id"] == 0
assert documents[0].meta["split_idx_start"] == text.index(documents[0].content)
assert documents[1].content == " Another on page 1."
assert documents[1].meta["page_number"] == 1
assert documents[1].meta["split_id"] == 1
assert documents[1].meta["split_idx_start"] == text.index(documents[1].content)
assert documents[2].content == "\fSentence on page 2."
assert documents[2].meta["page_number"] == 2
assert documents[2].meta["split_id"] == 2
assert documents[2].meta["split_idx_start"] == text.index(documents[2].content)
assert documents[3].content == " Another on page 2."
assert documents[3].meta["page_number"] == 2
assert documents[3].meta["split_id"] == 3
assert documents[3].meta["split_idx_start"] == text.index(documents[3].content)
assert documents[4].content == "\fSentence on page 3."
assert documents[4].meta["page_number"] == 3
assert documents[4].meta["split_id"] == 4
assert documents[4].meta["split_idx_start"] == text.index(documents[4].content)
assert documents[5].content == " Another on page 3."
assert documents[5].meta["page_number"] == 3
assert documents[5].meta["split_id"] == 5
assert documents[5].meta["split_idx_start"] == text.index(documents[5].content)
assert documents[6].content == "\f\f Sentence on page 5."
assert documents[6].meta["page_number"] == 5
assert documents[6].meta["split_id"] == 6
assert documents[6].meta["split_idx_start"] == text.index(documents[6].content)
def test_run_split_by_word_count_page_breaks_split_unit_char():
splitter = RecursiveDocumentSplitter(split_length=19, split_overlap=0, separators=[" "], split_unit="char")
text = "This is some text. \f This text is on another page. \f This is the last pag3."
doc = Document(content=text)
doc_chunks = splitter.run([doc])
doc_chunks = doc_chunks["documents"]
assert len(doc_chunks) == 5
assert doc_chunks[0].content == "This is some text. "
assert doc_chunks[0].meta["page_number"] == 1
assert doc_chunks[0].meta["split_id"] == 0
assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content)
assert doc_chunks[1].content == "\f This text is on "
assert doc_chunks[1].meta["page_number"] == 2
assert doc_chunks[1].meta["split_id"] == 1
assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content)
assert doc_chunks[2].content == "another page. \f "
assert doc_chunks[2].meta["page_number"] == 3
assert doc_chunks[2].meta["split_id"] == 2
assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content)
assert doc_chunks[3].content == "This is the last "
assert doc_chunks[3].meta["page_number"] == 3
assert doc_chunks[3].meta["split_id"] == 3
assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content)
assert doc_chunks[4].content == "pag3."
assert doc_chunks[4].meta["page_number"] == 3
assert doc_chunks[4].meta["split_id"] == 4
assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content)
def test_run_split_by_page_break_count_page_breaks() -> None:
document_splitter = RecursiveDocumentSplitter(
separators=["\f"], split_length=50, split_overlap=0, split_unit="char"
)
text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
"Sentence on page 3. Another on page 3.\f\f Sentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])
chunks_docs = documents["documents"]
assert len(chunks_docs) == 4
assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f"
assert chunks_docs[0].meta["page_number"] == 1
assert chunks_docs[0].meta["split_id"] == 0
assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content)
assert chunks_docs[1].content == "Sentence on page 2. Another on page 2.\f"
assert chunks_docs[1].meta["page_number"] == 2
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f\f"
assert chunks_docs[2].meta["page_number"] == 3
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
assert chunks_docs[3].content == " Sentence on page 5."
assert chunks_docs[3].meta["page_number"] == 5
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
def test_run_split_by_new_line_count_page_breaks_split_unit_char() -> None:
document_splitter = RecursiveDocumentSplitter(
separators=["\n"], split_length=21, split_overlap=0, split_unit="char"
)
text = (
"Sentence on page 1.\nAnother on page 1.\n\f"
"Sentence on page 2.\nAnother on page 2.\n\f"
"Sentence on page 3.\nAnother on page 3.\n\f\f"
"Sentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])
chunks_docs = documents["documents"]
assert len(chunks_docs) == 7
assert chunks_docs[0].content == "Sentence on page 1.\n"
assert chunks_docs[0].meta["page_number"] == 1
assert chunks_docs[0].meta["split_id"] == 0
assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content)
assert chunks_docs[1].content == "Another on page 1.\n"
assert chunks_docs[1].meta["page_number"] == 1
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
assert chunks_docs[2].content == "\fSentence on page 2.\n"
assert chunks_docs[2].meta["page_number"] == 2
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
assert chunks_docs[3].content == "Another on page 2.\n"
assert chunks_docs[3].meta["page_number"] == 2
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
assert chunks_docs[4].content == "\fSentence on page 3.\n"
assert chunks_docs[4].meta["page_number"] == 3
assert chunks_docs[4].meta["split_id"] == 4
assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content)
assert chunks_docs[5].content == "Another on page 3.\n"
assert chunks_docs[5].meta["page_number"] == 3
assert chunks_docs[5].meta["split_id"] == 5
assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content)
assert chunks_docs[6].content == "\f\fSentence on page 5."
assert chunks_docs[6].meta["page_number"] == 5
assert chunks_docs[6].meta["split_id"] == 6
assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content)
def test_run_split_by_sentence_count_page_breaks_split_unit_char() -> None:
document_splitter = RecursiveDocumentSplitter(
separators=["sentence"], split_length=28, split_overlap=0, split_unit="char"
)
document_splitter.warm_up()
text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
"Sentence on page 3. Another on page 3.\f\fSentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])
chunks_docs = documents["documents"]
assert len(chunks_docs) == 7
assert chunks_docs[0].content == "Sentence on page 1. "
assert chunks_docs[0].meta["page_number"] == 1
assert chunks_docs[0].meta["split_id"] == 0
assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content)
assert chunks_docs[1].content == "Another on page 1.\f"
assert chunks_docs[1].meta["page_number"] == 1
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
assert chunks_docs[2].content == "Sentence on page 2. "
assert chunks_docs[2].meta["page_number"] == 2
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
assert chunks_docs[3].content == "Another on page 2.\f"
assert chunks_docs[3].meta["page_number"] == 2
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
assert chunks_docs[4].content == "Sentence on page 3. "
assert chunks_docs[4].meta["page_number"] == 3
assert chunks_docs[4].meta["split_id"] == 4
assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content)
assert chunks_docs[5].content == "Another on page 3.\f\f"
assert chunks_docs[5].meta["page_number"] == 3
assert chunks_docs[5].meta["split_id"] == 5
assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content)
assert chunks_docs[6].content == "Sentence on page 5."
assert chunks_docs[6].meta["page_number"] == 5
assert chunks_docs[6].meta["split_id"] == 6
assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content)
def test_run_split_document_with_overlap_character_unit():
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=10, separators=["."], split_unit="char")
text = """A simple sentence1. A bright sentence2. A clever sentence3"""
doc = Document(content=text)
doc_chunks = splitter.run([doc])
doc_chunks = doc_chunks["documents"]
assert len(doc_chunks) == 5
assert doc_chunks[0].content == "A simple sentence1."
assert doc_chunks[0].meta["split_id"] == 0
assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content)
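    # each _split_overlap entry points at a neighbouring chunk and gives the (start, end)
    # range of the shared text within that neighbour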
assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 10)}]
assert doc_chunks[1].content == "sentence1. A bright "
assert doc_chunks[1].meta["split_id"] == 1
assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content)
assert doc_chunks[1].meta["_split_overlap"] == [
{"doc_id": doc_chunks[0].id, "range": (9, 19)},
{"doc_id": doc_chunks[2].id, "range": (0, 10)},
]
assert doc_chunks[2].content == " A bright sentence2."
assert doc_chunks[2].meta["split_id"] == 2
assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content)
assert doc_chunks[2].meta["_split_overlap"] == [
{"doc_id": doc_chunks[1].id, "range": (10, 20)},
{"doc_id": doc_chunks[3].id, "range": (0, 10)},
]
assert doc_chunks[3].content == "sentence2. A clever "
assert doc_chunks[3].meta["split_id"] == 3
assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content)
assert doc_chunks[3].meta["_split_overlap"] == [
{"doc_id": doc_chunks[2].id, "range": (10, 20)},
{"doc_id": doc_chunks[4].id, "range": (0, 10)},
]
assert doc_chunks[4].content == " A clever sentence3"
assert doc_chunks[4].meta["split_id"] == 4
assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content)
assert doc_chunks[4].meta["_split_overlap"] == [{"doc_id": doc_chunks[3].id, "range": (10, 20)}]
def test_run_split_document_with_overlap_and_fallback_character_unit():
splitter = RecursiveDocumentSplitter(split_length=8, split_overlap=4, separators=["."], split_unit="char")
text = "A simple sentence1. Short. Short."
doc = Document(content=text)
doc_chunks = splitter.run([doc])
doc_chunks = doc_chunks["documents"]
assert len(doc_chunks) == 8
assert doc_chunks[0].content == "A simple"
assert doc_chunks[1].content == "mple sen"
assert doc_chunks[2].content == " sentenc"
assert doc_chunks[3].content == "tence1. "
assert doc_chunks[4].content == "e1. Shor"
assert doc_chunks[5].content == "Short. S"
assert doc_chunks[6].content == "t. Short"
assert doc_chunks[7].content == "hort."
def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_chunking():
splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2, split_unit="char")
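    # every word-level piece exceeds split_length=2, so the splitter falls back to
    # fixed-size character chunking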
doc = Document(content="This is some text")
result = splitter.run(documents=[doc])
assert len(result["documents"]) == 10
    for doc in result["documents"]:
        # chunks that are just the separator (" ") can be shorter than two characters
        if doc.content != " ":
            assert len(doc.content) == 2
def test_run_fallback_to_character_chunking_by_default_length_too_short():
text = "abczdefzghizjkl"
separators = ["\n\n", "\n", "z"]
splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, split_unit="char")
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
for chunk in chunks:
assert len(chunk.content) <= 2
def test_run_fallback_to_word_chunking_by_default_length_too_short():
text = "This is some text. This is some more text, and even more text."
separators = ["\n\n", "\n", "."]
splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, split_unit="word")
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
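    # _chunk_length measures length in the configured split_unit (words here), not characters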
for chunk in chunks:
assert splitter._chunk_length(chunk.content) <= 2
def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit():
"""Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap"""
splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=10, separators=["sentence"], split_unit="char")
text = "This is sentence one. This is sentence two. This is sentence three."
splitter.warm_up()
doc = Document(content=text)
doc_chunks = splitter.run([doc])["documents"]
assert len(doc_chunks) == 4
assert doc_chunks[0].content == "This is sentence one. "
assert doc_chunks[0].meta["split_id"] == 0
assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content)
assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 10)}]
assert doc_chunks[1].content == "ence one. This is sentenc"
assert doc_chunks[1].meta["split_id"] == 1
assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content)
assert doc_chunks[1].meta["_split_overlap"] == [
{"doc_id": doc_chunks[0].id, "range": (12, 22)},
{"doc_id": doc_chunks[2].id, "range": (0, 10)},
]
assert doc_chunks[2].content == "is sentence two. This is "
assert doc_chunks[2].meta["split_id"] == 2
assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content)
assert doc_chunks[2].meta["_split_overlap"] == [
{"doc_id": doc_chunks[1].id, "range": (15, 25)},
{"doc_id": doc_chunks[3].id, "range": (0, 10)},
]
assert doc_chunks[3].content == ". This is sentence three."
assert doc_chunks[3].meta["split_id"] == 3
assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content)
assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (15, 25)}]
def test_run_split_by_dot_count_page_breaks_word_unit() -> None:
document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=4, split_overlap=0, split_unit="word")
text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
"Sentence on page 3. Another on page 3.\f\f Sentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])["documents"]
assert len(documents) == 8
assert documents[0].content == "Sentence on page 1."
assert documents[0].meta["page_number"] == 1
assert documents[0].meta["split_id"] == 0
assert documents[0].meta["split_idx_start"] == text.index(documents[0].content)
assert documents[1].content == " Another on page 1."
assert documents[1].meta["page_number"] == 1
assert documents[1].meta["split_id"] == 1
assert documents[1].meta["split_idx_start"] == text.index(documents[1].content)
assert documents[2].content == "\fSentence on page 2."
assert documents[2].meta["page_number"] == 2
assert documents[2].meta["split_id"] == 2
assert documents[2].meta["split_idx_start"] == text.index(documents[2].content)
assert documents[3].content == " Another on page 2."
assert documents[3].meta["page_number"] == 2
assert documents[3].meta["split_id"] == 3
assert documents[3].meta["split_idx_start"] == text.index(documents[3].content)
assert documents[4].content == "\fSentence on page 3."
assert documents[4].meta["page_number"] == 3
assert documents[4].meta["split_id"] == 4
assert documents[4].meta["split_idx_start"] == text.index(documents[4].content)
assert documents[5].content == " Another on page 3."
assert documents[5].meta["page_number"] == 3
assert documents[5].meta["split_id"] == 5
assert documents[5].meta["split_idx_start"] == text.index(documents[5].content)
assert documents[6].content == "\f\f Sentence on page"
assert documents[6].meta["page_number"] == 5
assert documents[6].meta["split_id"] == 6
assert documents[6].meta["split_idx_start"] == text.index(documents[6].content)
assert documents[7].content == " 5."
assert documents[7].meta["page_number"] == 5
assert documents[7].meta["split_id"] == 7
assert documents[7].meta["split_idx_start"] == text.index(documents[7].content)
def test_run_split_by_word_count_page_breaks_word_unit():
splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=[" "], split_unit="word")
text = "This is some text. \f This text is on another page. \f This is the last pag3."
doc = Document(content=text)
doc_chunks = splitter.run([doc])
doc_chunks = doc_chunks["documents"]
assert len(doc_chunks) == 5
assert doc_chunks[0].content == "This is some text. "
assert doc_chunks[0].meta["page_number"] == 1
assert doc_chunks[0].meta["split_id"] == 0
assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content)
assert doc_chunks[1].content == "\f This text is "
assert doc_chunks[1].meta["page_number"] == 2
assert doc_chunks[1].meta["split_id"] == 1
assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content)
assert doc_chunks[2].content == "on another page. \f "
assert doc_chunks[2].meta["page_number"] == 3
assert doc_chunks[2].meta["split_id"] == 2
assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content)
assert doc_chunks[3].content == "This is the last "
assert doc_chunks[3].meta["page_number"] == 3
assert doc_chunks[3].meta["split_id"] == 3
assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content)
assert doc_chunks[4].content == "pag3."
assert doc_chunks[4].meta["page_number"] == 3
assert doc_chunks[4].meta["split_id"] == 4
assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content)
def test_run_split_by_page_break_count_page_breaks_word_unit() -> None:
document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=8, split_overlap=0, split_unit="word")
text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
"Sentence on page 3. Another on page 3.\f\f Sentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])
chunks_docs = documents["documents"]
assert len(chunks_docs) == 4
assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f"
assert chunks_docs[0].meta["page_number"] == 1
assert chunks_docs[0].meta["split_id"] == 0
assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content)
assert chunks_docs[1].content == "Sentence on page 2. Another on page 2.\f"
assert chunks_docs[1].meta["page_number"] == 2
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f"
assert chunks_docs[2].meta["page_number"] == 3
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
assert chunks_docs[3].content == "\f Sentence on page 5."
assert chunks_docs[3].meta["page_number"] == 5
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
def test_run_split_by_new_line_count_page_breaks_word_unit() -> None:
document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=4, split_overlap=0, split_unit="word")
text = (
"Sentence on page 1.\nAnother on page 1.\n\f"
"Sentence on page 2.\nAnother on page 2.\n\f"
"Sentence on page 3.\nAnother on page 3.\n\f\f"
"Sentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])
chunks_docs = documents["documents"]
assert len(chunks_docs) == 7
assert chunks_docs[0].content == "Sentence on page 1.\n"
assert chunks_docs[0].meta["page_number"] == 1
assert chunks_docs[0].meta["split_id"] == 0
assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content)
assert chunks_docs[1].content == "Another on page 1.\n"
assert chunks_docs[1].meta["page_number"] == 1
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
assert chunks_docs[2].content == "\fSentence on page 2.\n"
assert chunks_docs[2].meta["page_number"] == 2
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
assert chunks_docs[3].content == "Another on page 2.\n"
assert chunks_docs[3].meta["page_number"] == 2
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
assert chunks_docs[4].content == "\fSentence on page 3.\n"
assert chunks_docs[4].meta["page_number"] == 3
assert chunks_docs[4].meta["split_id"] == 4
assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content)
assert chunks_docs[5].content == "Another on page 3.\n"
assert chunks_docs[5].meta["page_number"] == 3
assert chunks_docs[5].meta["split_id"] == 5
assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content)
assert chunks_docs[6].content == "\f\fSentence on page 5."
assert chunks_docs[6].meta["page_number"] == 5
assert chunks_docs[6].meta["split_id"] == 6
assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content)
def test_run_split_by_sentence_count_page_breaks_word_unit() -> None:
document_splitter = RecursiveDocumentSplitter(
separators=["sentence"], split_length=7, split_overlap=0, split_unit="word"
)
document_splitter.warm_up()
text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
"Sentence on page 3. Another on page 3.\f\fSentence on page 5."
)
documents = document_splitter.run(documents=[Document(content=text)])
chunks_docs = documents["documents"]
assert len(chunks_docs) == 7
assert chunks_docs[0].content == "Sentence on page 1. "
assert chunks_docs[0].meta["page_number"] == 1
assert chunks_docs[0].meta["split_id"] == 0
assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content)
assert chunks_docs[1].content == "Another on page 1.\f"
assert chunks_docs[1].meta["page_number"] == 1
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
assert chunks_docs[2].content == "Sentence on page 2. "
assert chunks_docs[2].meta["page_number"] == 2
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
assert chunks_docs[3].content == "Another on page 2.\f"
assert chunks_docs[3].meta["page_number"] == 2
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
assert chunks_docs[4].content == "Sentence on page 3. "
assert chunks_docs[4].meta["page_number"] == 3
assert chunks_docs[4].meta["split_id"] == 4
assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content)
assert chunks_docs[5].content == "Another on page 3.\f\f"
assert chunks_docs[5].meta["page_number"] == 3
assert chunks_docs[5].meta["split_id"] == 5
assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content)
assert chunks_docs[6].content == "Sentence on page 5."
assert chunks_docs[6].meta["page_number"] == 5
assert chunks_docs[6].meta["split_id"] == 6
assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content)
def test_run_split_by_sentence_tokenizer_document_and_overlap_word_unit_no_overlap():
splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["."], split_unit="word")
text = "This is sentence one. This is sentence two. This is sentence three."
chunks = splitter.run([Document(content=text)])["documents"]
assert len(chunks) == 3
assert chunks[0].content == "This is sentence one."
assert chunks[1].content == " This is sentence two."
assert chunks[2].content == " This is sentence three."
def test_run_split_by_dot_and_overlap_1_word_unit():
splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=1, separators=["."], split_unit="word")
text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four."
chunks = splitter.run([Document(content=text)])["documents"]
assert len(chunks) == 5
assert chunks[0].content == "This is sentence one."
assert chunks[1].content == "one. This is sentence"
assert chunks[2].content == "sentence two. This is"
assert chunks[3].content == "is sentence three. This"
assert chunks[4].content == "This is sentence four."
def test_run_trigger_dealing_with_remaining_word_larger_than_split_length():
splitter = RecursiveDocumentSplitter(split_length=3, split_overlap=2, separators=["."], split_unit="word")
text = """A simple sentence1. A bright sentence2. A clever sentence3"""
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) == 7
assert chunks[0].content == "A simple sentence1."
assert chunks[1].content == "simple sentence1. A"
assert chunks[2].content == "sentence1. A bright"
assert chunks[3].content == "A bright sentence2."
assert chunks[4].content == "bright sentence2. A"
assert chunks[5].content == "sentence2. A clever"
assert chunks[6].content == "A clever sentence3"
def test_run_trigger_dealing_with_remaining_char_larger_than_split_length():
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=15, separators=["."], split_unit="char")
text = """A simple sentence1. A bright sentence2. A clever sentence3"""
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) == 9
assert chunks[0].content == "A simple sentence1."
assert chunks[0].meta["split_id"] == 0
assert chunks[0].meta["split_idx_start"] == text.index(chunks[0].content)
assert chunks[0].meta["_split_overlap"] == [{"doc_id": chunks[1].id, "range": (0, 15)}]
assert chunks[1].content == "mple sentence1. A br"
assert chunks[1].meta["split_id"] == 1
assert chunks[1].meta["split_idx_start"] == text.index(chunks[1].content)
assert chunks[1].meta["_split_overlap"] == [
{"doc_id": chunks[0].id, "range": (4, 19)},
{"doc_id": chunks[2].id, "range": (0, 15)},
]
assert chunks[2].content == "sentence1. A bright "
assert chunks[2].meta["split_id"] == 2
assert chunks[2].meta["split_idx_start"] == text.index(chunks[2].content)
assert chunks[2].meta["_split_overlap"] == [
{"doc_id": chunks[1].id, "range": (5, 20)},
{"doc_id": chunks[3].id, "range": (0, 15)},
]
assert chunks[3].content == "nce1. A bright sente"
assert chunks[3].meta["split_id"] == 3
assert chunks[3].meta["split_idx_start"] == text.index(chunks[3].content)
assert chunks[3].meta["_split_overlap"] == [
{"doc_id": chunks[2].id, "range": (5, 20)},
{"doc_id": chunks[4].id, "range": (0, 15)},
]
assert chunks[4].content == " A bright sentence2."
assert chunks[4].meta["split_id"] == 4
assert chunks[4].meta["split_idx_start"] == text.index(chunks[4].content)
assert chunks[4].meta["_split_overlap"] == [
{"doc_id": chunks[3].id, "range": (5, 20)},
{"doc_id": chunks[5].id, "range": (0, 15)},
]
assert chunks[5].content == "ight sentence2. A cl"
assert chunks[5].meta["split_id"] == 5
assert chunks[5].meta["split_idx_start"] == text.index(chunks[5].content)
assert chunks[5].meta["_split_overlap"] == [
{"doc_id": chunks[4].id, "range": (5, 20)},
{"doc_id": chunks[6].id, "range": (0, 15)},
]
assert chunks[6].content == "sentence2. A clever "
assert chunks[6].meta["split_id"] == 6
assert chunks[6].meta["split_idx_start"] == text.index(chunks[6].content)
assert chunks[6].meta["_split_overlap"] == [
{"doc_id": chunks[5].id, "range": (5, 20)},
{"doc_id": chunks[7].id, "range": (0, 15)},
]
assert chunks[7].content == "nce2. A clever sente"
assert chunks[7].meta["split_id"] == 7
assert chunks[7].meta["split_idx_start"] == text.index(chunks[7].content)
assert chunks[7].meta["_split_overlap"] == [
{"doc_id": chunks[6].id, "range": (5, 20)},
{"doc_id": chunks[8].id, "range": (0, 15)},
]
assert chunks[8].content == " A clever sentence3"
assert chunks[8].meta["split_id"] == 8
assert chunks[8].meta["split_idx_start"] == text.index(chunks[8].content)
assert chunks[8].meta["_split_overlap"] == [{"doc_id": chunks[7].id, "range": (5, 20)}]
def test_fall_back_to_fixed_chunking_word_unit():
document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=4, split_overlap=0, split_unit="word")
text = "\x0c\x0c Sentence on page 5."
chunks = document_splitter._fall_back_to_fixed_chunking(text, split_units="word")
assert len(chunks) == 2
assert chunks[0] == "\x0c\x0c Sentence on page"
assert chunks[1] == " 5."
def test_run_serialization_in_pipeline():
pipeline = Pipeline()
pipeline.add_component("chunker", RecursiveDocumentSplitter(split_length=20, split_overlap=5, separators=["."]))
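    # a dumps/loads round trip should reproduce an identical serialized representation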
pipeline_dict = pipeline.dumps()
new_pipeline = Pipeline.loads(pipeline_dict)
assert pipeline_dict == new_pipeline.dumps()
@pytest.mark.integration
def test_run_split_by_token_count():
splitter = RecursiveDocumentSplitter(split_length=5, separators=["."], split_unit="token")
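    # split_unit="token" requires warm_up() to load the tokenizer before run() is called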
splitter.warm_up()
text = "This is a test. This is another test. This is the final test."
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) == 4
assert chunks[0].content == "This is a test."
assert chunks[1].content == " This is another test."
assert chunks[2].content == " This is the final test"
assert chunks[3].content == "."
@pytest.mark.integration
def test_run_split_by_token_count_with_html_tags():
splitter = RecursiveDocumentSplitter(split_length=4, separators=["."], split_unit="token")
splitter.warm_up()
text = "This is a test. <h><a><y><s><t><a><c><k><c><o><r><e> This is the final test."
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) == 10
@pytest.mark.integration
def test_run_split_by_token_with_sentence_tokenizer():
splitter = RecursiveDocumentSplitter(split_length=4, separators=["sentence"], split_unit="token")
splitter.warm_up()
text = "This is sentence one. This is sentence two."
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) == 4
assert chunks[0].content == "This is sentence one"
assert chunks[1].content == ". "
assert chunks[2].content == "This is sentence two"
assert chunks[3].content == "."
@pytest.mark.integration
def test_run_split_by_token_with_empty_document(caplog: LogCaptureFixture):
splitter = RecursiveDocumentSplitter(split_length=4, separators=["."], split_unit="token")
splitter.warm_up()
empty_doc = Document(content="")
doc_chunks = splitter.run([empty_doc])
doc_chunks = doc_chunks["documents"]
assert len(doc_chunks) == 0
assert "has an empty content. Skipping this document." in caplog.text
@pytest.mark.integration
def test_run_split_by_token_with_fallback():
splitter = RecursiveDocumentSplitter(split_length=2, separators=["."], split_unit="token")
splitter.warm_up()
text = "This is a very long sentence that will need to be split into smaller chunks."
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) > 1
for chunk in chunks:
assert splitter._chunk_length(chunk.content) <= 2
@pytest.mark.integration
def test_run_split_by_token_with_overlap_and_fallback():
splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=2, separators=["."], split_unit="token")
splitter.warm_up()
text = "This is a test. This is another test. This is the final test."
doc = Document(content=text)
chunks = splitter.run([doc])["documents"]
assert len(chunks) == 8
assert chunks[0].content == "This is a test"
assert chunks[1].content == " a test."
assert chunks[2].content == " test. This is"
assert chunks[3].content == " This is another test"
assert chunks[4].content == " another test. This"
assert chunks[5].content == ". This is the"
assert chunks[6].content == " is the final test"
assert chunks[7].content == " final test."
def test_run_without_warm_up_raises_error():
docs = [Document(content="text")]
splitter_token = RecursiveDocumentSplitter(split_unit="token")
with pytest.raises(RuntimeError, match="warmed up"):
splitter_token.run(docs)
splitter_sentence = RecursiveDocumentSplitter(separators=["sentence"])
with pytest.raises(RuntimeError, match="warmed up"):
splitter_sentence.run(docs)
# Test that no error is raised when warm_up is not required
splitter_no_warmup = RecursiveDocumentSplitter(separators=["."])
result = splitter_no_warmup.run(docs)
assert len(result["documents"]) == 1
assert result["documents"][0].content == "text"
def test_run_complex_text_with_multiple_separators():
"""
Test that RecursiveDocumentSplitter correctly handles complex text with multiple separators and chunks that exceed
the split_length.
"""
# Create a complex text with multiple separators and chunks of different sizes
long_text = (
"A" * 150
+ "\n\n" # triggers first-level split on \n\n
+ "B" * 100
+ "\n"
+ "B" * 105
+ "\n\n" # this chunk exceeds split_length and goes through recursion
+ "C" * 100
+ "\n\n" # short chunk1
+ "D" * 50 # short chunk2
)
doc = Document(content=long_text)
splitter = RecursiveDocumentSplitter(
split_length=200, split_overlap=0, split_unit="char", separators=["\n\n", "\n", " "]
)
splitter.warm_up()
result = splitter.run([doc])
chunks = result["documents"]
assert len(chunks) == 4
assert len(chunks[0].content) == 152
assert chunks[0].content.startswith("A")
assert len(chunks[1].content) == 101
assert chunks[1].content.startswith("B")
assert len(chunks[2].content) == 107
assert chunks[2].content.startswith("B")
assert len(chunks[3].content) == 152
assert chunks[3].content.startswith("C")
assert chunks[3].content.endswith("D" * 50)