2024-05-09 15:40:36 +02:00
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
2023-09-27 12:26:20 +02:00
import pytest
2023-11-24 14:48:43 +01:00
from haystack import Document
from haystack . components . preprocessors import DocumentSplitter
2023-09-27 12:26:20 +02:00
2023-11-03 11:33:20 +01:00
class TestDocumentSplitter:
    """Tests for ``DocumentSplitter``: input validation, splitting and split metadata.

    NOTE(review): every string literal below is reproduced character-for-character
    from the version under review, including the whitespace directly inside the
    quotes -- confirm that whitespace against the canonical file before relying
    on exact-match assertions.
    """

    @staticmethod
    def _assert_page_numbers(split_docs, expected_pages):
        """Check ``meta[" page_number "]`` of each split against ``expected_pages``.

        The two sequences are paired positionally with ``zip``, which stops at
        the shorter one: surplus splits (or surplus expectations) are not checked.

        NOTE(review): consider also asserting that both lengths match once the
        expected number of splits has been confirmed for every caller.
        """
        for split_doc, expected_page in zip(split_docs, expected_pages):
            assert split_doc.meta[" page_number "] == expected_page

    def test_non_text_document(self):
        """``run`` raises ``ValueError`` for a ``Document`` created without content."""
        # Build the component outside the ``raises`` block so that only the
        # ``run`` call can satisfy the expectation.
        splitter = DocumentSplitter()
        with pytest.raises(
            ValueError, match=" DocumentSplitter only works with text documents but document.content for document ID "
        ):
            splitter.run(documents=[Document()])

    def test_single_doc(self):
        """``run`` raises ``TypeError`` when given a bare ``Document`` instead of a list."""
        # Same reasoning as above: keep construction out of the guarded block.
        splitter = DocumentSplitter()
        with pytest.raises(TypeError, match=" DocumentSplitter expects a List of Documents as input. "):
            splitter.run(documents=Document())

    def test_empty_list(self):
        """An empty input list yields an empty output list."""
        splitter = DocumentSplitter()
        res = splitter.run(documents=[])
        assert res == {" documents ": []}

    def test_unsupported_split_by(self):
        """The constructor raises ``ValueError`` for the ``split_by`` value used here."""
        with pytest.raises(
            ValueError, match=" split_by must be one of ' word ' , ' sentence ' , ' page ' or ' passage ' . "
        ):
            DocumentSplitter(split_by=" unsupported ")

    def test_unsupported_split_length(self):
        """The constructor raises ``ValueError`` for ``split_length=0``."""
        with pytest.raises(ValueError, match=" split_length must be greater than 0. "):
            DocumentSplitter(split_length=0)

    def test_unsupported_split_overlap(self):
        """The constructor raises ``ValueError`` for ``split_overlap=-1``."""
        with pytest.raises(ValueError, match=" split_overlap must be greater than or equal to 0. "):
            DocumentSplitter(split_overlap=-1)

    def test_split_by_word(self):
        """A single document is split by word into two chunks with ``split_length=10``."""
        splitter = DocumentSplitter(split_by=" word ", split_length=10)
        source = Document(
            content=" This is a text with some words. There is a second sentence. And there is a third sentence. "
        )
        result = splitter.run(documents=[source])
        assert len(result[" documents "]) == 2
        assert result[" documents "][0].content == " This is a text with some words. There is a "
        assert result[" documents "][1].content == " second sentence. And there is a third sentence. "

    def test_split_by_word_multiple_input_docs(self):
        """Two input documents are split by word into five chunks, in input order."""
        splitter = DocumentSplitter(split_by=" word ", split_length=10)
        sources = [
            Document(
                content=" This is a text with some words. There is a second sentence. And there is a third sentence. "
            ),
            Document(
                content=" This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence. "
            ),
        ]
        result = splitter.run(documents=sources)
        assert len(result[" documents "]) == 5
        assert result[" documents "][0].content == " This is a text with some words. There is a "
        assert result[" documents "][1].content == " second sentence. And there is a third sentence. "
        assert result[" documents "][2].content == " This is a different text with some words. There is "
        assert result[" documents "][3].content == " a second sentence. And there is a third sentence. And "
        assert result[" documents "][4].content == " there is a fourth sentence. "

    def test_split_by_sentence(self):
        """Splitting by sentence with ``split_length=1`` yields one chunk per sentence."""
        splitter = DocumentSplitter(split_by=" sentence ", split_length=1)
        source = Document(
            content=" This is a text with some words. There is a second sentence. And there is a third sentence. "
        )
        result = splitter.run(documents=[source])
        assert len(result[" documents "]) == 3
        assert result[" documents "][0].content == " This is a text with some words. "
        assert result[" documents "][1].content == " There is a second sentence. "
        assert result[" documents "][2].content == " And there is a third sentence. "

    def test_split_by_passage(self):
        """Splitting by passage with ``split_length=1`` yields three chunks."""
        splitter = DocumentSplitter(split_by=" passage ", split_length=1)
        source = Document(
            content=" This is a text with some words. There is a second sentence. \n \n And there is a third sentence. \n \n And another passage. "
        )
        result = splitter.run(documents=[source])
        assert len(result[" documents "]) == 3
        assert result[" documents "][0].content == " This is a text with some words. There is a second sentence. \n \n "
        assert result[" documents "][1].content == " And there is a third sentence. \n \n "
        assert result[" documents "][2].content == " And another passage. "

    def test_split_by_page(self):
        """Splitting by page with ``split_length=1`` yields three chunks."""
        splitter = DocumentSplitter(split_by=" page ", split_length=1)
        source = Document(
            content=" This is a text with some words. There is a second sentence. \f And there is a third sentence. \f And another passage. "
        )
        result = splitter.run(documents=[source])
        assert len(result[" documents "]) == 3
        assert result[" documents "][0].content == " This is a text with some words. There is a second sentence. \x0c "
        assert result[" documents "][1].content == " And there is a third sentence. \x0c "
        assert result[" documents "][2].content == " And another passage. "

    def test_split_by_word_with_overlap(self):
        """With ``split_overlap=2`` the second chunk repeats the tail of the first."""
        splitter = DocumentSplitter(split_by=" word ", split_length=10, split_overlap=2)
        source = Document(
            content=" This is a text with some words. There is a second sentence. And there is a third sentence. "
        )
        result = splitter.run(documents=[source])
        assert len(result[" documents "]) == 2
        assert result[" documents "][0].content == " This is a text with some words. There is a "
        assert result[" documents "][1].content == " is a second sentence. And there is a third sentence. "

    def test_source_id_stored_in_metadata(self):
        """Each split records the id of its originating document in its metadata."""
        splitter = DocumentSplitter(split_by=" word ", split_length=10)
        doc1 = Document(content=" This is a text with some words. ")
        doc2 = Document(content=" This is a different text with some words. ")
        result = splitter.run(documents=[doc1, doc2])
        assert result[" documents "][0].meta[" source_id "] == doc1.id
        assert result[" documents "][1].meta[" source_id "] == doc2.id

    def test_copy_metadata(self):
        """Splits keep the source metadata and get distinct ids despite equal content."""
        splitter = DocumentSplitter(split_by=" word ", split_length=10)
        documents = [
            Document(content=" Text. ", meta={" name ": " doc 0 "}),
            Document(content=" Text. ", meta={" name ": " doc 1 "}),
        ]
        result = splitter.run(documents=documents)
        assert len(result[" documents "]) == 2
        assert result[" documents "][0].id != result[" documents "][1].id
        for doc, split_doc in zip(documents, result[" documents "]):
            # Subset comparison on the item views: every source meta entry
            # must be present in the split's meta; extra entries are allowed.
            assert doc.meta.items() <= split_doc.meta.items()
            assert split_doc.content == " Text. "

    def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
        """Page numbers stored on word splits (no overlap) match the expected values."""
        splitter = DocumentSplitter(split_by=" word ", split_length=2)
        doc1 = Document(content=" This is some text. \f This text is on another page. ")
        doc2 = Document(content=" This content has two. \f \f page brakes. ")
        result = splitter.run(documents=[doc1, doc2])
        expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
        """Page numbers stored on sentence splits (no overlap) match the expected values."""
        splitter = DocumentSplitter(split_by=" sentence ", split_length=1)
        doc1 = Document(content=" This is some text. \f This text is on another page. ")
        doc2 = Document(content=" This content has two. \f \f page brakes. ")
        result = splitter.run(documents=[doc1, doc2])
        expected_pages = [1, 1, 1, 1]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
        """Page numbers stored on passage splits (no overlap) match the expected values."""
        splitter = DocumentSplitter(split_by=" passage ", split_length=1)
        doc1 = Document(
            content=" This is a text with some words. \f There is a second sentence. \n \n And there is a third sentence. \n \n And more passages. \n \n \f And another passage. "
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 2, 2, 2]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
        """Page numbers stored on page splits (no overlap) for ``split_length`` 1 and 2."""
        # First scenario: one page per split.
        splitter = DocumentSplitter(split_by=" page ", split_length=1)
        doc1 = Document(
            content=" This is a text with some words. There is a second sentence. \f And there is a third sentence. \f And another passage. "
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 2, 3]
        self._assert_page_numbers(result[" documents "], expected_pages)

        # Second scenario: same text, two pages per split.
        splitter = DocumentSplitter(split_by=" page ", split_length=2)
        doc1 = Document(
            content=" This is a text with some words. There is a second sentence. \f And there is a third sentence. \f And another passage. "
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 3]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_word_split(self):
        """Page numbers stored on overlapping word splits match the expected values."""
        splitter = DocumentSplitter(split_by=" word ", split_length=3, split_overlap=1)
        doc1 = Document(content=" This is some text. And \f this text is on another page. ")
        doc2 = Document(content=" This content has two. \f \f page brakes. ")
        result = splitter.run(documents=[doc1, doc2])
        expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
        """Page numbers stored on overlapping sentence splits match the expected values."""
        splitter = DocumentSplitter(split_by=" sentence ", split_length=2, split_overlap=1)
        doc1 = Document(content=" This is some text. And this is more text. \f This text is on another page. End. ")
        doc2 = Document(content=" This content has two. \f \f page brakes. More text. ")
        result = splitter.run(documents=[doc1, doc2])
        expected_pages = [1, 1, 1, 2, 1, 1]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_passage_split(self):
        """Page numbers stored on overlapping passage splits match the expected values."""
        splitter = DocumentSplitter(split_by=" passage ", split_length=2, split_overlap=1)
        doc1 = Document(
            content=" This is a text with some words. \f There is a second sentence. \n \n And there is a third sentence. \n \n And more passages. \n \n \f And another passage. "
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 2, 2]
        self._assert_page_numbers(result[" documents "], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_page_split(self):
        """Page numbers stored on overlapping page splits match the expected values."""
        splitter = DocumentSplitter(split_by=" page ", split_length=2, split_overlap=1)
        doc1 = Document(
            content=" This is a text with some words. There is a second sentence. \f And there is a third sentence. \f And another passage. "
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 2, 3]
        self._assert_page_numbers(result[" documents "], expected_pages)