haystack/test/components/preprocessors/test_document_preprocessor.py

137 lines
5.4 KiB
Python
Raw Normal View History

feat: integrate two ready-made SuperComponents from haystack-experimental (#9235) * Add super component decorator * Add reno * MultiFileConverter * Add DocumentPreprocessor * Add reno * Add tests and change doc preprocessor to split first then clean * Remove code from merge * Add to pydoc and missing test file * PR comments * Lint fix * Fix mypy * Fix mypy * Add comment * PR comments * Update haystack/components/converters/multi_file_converter.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/converters/multi_file_converter.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * PR comments * PR comment --------- Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
2025-04-17 12:02:26 +02:00
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
feat: integrate two ready-made SuperComponents from haystack-experimental (#9235) * Add super component decorator * Add reno * MultiFileConverter * Add DocumentPreprocessor * Add reno * Add tests and change doc preprocessor to split first then clean * Remove code from merge * Add to pydoc and missing test file * PR comments * Lint fix * Fix mypy * Fix mypy * Add comment * PR comments * Update haystack/components/converters/multi_file_converter.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/preprocessors/document_preprocessor.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/converters/multi_file_converter.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * PR comments * PR comment --------- Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
2025-04-17 12:02:26 +02:00
import pytest
from haystack import Document, Pipeline
from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor
class TestDocumentPreprocessor:
    """Unit tests for the ``DocumentPreprocessor`` super component."""

    @pytest.fixture
    def preprocessor(self) -> DocumentPreprocessor:
        """Return a preprocessor that splits by word (length 3, overlap 1) and cleans whitespace."""
        return DocumentPreprocessor(
            # Cleaner parameters
            remove_empty_lines=True,
            remove_extra_whitespaces=True,
            remove_repeated_substrings=False,
            keep_id=True,
            # Splitter parameters
            split_by="word",
            split_length=3,
            split_overlap=1,
            respect_sentence_boundary=False,
            language="en",
        )

    def test_init(self, preprocessor: DocumentPreprocessor) -> None:
        """Check the pipeline wiring and that init parameters reach the cleaner and splitter."""
        assert isinstance(preprocessor.pipeline, Pipeline)
        # Input documents are routed to the splitter; the result is read from the cleaner.
        assert preprocessor.input_mapping == {"documents": ["splitter.documents"]}
        assert preprocessor.output_mapping == {"cleaner.documents": "documents"}

        cleaner = preprocessor.pipeline.get_component("cleaner")
        assert cleaner.remove_empty_lines is True
        assert cleaner.remove_extra_whitespaces is True
        assert cleaner.remove_repeated_substrings is False
        assert cleaner.keep_id is True

        splitter = preprocessor.pipeline.get_component("splitter")
        assert splitter.split_by == "word"
        assert splitter.split_length == 3
        assert splitter.split_overlap == 1
        assert splitter.respect_sentence_boundary is False
        assert splitter.language == "en"

    def test_from_dict(self) -> None:
        """Check that ``from_dict`` returns a ``DocumentPreprocessor`` for a full parameter dict."""
        data = {
            "init_parameters": {
                "remove_empty_lines": True,
                "remove_extra_whitespaces": True,
                "remove_repeated_substrings": False,
                "keep_id": True,
                "remove_substrings": None,
                "remove_regex": None,
                "unicode_normalization": None,
                "ascii_only": False,
                "split_by": "word",
                "split_length": 3,
                "split_overlap": 1,
                "split_threshold": 0,
                "splitting_function": None,
                "respect_sentence_boundary": False,
                "language": "en",
                "use_split_rules": True,
                "extend_abbreviations": True,
            },
            "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
        }
        preprocessor = DocumentPreprocessor.from_dict(data)
        assert isinstance(preprocessor, DocumentPreprocessor)

    def test_to_dict(self, preprocessor: DocumentPreprocessor) -> None:
        """Check that ``to_dict`` emits every init parameter, including those left at their defaults."""
        expected = {
            "init_parameters": {
                "remove_empty_lines": True,
                "remove_extra_whitespaces": True,
                "remove_repeated_substrings": False,
                "keep_id": True,
                "remove_substrings": None,
                "remove_regex": None,
                "unicode_normalization": None,
                "ascii_only": False,
                "split_by": "word",
                "split_length": 3,
                "split_overlap": 1,
                "split_threshold": 0,
                "splitting_function": None,
                "respect_sentence_boundary": False,
                "language": "en",
                "use_split_rules": True,
                "extend_abbreviations": True,
            },
            "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
        }
        assert preprocessor.to_dict() == expected

    def test_warm_up(self, preprocessor: DocumentPreprocessor) -> None:
        """Check that ``warm_up`` calls the wrapped pipeline's ``warm_up`` exactly once."""
        with patch.object(preprocessor.pipeline, "warm_up") as mock_warm_up:
            preprocessor.warm_up()
            mock_warm_up.assert_called_once()

    def test_run(self, preprocessor: DocumentPreprocessor) -> None:
        """Run the preprocessor on two documents and check the split, cleaned output."""
        documents = [
            Document(content="This is a test document. It has multiple sentences."),
            Document(content="Another test document with some content."),
        ]

        preprocessor.warm_up()
        result = preprocessor.run(documents=documents)

        # Check that we got processed documents back
        assert "documents" in result
        processed_docs = result["documents"]
        assert len(processed_docs) > len(documents)  # Should have more docs due to splitting

        # Check that the content was cleaned and split
        for doc in processed_docs:
            assert doc.content.strip() == doc.content
            assert len(doc.content.split()) <= 3  # Split length of 3 words
            assert doc.id is not None

    def test_run_with_custom_splitting_function(self) -> None:
        """Check that a user-supplied splitting function is used when ``split_by="function"``."""

        def custom_split(text: str) -> list[str]:
            # Split on periods and drop whitespace-only fragments.
            return [t for t in text.split(".") if t.strip() != ""]

        preprocessor = DocumentPreprocessor(split_by="function", splitting_function=custom_split, split_length=1)
        documents = [Document(content="First sentence. Second sentence. Third sentence.")]

        preprocessor.warm_up()
        result = preprocessor.run(documents=documents)

        processed_docs = result["documents"]
        assert len(processed_docs) == 3  # Should be split into 3 sentences
        assert all("." not in doc.content for doc in processed_docs)  # Each doc should be a single sentence