feat: integrate two ready-made SuperComponents from haystack-experimental (#9235)
* Add super component decorator
* Add reno
* MultiFileConverter
* Add DocumentPreprocessor
* Add reno
* Add tests and change doc preprocessor to split first then clean
* Remove code from merge
* Add to pydoc and missing test file
* PR comments
* Lint fix
* Fix mypy
* Fix mypy
* Add comment
* PR comments
* Update haystack/components/converters/multi_file_converter.py (review suggestions; Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>)
* Update haystack/components/preprocessors/document_preprocessor.py (review suggestions; Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>)
* PR comments
* PR comment

---------

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent 4279d7e149
commit 19cf220136
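Taken together, the commit adds the convert-then-preprocess half of an indexing pipeline as two drop-in components. A minimal sketch of how they chain, based only on the APIs added in this diff (file names are illustrative, and the optional converter dependencies must be installed):

```python
from haystack.components.converters import MultiFileConverter
from haystack.components.preprocessors import DocumentPreprocessor

converter = MultiFileConverter()
preprocessor = DocumentPreprocessor(split_by="word", split_length=250)
converter.warm_up()
preprocessor.warm_up()

# Route each source to the right converter by mime type, join into one list.
docs = converter.run(sources=["report.pdf", "notes.md"], meta={})["documents"]
# Split first, then clean (the order this PR settled on).
processed = preprocessor.run(documents=docs)["documents"]
```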
@@ -10,6 +10,7 @@ loaders:
 "json",
 "markdown",
 "msg",
+"multi_file_converter",
 "openapi_functions",
 "output_adapter",
 "pdfminer",
@@ -5,6 +5,7 @@ loaders:
 "csv_document_cleaner",
 "csv_document_splitter",
 "document_cleaner",
+"document_preprocessor",
 "document_splitter",
 "hierarchical_document_splitter",
 "recursive_splitter",
@@ -15,6 +15,7 @@ _import_structure = {
     "json": ["JSONConverter"],
     "markdown": ["MarkdownToDocument"],
     "msg": ["MSGToDocument"],
+    "multi_file_converter": ["MultiFileConverter"],
     "openapi_functions": ["OpenAPIServiceToFunctions"],
     "output_adapter": ["OutputAdapter"],
     "pdfminer": ["PDFMinerToDocument"],
@@ -33,6 +34,7 @@ if TYPE_CHECKING:
     from .json import JSONConverter
     from .markdown import MarkdownToDocument
     from .msg import MSGToDocument
+    from .multi_file_converter import MultiFileConverter
     from .openapi_functions import OpenAPIServiceToFunctions
     from .output_adapter import OutputAdapter
     from .pdfminer import PDFMinerToDocument
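Both `__init__.py` hunks follow the package's lazy-import convention: a name is listed in `_import_structure` so it is resolved on first access, and re-imported under `TYPE_CHECKING` so static checkers still see it. A generic sketch of the idea (illustrative only; haystack wires this up through its own lazy importer, not this exact code):

```python
# Sketch of a lazy package __init__ using PEP 562 module __getattr__.
import importlib
from typing import TYPE_CHECKING

_import_structure = {"multi_file_converter": ["MultiFileConverter"]}

if TYPE_CHECKING:
    from .multi_file_converter import MultiFileConverter


def __getattr__(name: str):
    # Resolve e.g. "MultiFileConverter" to its submodule on first access.
    for module, names in _import_structure.items():
        if name in names:
            return getattr(importlib.import_module(f".{module}", __name__), name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```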
haystack/components/converters/multi_file_converter.py (new file, 118 lines)
@@ -0,0 +1,118 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from enum import Enum

from haystack import Pipeline, super_component
from haystack.components.converters import (
    CSVToDocument,
    DOCXToDocument,
    HTMLToDocument,
    JSONConverter,
    PPTXToDocument,
    PyPDFToDocument,
    TextFileToDocument,
    XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.routers import FileTypeRouter


class ConverterMimeType(str, Enum):
    CSV = "text/csv"
    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    HTML = "text/html"
    JSON = "application/json"
    MD = "text/markdown"
    TEXT = "text/plain"
    PDF = "application/pdf"
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


@super_component
class MultiFileConverter:
    """
    A file converter that handles conversion of multiple file types.

    The MultiFileConverter handles the following file types:
    - CSV
    - DOCX
    - HTML
    - JSON
    - MD
    - TEXT
    - PDF (no OCR)
    - PPTX
    - XLSX

    Usage example:
    ```python
    from haystack.components.converters import MultiFileConverter

    converter = MultiFileConverter()
    converter.run(sources=["test.txt", "test.pdf"], meta={})
    ```
    """

    def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
        """
        Initialize the MultiFileConverter.

        :param encoding: The encoding to use when reading files.
        :param json_content_key: The key in the JSON source whose value is used as the document's content
            when converting JSON files.
        """
        self.encoding = encoding
        self.json_content_key = json_content_key

        # initialize components
        router = FileTypeRouter(
            mime_types=[mime_type.value for mime_type in ConverterMimeType],
            # Ensure common extensions are registered. Tests on Windows fail otherwise.
            additional_mimetypes={
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            },
        )

        # Create pipeline and add components
        pp = Pipeline()

        # We use type ignore here to avoid type checking errors
        # This is due to how the run method within the Component protocol is defined
        pp.add_component("router", router)  # type: ignore[arg-type]
        pp.add_component("docx", DOCXToDocument(link_format="markdown"))  # type: ignore[arg-type]
        pp.add_component(
            "html",
            HTMLToDocument(  # type: ignore[arg-type]
                extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
            ),
        )
        pp.add_component("json", JSONConverter(content_key=self.json_content_key))  # type: ignore[arg-type]
        pp.add_component("md", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
        pp.add_component("text", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
        pp.add_component("pdf", PyPDFToDocument())  # type: ignore[arg-type]
        pp.add_component("pptx", PPTXToDocument())  # type: ignore[arg-type]
        pp.add_component("xlsx", XLSXToDocument())  # type: ignore[arg-type]
        pp.add_component("joiner", DocumentJoiner())  # type: ignore[arg-type]
        pp.add_component("csv", CSVToDocument(encoding=self.encoding))  # type: ignore[arg-type]

        # Route each mime type to the matching converter: the component names above are the
        # lowercased enum member names, e.g. str(ConverterMimeType.PDF) == "ConverterMimeType.PDF",
        # which lower() and rsplit(".") reduce to "pdf".
        for mime_type in ConverterMimeType:
            pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])

        pp.connect("docx.documents", "joiner.documents")
        pp.connect("html.documents", "joiner.documents")
        pp.connect("json.documents", "joiner.documents")
        pp.connect("md.documents", "joiner.documents")
        pp.connect("text.documents", "joiner.documents")
        pp.connect("pdf.documents", "joiner.documents")
        pp.connect("pptx.documents", "joiner.documents")

        pp.connect("csv.documents", "joiner.documents")
        pp.connect("xlsx.documents", "joiner.documents")

        self.pipeline = pp
        self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}
        self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}
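The three closing assignments are the whole contract of the `@super_component` decorator: any class whose `__init__` sets `self.pipeline` plus the input/output mappings becomes a regular, runnable component. A minimal sketch of the same pattern with a hypothetical one-component wrapper (the class below is illustrative, not part of this PR):

```python
from haystack import Pipeline, super_component
from haystack.components.preprocessors import DocumentCleaner


@super_component
class CleanerOnly:
    """Hypothetical SuperComponent wrapping a single DocumentCleaner."""

    def __init__(self) -> None:
        pp = Pipeline()
        pp.add_component("cleaner", DocumentCleaner())

        self.pipeline = pp
        # Expose cleaner's "documents" sockets as the component's own.
        self.input_mapping = {"documents": ["cleaner.documents"]}
        self.output_mapping = {"cleaner.documents": "documents"}
```

`CleanerOnly().run(documents=[...])` then behaves like a plain component; the same recipe scales to the multi-branch pipeline above.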
@@ -11,6 +11,7 @@ _import_structure = {
     "csv_document_cleaner": ["CSVDocumentCleaner"],
     "csv_document_splitter": ["CSVDocumentSplitter"],
     "document_cleaner": ["DocumentCleaner"],
+    "document_preprocessor": ["DocumentPreprocessor"],
     "document_splitter": ["DocumentSplitter"],
     "hierarchical_document_splitter": ["HierarchicalDocumentSplitter"],
     "recursive_splitter": ["RecursiveDocumentSplitter"],
@@ -21,6 +22,7 @@ if TYPE_CHECKING:
     from .csv_document_cleaner import CSVDocumentCleaner
     from .csv_document_splitter import CSVDocumentSplitter
     from .document_cleaner import DocumentCleaner
+    from .document_preprocessor import DocumentPreprocessor
     from .document_splitter import DocumentSplitter
     from .hierarchical_document_splitter import HierarchicalDocumentSplitter
     from .recursive_splitter import RecursiveDocumentSplitter
haystack/components/preprocessors/document_preprocessor.py (new file, 192 lines)
@@ -0,0 +1,192 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Callable, Dict, List, Literal, Optional

from haystack import Pipeline, default_from_dict, default_to_dict, super_component
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.utils import deserialize_callable, serialize_callable


@super_component
class DocumentPreprocessor:
    """
    A SuperComponent that first splits and then cleans documents.

    This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
    It takes a list of documents as input and returns a processed list of documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentPreprocessor

    doc = Document(content="I love pizza!")
    preprocessor = DocumentPreprocessor()
    result = preprocessor.run(documents=[doc])
    print(result["documents"])
    ```
    """

    def __init__(  # noqa: PLR0913 (too-many-arguments)
        self,
        *,
        # --- DocumentSplitter arguments ---
        split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
        split_length: int = 250,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
        respect_sentence_boundary: bool = False,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
        # --- DocumentCleaner arguments ---
        remove_empty_lines: bool = True,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        keep_id: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
        ascii_only: bool = False,
    ) -> None:
        """
        Initialize a DocumentPreprocessor that first splits and then cleans documents.

        **Splitter Parameters**:
        :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
        :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
        :param split_overlap: The number of overlapping units between consecutive splits.
        :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
            with the previous split.
        :param splitting_function: A custom function for splitting if `split_by="function"`.
        :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
        :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
            `respect_sentence_boundary=True`.
        :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
        :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
            languages.

        **Cleaner Parameters**:
        :param remove_empty_lines: If `True`, removes empty lines.
        :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
        :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
        :param keep_id: If `True`, keeps the original document IDs.
        :param remove_substrings: A list of strings to remove from the document content.
        :param remove_regex: A regex pattern whose matches will be removed from the document content.
        :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
        :param ascii_only: If `True`, converts text to ASCII only.
        """
        # Store arguments for serialization
        self.remove_empty_lines = remove_empty_lines
        self.remove_extra_whitespaces = remove_extra_whitespaces
        self.remove_repeated_substrings = remove_repeated_substrings
        self.keep_id = keep_id
        self.remove_substrings = remove_substrings
        self.remove_regex = remove_regex
        self.unicode_normalization = unicode_normalization
        self.ascii_only = ascii_only

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
        self.respect_sentence_boundary = respect_sentence_boundary
        self.language = language
        self.use_split_rules = use_split_rules
        self.extend_abbreviations = extend_abbreviations

        # Instantiate sub-components
        splitter = DocumentSplitter(
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=self.splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

        cleaner = DocumentCleaner(
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
        )

        # Build the Pipeline
        pp = Pipeline()

        # We use type ignore here to avoid type checking errors
        # This is due to how the run method within the Component protocol is defined
        pp.add_component("splitter", splitter)  # type: ignore[arg-type]
        pp.add_component("cleaner", cleaner)  # type: ignore[arg-type]

        # Connect the splitter output to cleaner
        pp.connect("splitter.documents", "cleaner.documents")
        self.pipeline = pp

        # Define how pipeline inputs/outputs map to sub-component inputs/outputs
        self.input_mapping = {
            # The pipeline input "documents" feeds into "splitter.documents"
            "documents": ["splitter.documents"]
        }
        # The pipeline output "documents" comes from "cleaner.documents"
        self.output_mapping = {"cleaner.documents": "documents"}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize SuperComponent to a dictionary.

        :return:
            Dictionary with serialized data.
        """
        splitting_function = None
        if self.splitting_function is not None:
            splitting_function = serialize_callable(self.splitting_function)

        return default_to_dict(
            self,
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor":
        """
        Deserializes the SuperComponent from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized SuperComponent.
        """
        # Guard against None: to_dict serializes splitting_function as None when unset,
        # and deserialize_callable expects a dotted-path string.
        if data["init_parameters"].get("splitting_function"):
            data["init_parameters"]["splitting_function"] = deserialize_callable(
                data["init_parameters"]["splitting_function"]
            )

        return default_from_dict(cls, data)
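The custom `to_dict`/`from_dict` pair exists mainly so `splitting_function` survives serialization: `serialize_callable` stores the callable as a dotted import path, and `deserialize_callable` re-imports it. A small round-trip sketch under that assumption (the function name is illustrative):

```python
from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor


def split_on_pipe(text: str) -> list[str]:
    # Must live at module level so serialize_callable can record its import path.
    return [part for part in text.split("|") if part.strip()]


pre = DocumentPreprocessor(split_by="function", splitting_function=split_on_pipe)
data = pre.to_dict()
# data["init_parameters"]["splitting_function"] is now a dotted-path string,
# so the whole dict is YAML/JSON friendly.
restored = DocumentPreprocessor.from_dict(data)
```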
@@ -0,0 +1,9 @@
---
highlights: >
  Two ready-made SuperComponents simplify document preprocessing: MultiFileConverter and DocumentPreprocessor.
  For example, if you have all extra dependencies for file conversion installed, you can run the following:
  ```python
  from haystack.components.converters import MultiFileConverter
  converter = MultiFileConverter()
  converter.run(sources=["test.txt", "test.pdf"], meta={})
  ```
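The highlight only demonstrates the converter; a matching sketch for the second new SuperComponent, with parameter values mirroring the tests below (chosen for illustration):

```python
from haystack import Document
from haystack.components.preprocessors import DocumentPreprocessor

preprocessor = DocumentPreprocessor(split_by="word", split_length=3, split_overlap=1)
preprocessor.warm_up()

result = preprocessor.run(documents=[Document(content="This is a test document. It has multiple sentences.")])
print(result["documents"])  # chunks of at most three words, cleaned
```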
test/components/converters/test_multi_file_converter.py (new file, 145 lines)
@@ -0,0 +1,145 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import pytest

from haystack import Document, Pipeline
from haystack.core.pipeline.base import component_to_dict, component_from_dict
from haystack.core.component.component import Component
from haystack.dataclasses import ByteStream
from haystack.components.converters.multi_file_converter import MultiFileConverter


@pytest.fixture
def converter():
    converter = MultiFileConverter()
    converter.warm_up()
    return converter


class TestMultiFileConverter:
    def test_init_default_params(self, converter):
        """Test initialization with default parameters"""
        assert converter.encoding == "utf-8"
        assert converter.json_content_key == "content"
        assert isinstance(converter, Component)

    def test_init_custom_params(self, converter):
        """Test initialization with custom parameters"""
        converter = MultiFileConverter(encoding="latin-1", json_content_key="text")
        assert converter.encoding == "latin-1"
        assert converter.json_content_key == "text"

    def test_to_dict(self, converter):
        """Test serialization to dictionary"""
        data = component_to_dict(converter, "converter")
        assert data == {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "utf-8", "json_content_key": "content"},
        }

    def test_from_dict(self):
        """Test deserialization from dictionary"""
        data = {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "latin-1", "json_content_key": "text"},
        }
        conv = component_from_dict(MultiFileConverter, data, "converter")
        assert conv.encoding == "latin-1"
        assert conv.json_content_key == "text"

    @pytest.mark.parametrize(
        "suffix,file_path",
        [
            ("csv", "csv/sample_1.csv"),
            ("docx", "docx/sample_docx.docx"),
            ("html", "html/what_is_haystack.html"),
            ("json", "json/json_conversion_testfile.json"),
            ("md", "markdown/sample.md"),
            ("pdf", "pdf/sample_pdf_1.pdf"),
            ("pptx", "pptx/sample_pptx.pptx"),
            ("txt", "txt/doc_1.txt"),
            ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
        ],
    )
    @pytest.mark.integration
    def test_run(self, test_files_path, converter, suffix, file_path):
        unclassified_bytestream = ByteStream(b"unclassified content")
        unclassified_bytestream.meta["content_type"] = "unknown_type"

        paths = [test_files_path / file_path, unclassified_bytestream]

        output = converter.run(sources=paths)
        docs = output["documents"]
        unclassified = output["unclassified"]

        assert len(docs) == 1
        assert isinstance(docs[0], Document)
        assert docs[0].content is not None
        assert docs[0].meta["file_path"].endswith(suffix)

        assert len(unclassified) == 1
        assert isinstance(unclassified[0], ByteStream)
        assert unclassified[0].meta["content_type"] == "unknown_type"

    def test_run_with_meta(self, test_files_path, converter):
        """Test conversion with metadata"""
        paths = [test_files_path / "txt" / "doc_1.txt"]
        meta = {"language": "en", "author": "test"}
        output = converter.run(sources=paths, meta=meta)
        docs = output["documents"]
        assert docs[0].meta["language"] == "en"
        assert docs[0].meta["author"] == "test"

    def test_run_with_bytestream(self, test_files_path, converter):
        """Test converting ByteStream input"""
        bytestream = ByteStream(data=b"test content", mime_type="text/plain", meta={"file_path": "test.txt"})
        output = converter.run(sources=[bytestream])
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content == "test content"
        assert docs[0].meta["file_path"] == "test.txt"

    def test_run_error_handling(self, test_files_path, converter, caplog):
        """Test error handling for non-existent files"""
        paths = [test_files_path / "non_existent.txt"]
        with caplog.at_level("WARNING"):
            output = converter.run(sources=paths)
            assert "Could not read" in caplog.text
            assert len(output["documents"]) == 0

    @pytest.mark.integration
    def test_run_all_file_types(self, test_files_path, converter):
        """Test converting all supported file types in parallel"""
        paths = [
            test_files_path / "csv" / "sample_1.csv",
            test_files_path / "docx" / "sample_docx.docx",
            test_files_path / "html" / "what_is_haystack.html",
            test_files_path / "json" / "json_conversion_testfile.json",
            test_files_path / "markdown" / "sample.md",
            test_files_path / "txt" / "doc_1.txt",
            test_files_path / "pdf" / "sample_pdf_1.pdf",
            test_files_path / "pptx" / "sample_pptx.pptx",
            test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx",
        ]
        output = converter.run(sources=paths)
        docs = output["documents"]

        # Verify we got a document for each file
        assert len(docs) == len(paths)
        assert all(isinstance(doc, Document) for doc in docs)

    @pytest.mark.integration
    def test_run_in_pipeline(self, test_files_path, converter):
        pipeline = Pipeline(max_runs_per_component=1)
        pipeline.add_component("converter", converter)

        paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "pdf" / "sample_pdf_1.pdf"]

        output = pipeline.run(data={"sources": paths})
        docs = output["converter"]["documents"]

        assert len(docs) == 2
        assert all(isinstance(doc, Document) for doc in docs)
        assert all(doc.content is not None for doc in docs)
test/components/preprocessors/test_document_preprocessor.py (new file, 128 lines)
@@ -0,0 +1,128 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import patch
import pytest

from haystack import Document, Pipeline
from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor


class TestDocumentPreprocessor:
    @pytest.fixture
    def preprocessor(self) -> DocumentPreprocessor:
        return DocumentPreprocessor(
            # Cleaner parameters
            remove_empty_lines=True,
            remove_extra_whitespaces=True,
            remove_repeated_substrings=False,
            keep_id=True,
            # Splitter parameters
            split_by="word",
            split_length=3,
            split_overlap=1,
            respect_sentence_boundary=False,
            language="en",
        )

    def test_init(self, preprocessor: DocumentPreprocessor) -> None:
        assert isinstance(preprocessor.pipeline, Pipeline)
        assert preprocessor.input_mapping == {"documents": ["splitter.documents"]}
        assert preprocessor.output_mapping == {"cleaner.documents": "documents"}

        cleaner = preprocessor.pipeline.get_component("cleaner")
        assert cleaner.remove_empty_lines is True
        assert cleaner.remove_extra_whitespaces is True
        assert cleaner.remove_repeated_substrings is False
        assert cleaner.keep_id is True

        splitter = preprocessor.pipeline.get_component("splitter")
        assert splitter.split_by == "word"
        assert splitter.split_length == 3
        assert splitter.split_overlap == 1
        assert splitter.respect_sentence_boundary is False
        assert splitter.language == "en"

    def test_from_dict(self) -> None:
        preprocessor = DocumentPreprocessor.from_dict(
            {
                "init_parameters": {
                    "remove_empty_lines": True,
                    "remove_extra_whitespaces": True,
                    "remove_repeated_substrings": False,
                    "keep_id": True,
                    "split_by": "word",
                    "split_length": 3,
                    "split_overlap": 1,
                    "respect_sentence_boundary": False,
                    "language": "en",
                },
                "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
            }
        )
        assert isinstance(preprocessor, DocumentPreprocessor)

    def test_to_dict(self, preprocessor: DocumentPreprocessor) -> None:
        expected = {
            "init_parameters": {
                "remove_empty_lines": True,
                "remove_extra_whitespaces": True,
                "remove_repeated_substrings": False,
                "keep_id": True,
                "remove_substrings": None,
                "remove_regex": None,
                "unicode_normalization": None,
                "ascii_only": False,
                "split_by": "word",
                "split_length": 3,
                "split_overlap": 1,
                "split_threshold": 0,
                "splitting_function": None,
                "respect_sentence_boundary": False,
                "language": "en",
                "use_split_rules": True,
                "extend_abbreviations": True,
            },
            "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
        }
        assert preprocessor.to_dict() == expected

    def test_warm_up(self, preprocessor: DocumentPreprocessor) -> None:
        with patch.object(preprocessor.pipeline, "warm_up") as mock_warm_up:
            preprocessor.warm_up()
            mock_warm_up.assert_called_once()

    def test_run(self, preprocessor: DocumentPreprocessor) -> None:
        documents = [
            Document(content="This is a test document. It has multiple sentences."),
            Document(content="Another test document with some content."),
        ]

        preprocessor.warm_up()
        result = preprocessor.run(documents=documents)

        # Check that we got processed documents back
        assert "documents" in result
        processed_docs = result["documents"]
        assert len(processed_docs) > len(documents)  # Should have more docs due to splitting

        # Check that the content was cleaned and split
        for doc in processed_docs:
            assert doc.content.strip() == doc.content
            assert len(doc.content.split()) <= 3  # Split length of 3 words
            assert doc.id is not None

    def test_run_with_custom_splitting_function(self) -> None:
        def custom_split(text: str) -> list[str]:
            return [t for t in text.split(".") if t.strip() != ""]

        preprocessor = DocumentPreprocessor(split_by="function", splitting_function=custom_split, split_length=1)

        documents = [Document(content="First sentence. Second sentence. Third sentence.")]
        preprocessor.warm_up()
        result = preprocessor.run(documents=documents)

        processed_docs = result["documents"]
        assert len(processed_docs) == 3  # Should be split into 3 sentences
        assert all("." not in doc.content for doc in processed_docs)  # Each doc should be a single sentence
test/test_files/json/json_conversion_testfile.json (new file, 1 line)
@@ -0,0 +1 @@
{"content": "Content from a json file"}