feat: integrate two ready-made SuperComponents from haystack-experimental (#9235)

* Add super component decorator

* Add reno

* MultiFileConverter

* Add DocumentPreprocessor

* Add reno

* Add tests and change doc preprocessor to split first then clean

* Remove code from merge

* Add to pydoc and missing test file

* PR comments

* Lint fix

* Fix mypy

* Fix mypy

* Add comment

* PR comments

* Update haystack/components/converters/multi_file_converter.py

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>

* Update haystack/components/preprocessors/document_preprocessor.py

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>

* PR comments

* PR comment

---------

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
Sebastian Husch Lee 2025-04-17 12:02:26 +02:00 committed by GitHub
parent 4279d7e149
commit 19cf220136
GPG Key ID: B5690EEEBB952194
10 changed files with 599 additions and 0 deletions

View File

@@ -10,6 +10,7 @@ loaders:
"json",
"markdown",
"msg",
"multi_file_converter",
"openapi_functions",
"output_adapter",
"pdfminer",

View File

@@ -5,6 +5,7 @@ loaders:
"csv_document_cleaner",
"csv_document_splitter",
"document_cleaner",
"document_preprocessor",
"document_splitter",
"hierarchical_document_splitter",
"recursive_splitter",

View File

@@ -15,6 +15,7 @@ _import_structure = {
"json": ["JSONConverter"],
"markdown": ["MarkdownToDocument"],
"msg": ["MSGToDocument"],
"multi_file_converter": ["MultiFileConverter"],
"openapi_functions": ["OpenAPIServiceToFunctions"],
"output_adapter": ["OutputAdapter"],
"pdfminer": ["PDFMinerToDocument"],
@@ -33,6 +34,7 @@ if TYPE_CHECKING:
from .json import JSONConverter
from .markdown import MarkdownToDocument
from .msg import MSGToDocument
from .multi_file_converter import MultiFileConverter
from .openapi_functions import OpenAPIServiceToFunctions
from .output_adapter import OutputAdapter
from .pdfminer import PDFMinerToDocument

View File

@@ -0,0 +1,118 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from enum import Enum
from haystack import Pipeline, super_component
from haystack.components.converters import (
CSVToDocument,
DOCXToDocument,
HTMLToDocument,
JSONConverter,
PPTXToDocument,
PyPDFToDocument,
TextFileToDocument,
XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.routers import FileTypeRouter
class ConverterMimeType(str, Enum):
CSV = "text/csv"
DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
HTML = "text/html"
JSON = "application/json"
MD = "text/markdown"
TEXT = "text/plain"
PDF = "application/pdf"
PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
@super_component
class MultiFileConverter:
"""
A file converter that handles conversion of multiple file types.
The MultiFileConverter handles the following file types:
- CSV
- DOCX
- HTML
- JSON
- MD
- TEXT
- PDF (no OCR)
- PPTX
- XLSX
Usage example:
```python
from haystack.components.converters import MultiFileConverter
converter = MultiFileConverter()
converter.run(sources=["test.txt", "test.pdf"], meta={})
```
"""
def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
"""
Initialize the MultiFileConverter.
:param encoding: The encoding to use when reading files.
:param json_content_key: The key whose value is used as the document content when converting JSON files.
"""
self.encoding = encoding
self.json_content_key = json_content_key
# initialize components
router = FileTypeRouter(
mime_types=[mime_type.value for mime_type in ConverterMimeType],
# Ensure common extensions are registered. Tests on Windows fail otherwise.
additional_mimetypes={
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
},
)
# Create pipeline and add components
pp = Pipeline()
# We use type ignore here to avoid type checking errors
# This is due to how the run method within the Component protocol is defined
pp.add_component("router", router) # type: ignore[arg-type]
pp.add_component("docx", DOCXToDocument(link_format="markdown")) # type: ignore[arg-type]
pp.add_component(
"html",
HTMLToDocument( # type: ignore[arg-type]
extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
),
)
pp.add_component("json", JSONConverter(content_key=self.json_content_key)) # type: ignore[arg-type]
pp.add_component("md", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("text", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("pdf", PyPDFToDocument()) # type: ignore[arg-type]
pp.add_component("pptx", PPTXToDocument()) # type: ignore[arg-type]
pp.add_component("xlsx", XLSXToDocument()) # type: ignore[arg-type]
pp.add_component("joiner", DocumentJoiner()) # type: ignore[arg-type]
pp.add_component("csv", CSVToDocument(encoding=self.encoding)) # type: ignore[arg-type]
for mime_type in ConverterMimeType:
pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])
pp.connect("docx.documents", "joiner.documents")
pp.connect("html.documents", "joiner.documents")
pp.connect("json.documents", "joiner.documents")
pp.connect("md.documents", "joiner.documents")
pp.connect("text.documents", "joiner.documents")
pp.connect("pdf.documents", "joiner.documents")
pp.connect("pptx.documents", "joiner.documents")
pp.connect("csv.documents", "joiner.documents")
pp.connect("xlsx.documents", "joiner.documents")
self.pipeline = pp
self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}
self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}

View File

@@ -11,6 +11,7 @@ _import_structure = {
"csv_document_cleaner": ["CSVDocumentCleaner"],
"csv_document_splitter": ["CSVDocumentSplitter"],
"document_cleaner": ["DocumentCleaner"],
"document_preprocessor": ["DocumentPreprocessor"],
"document_splitter": ["DocumentSplitter"],
"hierarchical_document_splitter": ["HierarchicalDocumentSplitter"],
"recursive_splitter": ["RecursiveDocumentSplitter"],
@@ -21,6 +22,7 @@ if TYPE_CHECKING:
from .csv_document_cleaner import CSVDocumentCleaner
from .csv_document_splitter import CSVDocumentSplitter
from .document_cleaner import DocumentCleaner
from .document_preprocessor import DocumentPreprocessor
from .document_splitter import DocumentSplitter
from .hierarchical_document_splitter import HierarchicalDocumentSplitter
from .recursive_splitter import RecursiveDocumentSplitter

View File

@@ -0,0 +1,192 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Callable, Dict, List, Literal, Optional
from haystack import Pipeline, default_from_dict, default_to_dict, super_component
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.utils import deserialize_callable, serialize_callable
@super_component
class DocumentPreprocessor:
"""
A SuperComponent that first splits and then cleans documents.
This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
It takes a list of documents as input and returns a processed list of documents.
Usage example:
```python
from haystack import Document
from haystack.components.preprocessors import DocumentPreprocessor

doc = Document(content="I love pizza!")
preprocessor = DocumentPreprocessor()
result = preprocessor.run(documents=[doc])
print(result["documents"])
```
"""
def __init__( # noqa: PLR0913 (too-many-arguments)
self,
*,
# --- DocumentSplitter arguments ---
split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
split_length: int = 250,
split_overlap: int = 0,
split_threshold: int = 0,
splitting_function: Optional[Callable[[str], List[str]]] = None,
respect_sentence_boundary: bool = False,
language: Language = "en",
use_split_rules: bool = True,
extend_abbreviations: bool = True,
# --- DocumentCleaner arguments ---
remove_empty_lines: bool = True,
remove_extra_whitespaces: bool = True,
remove_repeated_substrings: bool = False,
keep_id: bool = False,
remove_substrings: Optional[List[str]] = None,
remove_regex: Optional[str] = None,
unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
ascii_only: bool = False,
) -> None:
"""
Initialize a DocumentPreprocessor that first splits and then cleans documents.
**Splitter Parameters**:
:param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
:param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
:param split_overlap: The number of overlapping units between consecutive splits.
:param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
with the previous split.
:param splitting_function: A custom function for splitting if `split_by="function"`.
:param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
:param language: Language used by the sentence tokenizer if `split_by="sentence"` or
`respect_sentence_boundary=True`.
:param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
:param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
languages.
**Cleaner Parameters**:
:param remove_empty_lines: If `True`, removes empty lines.
:param remove_extra_whitespaces: If `True`, removes extra whitespaces.
:param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
:param keep_id: If `True`, keeps the original document IDs.
:param remove_substrings: A list of strings to remove from the document content.
:param remove_regex: A regex pattern whose matches will be removed from the document content.
:param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
:param ascii_only: If `True`, converts text to ASCII only.
"""
# Store arguments for serialization
self.remove_empty_lines = remove_empty_lines
self.remove_extra_whitespaces = remove_extra_whitespaces
self.remove_repeated_substrings = remove_repeated_substrings
self.keep_id = keep_id
self.remove_substrings = remove_substrings
self.remove_regex = remove_regex
self.unicode_normalization = unicode_normalization
self.ascii_only = ascii_only
self.split_by = split_by
self.split_length = split_length
self.split_overlap = split_overlap
self.split_threshold = split_threshold
self.splitting_function = splitting_function
self.respect_sentence_boundary = respect_sentence_boundary
self.language = language
self.use_split_rules = use_split_rules
self.extend_abbreviations = extend_abbreviations
# Instantiate sub-components
splitter = DocumentSplitter(
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
split_threshold=self.split_threshold,
splitting_function=self.splitting_function,
respect_sentence_boundary=self.respect_sentence_boundary,
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
)
cleaner = DocumentCleaner(
remove_empty_lines=self.remove_empty_lines,
remove_extra_whitespaces=self.remove_extra_whitespaces,
remove_repeated_substrings=self.remove_repeated_substrings,
keep_id=self.keep_id,
remove_substrings=self.remove_substrings,
remove_regex=self.remove_regex,
unicode_normalization=self.unicode_normalization,
ascii_only=self.ascii_only,
)
# Build the Pipeline
pp = Pipeline()
# We use type ignore here to avoid type checking errors
# This is due to how the run method within the Component protocol is defined
pp.add_component("splitter", splitter) # type: ignore[arg-type]
pp.add_component("cleaner", cleaner) # type: ignore[arg-type]
# Connect the splitter output to cleaner
pp.connect("splitter.documents", "cleaner.documents")
self.pipeline = pp
# Define how pipeline inputs/outputs map to sub-component inputs/outputs
self.input_mapping = {
# The pipeline input "documents" feeds into "splitter.documents"
"documents": ["splitter.documents"]
}
# The pipeline output "documents" comes from "cleaner.documents"
self.output_mapping = {"cleaner.documents": "documents"}
def to_dict(self) -> Dict[str, Any]:
"""
Serialize SuperComponent to a dictionary.
:return:
Dictionary with serialized data.
"""
splitting_function = None
if self.splitting_function is not None:
splitting_function = serialize_callable(self.splitting_function)
return default_to_dict(
self,
remove_empty_lines=self.remove_empty_lines,
remove_extra_whitespaces=self.remove_extra_whitespaces,
remove_repeated_substrings=self.remove_repeated_substrings,
keep_id=self.keep_id,
remove_substrings=self.remove_substrings,
remove_regex=self.remove_regex,
unicode_normalization=self.unicode_normalization,
ascii_only=self.ascii_only,
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
split_threshold=self.split_threshold,
splitting_function=splitting_function,
respect_sentence_boundary=self.respect_sentence_boundary,
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor":
"""
Deserializes the SuperComponent from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized SuperComponent.
"""
if "splitting_function" in data["init_parameters"]:
data["init_parameters"]["splitting_function"] = deserialize_callable(
data["init_parameters"]["splitting_function"]
)
return default_from_dict(cls, data)
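A sketch of how the split-then-clean pipeline above can be driven with a custom splitting function, mirroring the `split_by="function"` path in the component's init parameters. The `split_on_periods` helper is illustrative, not part of this change.

```python
from haystack import Document
from haystack.components.preprocessors import DocumentPreprocessor


def split_on_periods(text: str) -> list[str]:
    # Illustrative splitter: one chunk per period-delimited segment.
    return [part for part in text.split(".") if part.strip()]


preprocessor = DocumentPreprocessor(split_by="function", splitting_function=split_on_periods, split_length=1)
preprocessor.warm_up()

result = preprocessor.run(documents=[Document(content="First sentence. Second sentence. Third sentence.")])
print([doc.content for doc in result["documents"]])  # three cleaned, single-sentence documents
```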

View File

@@ -0,0 +1,9 @@
---
highlights: >
Two ready-made SuperComponents simplify document preprocessing: MultiFileConverter and DocumentPreprocessor.
For example, if you have the extra dependencies for file conversion installed, you can run the following:
```python
from haystack.components.converters import MultiFileConverter
converter = MultiFileConverter()
converter.run(sources=["test.txt", "test.pdf"], meta={})
```
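The two components are also meant to be chained inside a regular indexing pipeline. A minimal sketch follows; the `DocumentWriter` and `InMemoryDocumentStore` used as the sink are standard Haystack components outside this change, and the file paths are placeholders.

```python
from haystack import Pipeline
from haystack.components.converters import MultiFileConverter
from haystack.components.preprocessors import DocumentPreprocessor
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter())
pipeline.add_component("preprocessor", DocumentPreprocessor())
pipeline.add_component("writer", DocumentWriter(document_store=document_store))
pipeline.connect("converter.documents", "preprocessor.documents")
pipeline.connect("preprocessor.documents", "writer.documents")

pipeline.run(data={"converter": {"sources": ["report.pdf", "notes.md"]}})
print(document_store.count_documents())
```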

View File

@@ -0,0 +1,145 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from haystack import Document, Pipeline
from haystack.core.pipeline.base import component_to_dict, component_from_dict
from haystack.core.component.component import Component
from haystack.dataclasses import ByteStream
from haystack.components.converters.multi_file_converter import MultiFileConverter
@pytest.fixture
def converter():
converter = MultiFileConverter()
converter.warm_up()
return converter
class TestMultiFileConverter:
def test_init_default_params(self, converter):
"""Test initialization with default parameters"""
assert converter.encoding == "utf-8"
assert converter.json_content_key == "content"
assert isinstance(converter, Component)
def test_init_custom_params(self, converter):
"""Test initialization with custom parameters"""
converter = MultiFileConverter(encoding="latin-1", json_content_key="text")
assert converter.encoding == "latin-1"
assert converter.json_content_key == "text"
def test_to_dict(self, converter):
"""Test serialization to dictionary"""
data = component_to_dict(converter, "converter")
assert data == {
"type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
"init_parameters": {"encoding": "utf-8", "json_content_key": "content"},
}
def test_from_dict(self):
"""Test deserialization from dictionary"""
data = {
"type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
"init_parameters": {"encoding": "latin-1", "json_content_key": "text"},
}
conv = component_from_dict(MultiFileConverter, data, "converter")
assert conv.encoding == "latin-1"
assert conv.json_content_key == "text"
@pytest.mark.parametrize(
"suffix,file_path",
[
("csv", "csv/sample_1.csv"),
("docx", "docx/sample_docx.docx"),
("html", "html/what_is_haystack.html"),
("json", "json/json_conversion_testfile.json"),
("md", "markdown/sample.md"),
("pdf", "pdf/sample_pdf_1.pdf"),
("pptx", "pptx/sample_pptx.pptx"),
("txt", "txt/doc_1.txt"),
("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
],
)
@pytest.mark.integration
def test_run(self, test_files_path, converter, suffix, file_path):
unclassified_bytestream = ByteStream(b"unclassified content")
unclassified_bytestream.meta["content_type"] = "unknown_type"
paths = [test_files_path / file_path, unclassified_bytestream]
output = converter.run(sources=paths)
docs = output["documents"]
unclassified = output["unclassified"]
assert len(docs) == 1
assert isinstance(docs[0], Document)
assert docs[0].content is not None
assert docs[0].meta["file_path"].endswith(suffix)
assert len(unclassified) == 1
assert isinstance(unclassified[0], ByteStream)
assert unclassified[0].meta["content_type"] == "unknown_type"
def test_run_with_meta(self, test_files_path, converter):
"""Test conversion with metadata"""
paths = [test_files_path / "txt" / "doc_1.txt"]
meta = {"language": "en", "author": "test"}
output = converter.run(sources=paths, meta=meta)
docs = output["documents"]
assert docs[0].meta["language"] == "en"
assert docs[0].meta["author"] == "test"
def test_run_with_bytestream(self, test_files_path, converter):
"""Test converting ByteStream input"""
bytestream = ByteStream(data=b"test content", mime_type="text/plain", meta={"file_path": "test.txt"})
output = converter.run(sources=[bytestream])
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content == "test content"
assert docs[0].meta["file_path"] == "test.txt"
def test_run_error_handling(self, test_files_path, converter, caplog):
"""Test error handling for non-existent files"""
paths = [test_files_path / "non_existent.txt"]
with caplog.at_level("WARNING"):
output = converter.run(sources=paths)
assert "Could not read" in caplog.text
assert len(output["documents"]) == 0
@pytest.mark.integration
def test_run_all_file_types(self, test_files_path, converter):
"""Test converting all supported file types in parallel"""
paths = [
test_files_path / "csv" / "sample_1.csv",
test_files_path / "docx" / "sample_docx.docx",
test_files_path / "html" / "what_is_haystack.html",
test_files_path / "json" / "json_conversion_testfile.json",
test_files_path / "markdown" / "sample.md",
test_files_path / "txt" / "doc_1.txt",
test_files_path / "pdf" / "sample_pdf_1.pdf",
test_files_path / "pptx" / "sample_pptx.pptx",
test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx",
]
output = converter.run(sources=paths)
docs = output["documents"]
# Verify we got a document for each file
assert len(docs) == len(paths)
assert all(isinstance(doc, Document) for doc in docs)
@pytest.mark.integration
def test_run_in_pipeline(self, test_files_path, converter):
pipeline = Pipeline(max_runs_per_component=1)
pipeline.add_component("converter", converter)
paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "pdf" / "sample_pdf_1.pdf"]
output = pipeline.run(data={"sources": paths})
docs = output["converter"]["documents"]
assert len(docs) == 2
assert all(isinstance(doc, Document) for doc in docs)
assert all(doc.content is not None for doc in docs)

View File

@@ -0,0 +1,128 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
import pytest
from haystack import Document, Pipeline
from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor
class TestDocumentPreprocessor:
@pytest.fixture
def preprocessor(self) -> DocumentPreprocessor:
return DocumentPreprocessor(
# Cleaner parameters
remove_empty_lines=True,
remove_extra_whitespaces=True,
remove_repeated_substrings=False,
keep_id=True,
# Splitter parameters
split_by="word",
split_length=3,
split_overlap=1,
respect_sentence_boundary=False,
language="en",
)
def test_init(self, preprocessor: DocumentPreprocessor) -> None:
assert isinstance(preprocessor.pipeline, Pipeline)
assert preprocessor.input_mapping == {"documents": ["splitter.documents"]}
assert preprocessor.output_mapping == {"cleaner.documents": "documents"}
cleaner = preprocessor.pipeline.get_component("cleaner")
assert cleaner.remove_empty_lines is True
assert cleaner.remove_extra_whitespaces is True
assert cleaner.remove_repeated_substrings is False
assert cleaner.keep_id is True
splitter = preprocessor.pipeline.get_component("splitter")
assert splitter.split_by == "word"
assert splitter.split_length == 3
assert splitter.split_overlap == 1
assert splitter.respect_sentence_boundary is False
assert splitter.language == "en"
def test_from_dict(self) -> None:
preprocessor = DocumentPreprocessor.from_dict(
{
"init_parameters": {
"remove_empty_lines": True,
"remove_extra_whitespaces": True,
"remove_repeated_substrings": False,
"keep_id": True,
"split_by": "word",
"split_length": 3,
"split_overlap": 1,
"respect_sentence_boundary": False,
"language": "en",
},
"type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
}
)
assert isinstance(preprocessor, DocumentPreprocessor)
def test_to_dict(self, preprocessor: DocumentPreprocessor) -> None:
expected = {
"init_parameters": {
"remove_empty_lines": True,
"remove_extra_whitespaces": True,
"remove_repeated_substrings": False,
"keep_id": True,
"remove_substrings": None,
"remove_regex": None,
"unicode_normalization": None,
"ascii_only": False,
"split_by": "word",
"split_length": 3,
"split_overlap": 1,
"split_threshold": 0,
"splitting_function": None,
"respect_sentence_boundary": False,
"language": "en",
"use_split_rules": True,
"extend_abbreviations": True,
},
"type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
}
assert preprocessor.to_dict() == expected
def test_warm_up(self, preprocessor: DocumentPreprocessor) -> None:
with patch.object(preprocessor.pipeline, "warm_up") as mock_warm_up:
preprocessor.warm_up()
mock_warm_up.assert_called_once()
def test_run(self, preprocessor: DocumentPreprocessor) -> None:
documents = [
Document(content="This is a test document. It has multiple sentences."),
Document(content="Another test document with some content."),
]
preprocessor.warm_up()
result = preprocessor.run(documents=documents)
# Check that we got processed documents back
assert "documents" in result
processed_docs = result["documents"]
assert len(processed_docs) > len(documents) # Should have more docs due to splitting
# Check that the content was cleaned and split
for doc in processed_docs:
assert doc.content.strip() == doc.content
assert len(doc.content.split()) <= 3 # Split length of 3 words
assert doc.id is not None
def test_run_with_custom_splitting_function(self) -> None:
def custom_split(text: str) -> list[str]:
return [t for t in text.split(".") if t.strip() != ""]
preprocessor = DocumentPreprocessor(split_by="function", splitting_function=custom_split, split_length=1)
documents = [Document(content="First sentence. Second sentence. Third sentence.")]
preprocessor.warm_up()
result = preprocessor.run(documents=documents)
processed_docs = result["documents"]
assert len(processed_docs) == 3 # Should be split into 3 sentences
assert all("." not in doc.content for doc in processed_docs) # Each doc should be a single sentence

View File

@@ -0,0 +1 @@
{"content": "Content from a json file"}