diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml
index 487ac5ea2..c9539038c 100644
--- a/docs/pydoc/config/converters_api.yml
+++ b/docs/pydoc/config/converters_api.yml
@@ -10,6 +10,7 @@ loaders:
         "json",
         "markdown",
         "msg",
+        "multi_file_converter",
         "openapi_functions",
         "output_adapter",
         "pdfminer",
diff --git a/docs/pydoc/config/preprocessors_api.yml b/docs/pydoc/config/preprocessors_api.yml
index 19710cf53..08a719ae6 100644
--- a/docs/pydoc/config/preprocessors_api.yml
+++ b/docs/pydoc/config/preprocessors_api.yml
@@ -5,6 +5,7 @@ loaders:
         "csv_document_cleaner",
         "csv_document_splitter",
         "document_cleaner",
+        "document_preprocessor",
         "document_splitter",
         "hierarchical_document_splitter",
         "recursive_splitter",
diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
index 6036fa6db..422928f75 100644
--- a/haystack/components/converters/__init__.py
+++ b/haystack/components/converters/__init__.py
@@ -15,6 +15,7 @@ _import_structure = {
     "json": ["JSONConverter"],
     "markdown": ["MarkdownToDocument"],
     "msg": ["MSGToDocument"],
+    "multi_file_converter": ["MultiFileConverter"],
     "openapi_functions": ["OpenAPIServiceToFunctions"],
     "output_adapter": ["OutputAdapter"],
     "pdfminer": ["PDFMinerToDocument"],
@@ -33,6 +34,7 @@ if TYPE_CHECKING:
     from .json import JSONConverter
     from .markdown import MarkdownToDocument
     from .msg import MSGToDocument
+    from .multi_file_converter import MultiFileConverter
     from .openapi_functions import OpenAPIServiceToFunctions
     from .output_adapter import OutputAdapter
     from .pdfminer import PDFMinerToDocument
diff --git a/haystack/components/converters/multi_file_converter.py b/haystack/components/converters/multi_file_converter.py
new file mode 100644
index 000000000..9a13fad8c
--- /dev/null
+++ b/haystack/components/converters/multi_file_converter.py
@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from enum import Enum
+
+from haystack import Pipeline, super_component
+from haystack.components.converters import (
+    CSVToDocument,
+    DOCXToDocument,
+    HTMLToDocument,
+    JSONConverter,
+    PPTXToDocument,
+    PyPDFToDocument,
+    TextFileToDocument,
+    XLSXToDocument,
+)
+from haystack.components.joiners import DocumentJoiner
+from haystack.components.routers import FileTypeRouter
+
+
+class ConverterMimeType(str, Enum):
+    CSV = "text/csv"
+    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    HTML = "text/html"
+    JSON = "application/json"
+    MD = "text/markdown"
+    TEXT = "text/plain"
+    PDF = "application/pdf"
+    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+
+
+@super_component
+class MultiFileConverter:
+    """
+    A file converter that handles conversion of multiple file types.
+
+    The MultiFileConverter handles the following file types:
+    - CSV
+    - DOCX
+    - HTML
+    - JSON
+    - MD
+    - TEXT
+    - PDF (no OCR)
+    - PPTX
+    - XLSX
+
+    Usage example:
+    ```python
+    from haystack.components.converters import MultiFileConverter
+
+    converter = MultiFileConverter()
+    converter.run(sources=["test.txt", "test.pdf"], meta={})
+    ```
+    """
+
+    def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
+        """
+        Initialize the MultiFileConverter.
+
+        :param encoding: The encoding to use when reading files.
+        :param json_content_key: The JSON key whose value is used as a document's content when converting JSON files.
+        """
+        self.encoding = encoding
+        self.json_content_key = json_content_key
+
+        # initialize components
+        router = FileTypeRouter(
+            mime_types=[mime_type.value for mime_type in ConverterMimeType],
+            # Ensure common extensions are registered. Tests on Windows fail otherwise.
+            additional_mimetypes={
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+            },
+        )
+
+        # Create pipeline and add components
+        pp = Pipeline()
+
+        # We use type ignore here to avoid type checking errors
+        # This is due to how the run method within the Component protocol is defined
+        pp.add_component("router", router)  # type: ignore[arg-type]
+        pp.add_component("docx", DOCXToDocument(link_format="markdown"))  # type: ignore[arg-type]
+        pp.add_component(
+            "html",
+            HTMLToDocument(  # type: ignore[arg-type]
+                extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
+            ),
+        )
+        pp.add_component("json", JSONConverter(content_key=self.json_content_key))  # type: ignore[arg-type]
+        pp.add_component("md", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
+        pp.add_component("text", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
+        pp.add_component("pdf", PyPDFToDocument())  # type: ignore[arg-type]
+        pp.add_component("pptx", PPTXToDocument())  # type: ignore[arg-type]
+        pp.add_component("xlsx", XLSXToDocument())  # type: ignore[arg-type]
+        pp.add_component("joiner", DocumentJoiner())  # type: ignore[arg-type]
+        pp.add_component("csv", CSVToDocument(encoding=self.encoding))  # type: ignore[arg-type]
+
+        for mime_type in ConverterMimeType:
+            pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])
+
+        pp.connect("docx.documents", "joiner.documents")
+        pp.connect("html.documents", "joiner.documents")
+        pp.connect("json.documents", "joiner.documents")
+        pp.connect("md.documents", "joiner.documents")
+        pp.connect("text.documents", "joiner.documents")
+        pp.connect("pdf.documents", "joiner.documents")
+        pp.connect("pptx.documents", "joiner.documents")
+
+        pp.connect("csv.documents", "joiner.documents")
+        pp.connect("xlsx.documents", "joiner.documents")
+
+        self.pipeline = pp
+        self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}
+        self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}
diff --git a/haystack/components/preprocessors/__init__.py b/haystack/components/preprocessors/__init__.py
index cf5a95db8..cdbdf4a7a 100644
--- a/haystack/components/preprocessors/__init__.py
+++ b/haystack/components/preprocessors/__init__.py
@@ -11,6 +11,7 @@ _import_structure = {
     "csv_document_cleaner": ["CSVDocumentCleaner"],
     "csv_document_splitter": ["CSVDocumentSplitter"],
     "document_cleaner": ["DocumentCleaner"],
+    "document_preprocessor": ["DocumentPreprocessor"],
     "document_splitter": ["DocumentSplitter"],
     "hierarchical_document_splitter": ["HierarchicalDocumentSplitter"],
     "recursive_splitter": ["RecursiveDocumentSplitter"],
@@ -21,6 +22,7 @@ if TYPE_CHECKING:
    from .csv_document_cleaner import CSVDocumentCleaner
    from .csv_document_splitter import CSVDocumentSplitter
    from .document_cleaner import DocumentCleaner
+   from .document_preprocessor import DocumentPreprocessor
    from .document_splitter import DocumentSplitter
   from .hierarchical_document_splitter import HierarchicalDocumentSplitter
   from .recursive_splitter import RecursiveDocumentSplitter
diff --git a/haystack/components/preprocessors/document_preprocessor.py b/haystack/components/preprocessors/document_preprocessor.py
new file mode 100644
index 000000000..75d8a54d2
--- /dev/null
+++ b/haystack/components/preprocessors/document_preprocessor.py
@@ -0,0 +1,192 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Dict, List, Literal, Optional
+
+from haystack import Pipeline, default_from_dict, default_to_dict, super_component
+from haystack.components.preprocessors.document_cleaner import DocumentCleaner
+from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
+from haystack.utils import deserialize_callable, serialize_callable
+
+
+@super_component
+class DocumentPreprocessor:
+    """
+    A SuperComponent that first splits and then cleans documents.
+
+    This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
+    It takes a list of documents as input and returns a processed list of documents.
+
+    Usage example:
+    ```python
+    from haystack import Document
+    doc = Document(content="I love pizza!")
+    preprocessor = DocumentPreprocessor()
+    result = preprocessor.run(documents=[doc])
+    print(result["documents"])
+    ```
+    """
+
+    def __init__(  # noqa: PLR0913 (too-many-arguments)
+        self,
+        *,
+        # --- DocumentSplitter arguments ---
+        split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
+        split_length: int = 250,
+        split_overlap: int = 0,
+        split_threshold: int = 0,
+        splitting_function: Optional[Callable[[str], List[str]]] = None,
+        respect_sentence_boundary: bool = False,
+        language: Language = "en",
+        use_split_rules: bool = True,
+        extend_abbreviations: bool = True,
+        # --- DocumentCleaner arguments ---
+        remove_empty_lines: bool = True,
+        remove_extra_whitespaces: bool = True,
+        remove_repeated_substrings: bool = False,
+        keep_id: bool = False,
+        remove_substrings: Optional[List[str]] = None,
+        remove_regex: Optional[str] = None,
+        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
+        ascii_only: bool = False,
+    ) -> None:
+        """
+        Initialize a DocumentPreprocessor that first splits and then cleans documents.
+
+        **Splitter Parameters**:
+        :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
+        :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
+        :param split_overlap: The number of overlapping units between consecutive splits.
+        :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
+            with the previous split.
+        :param splitting_function: A custom function for splitting if `split_by="function"`.
+        :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
+        :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
+            `respect_sentence_boundary=True`.
+        :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
+        :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
+            languages.
+
+        **Cleaner Parameters**:
+        :param remove_empty_lines: If `True`, removes empty lines.
+        :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
+        :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
+        :param keep_id: If `True`, keeps the original document IDs.
+        :param remove_substrings: A list of strings to remove from the document content.
+        :param remove_regex: A regex pattern whose matches will be removed from the document content.
+        :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
+        :param ascii_only: If `True`, converts text to ASCII only.
+        """
+        # Store arguments for serialization
+        self.remove_empty_lines = remove_empty_lines
+        self.remove_extra_whitespaces = remove_extra_whitespaces
+        self.remove_repeated_substrings = remove_repeated_substrings
+        self.keep_id = keep_id
+        self.remove_substrings = remove_substrings
+        self.remove_regex = remove_regex
+        self.unicode_normalization = unicode_normalization
+        self.ascii_only = ascii_only
+
+        self.split_by = split_by
+        self.split_length = split_length
+        self.split_overlap = split_overlap
+        self.split_threshold = split_threshold
+        self.splitting_function = splitting_function
+        self.respect_sentence_boundary = respect_sentence_boundary
+        self.language = language
+        self.use_split_rules = use_split_rules
+        self.extend_abbreviations = extend_abbreviations
+
+        # Instantiate sub-components
+        splitter = DocumentSplitter(
+            split_by=self.split_by,
+            split_length=self.split_length,
+            split_overlap=self.split_overlap,
+            split_threshold=self.split_threshold,
+            splitting_function=self.splitting_function,
+            respect_sentence_boundary=self.respect_sentence_boundary,
+            language=self.language,
+            use_split_rules=self.use_split_rules,
+            extend_abbreviations=self.extend_abbreviations,
+        )
+
+        cleaner = DocumentCleaner(
+            remove_empty_lines=self.remove_empty_lines,
+            remove_extra_whitespaces=self.remove_extra_whitespaces,
+            remove_repeated_substrings=self.remove_repeated_substrings,
+            keep_id=self.keep_id,
+            remove_substrings=self.remove_substrings,
+            remove_regex=self.remove_regex,
+            unicode_normalization=self.unicode_normalization,
+            ascii_only=self.ascii_only,
+        )
+
+        # Build the Pipeline
+        pp = Pipeline()
+
+        # We use type ignore here to avoid type checking errors
+        # This is due to how the run method within the Component protocol is defined
+        pp.add_component("splitter", splitter)  # type: ignore[arg-type]
+        pp.add_component("cleaner", cleaner)  # type: ignore[arg-type]
+
+        # Connect the splitter output to cleaner
+        pp.connect("splitter.documents", "cleaner.documents")
+        self.pipeline = pp
+
+        # Define how pipeline inputs/outputs map to sub-component inputs/outputs
+        self.input_mapping = {
+            # The pipeline input "documents" feeds into "splitter.documents"
+            "documents": ["splitter.documents"]
+        }
+        # The pipeline output "documents" comes from "cleaner.documents"
+        self.output_mapping = {"cleaner.documents": "documents"}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize SuperComponent to a dictionary.
+
+        :return:
+            Dictionary with serialized data.
+ """ + splitting_function = None + if self.splitting_function is not None: + splitting_function = serialize_callable(self.splitting_function) + + return default_to_dict( + self, + remove_empty_lines=self.remove_empty_lines, + remove_extra_whitespaces=self.remove_extra_whitespaces, + remove_repeated_substrings=self.remove_repeated_substrings, + keep_id=self.keep_id, + remove_substrings=self.remove_substrings, + remove_regex=self.remove_regex, + unicode_normalization=self.unicode_normalization, + ascii_only=self.ascii_only, + split_by=self.split_by, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + splitting_function=splitting_function, + respect_sentence_boundary=self.respect_sentence_boundary, + language=self.language, + use_split_rules=self.use_split_rules, + extend_abbreviations=self.extend_abbreviations, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor": + """ + Deserializes the SuperComponent from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized SuperComponent. + """ + if "splitting_function" in data["init_parameters"]: + data["init_parameters"]["splitting_function"] = deserialize_callable( + data["init_parameters"]["splitting_function"] + ) + + return default_from_dict(cls, data) diff --git a/releasenotes/notes/move-over-supercomponents-24907133d0d90ed6.yaml b/releasenotes/notes/move-over-supercomponents-24907133d0d90ed6.yaml new file mode 100644 index 000000000..d27fbf651 --- /dev/null +++ b/releasenotes/notes/move-over-supercomponents-24907133d0d90ed6.yaml @@ -0,0 +1,9 @@ +--- +highlights: > + Two ready-made SuperComponents simplify document preprocessing: MultiFileConverter, and DocumentPreProcessor. 
+ For example, if you have all extra dependencies for file conversion installed, you can run the following: + ```python + from haystack.super_components.converters import MultiFileConverter + converter = MultiFileConverter() + converter.run(sources=["test.txt", "test.pdf"], meta={}) + ``` diff --git a/test/components/converters/test_multi_file_converter.py b/test/components/converters/test_multi_file_converter.py new file mode 100644 index 000000000..2559ff958 --- /dev/null +++ b/test/components/converters/test_multi_file_converter.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from haystack import Document, Pipeline +from haystack.core.pipeline.base import component_to_dict, component_from_dict +from haystack.core.component.component import Component +from haystack.dataclasses import ByteStream +from haystack.components.converters.multi_file_converter import MultiFileConverter + + +@pytest.fixture +def converter(): + converter = MultiFileConverter() + converter.warm_up() + return converter + + +class TestMultiFileConverter: + def test_init_default_params(self, converter): + """Test initialization with default parameters""" + assert converter.encoding == "utf-8" + assert converter.json_content_key == "content" + assert isinstance(converter, Component) + + def test_init_custom_params(self, converter): + """Test initialization with custom parameters""" + converter = MultiFileConverter(encoding="latin-1", json_content_key="text") + assert converter.encoding == "latin-1" + assert converter.json_content_key == "text" + + def test_to_dict(self, converter): + """Test serialization to dictionary""" + data = component_to_dict(converter, "converter") + assert data == { + "type": "haystack.components.converters.multi_file_converter.MultiFileConverter", + "init_parameters": {"encoding": "utf-8", "json_content_key": "content"}, + } + + def test_from_dict(self): + """Test deserialization from dictionary""" + data = { + "type": "haystack.components.converters.multi_file_converter.MultiFileConverter", + "init_parameters": {"encoding": "latin-1", "json_content_key": "text"}, + } + conv = component_from_dict(MultiFileConverter, data, "converter") + assert conv.encoding == "latin-1" + assert conv.json_content_key == "text" + + @pytest.mark.parametrize( + "suffix,file_path", + [ + ("csv", "csv/sample_1.csv"), + ("docx", "docx/sample_docx.docx"), + ("html", "html/what_is_haystack.html"), + ("json", "json/json_conversion_testfile.json"), + ("md", "markdown/sample.md"), + ("pdf", "pdf/sample_pdf_1.pdf"), + ("pptx", "pptx/sample_pptx.pptx"), + ("txt", "txt/doc_1.txt"), + ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"), + ], + ) + @pytest.mark.integration + def test_run(self, test_files_path, converter, suffix, file_path): + unclassified_bytestream = ByteStream(b"unclassified content") + unclassified_bytestream.meta["content_type"] = "unknown_type" + + paths = [test_files_path / file_path, unclassified_bytestream] + + output = converter.run(sources=paths) + docs = output["documents"] + unclassified = output["unclassified"] + + assert len(docs) == 1 + assert isinstance(docs[0], Document) + assert docs[0].content is not None + assert docs[0].meta["file_path"].endswith(suffix) + + assert len(unclassified) == 1 + assert isinstance(unclassified[0], ByteStream) + assert unclassified[0].meta["content_type"] == "unknown_type" + + def test_run_with_meta(self, test_files_path, converter): + """Test conversion with metadata""" + paths 
+        meta = {"language": "en", "author": "test"}
+        output = converter.run(sources=paths, meta=meta)
+        docs = output["documents"]
+        assert docs[0].meta["language"] == "en"
+        assert docs[0].meta["author"] == "test"
+
+    def test_run_with_bytestream(self, test_files_path, converter):
+        """Test converting ByteStream input"""
+        bytestream = ByteStream(data=b"test content", mime_type="text/plain", meta={"file_path": "test.txt"})
+        output = converter.run(sources=[bytestream])
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert docs[0].content == "test content"
+        assert docs[0].meta["file_path"] == "test.txt"
+
+    def test_run_error_handling(self, test_files_path, converter, caplog):
+        """Test error handling for non-existent files"""
+        paths = [test_files_path / "non_existent.txt"]
+        with caplog.at_level("WARNING"):
+            output = converter.run(sources=paths)
+            assert "Could not read" in caplog.text
+            assert len(output["documents"]) == 0
+
+    @pytest.mark.integration
+    def test_run_all_file_types(self, test_files_path, converter):
+        """Test converting all supported file types in parallel"""
+        paths = [
+            test_files_path / "csv" / "sample_1.csv",
+            test_files_path / "docx" / "sample_docx.docx",
+            test_files_path / "html" / "what_is_haystack.html",
+            test_files_path / "json" / "json_conversion_testfile.json",
+            test_files_path / "markdown" / "sample.md",
+            test_files_path / "txt" / "doc_1.txt",
+            test_files_path / "pdf" / "sample_pdf_1.pdf",
+            test_files_path / "pptx" / "sample_pptx.pptx",
+            test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx",
+        ]
+        output = converter.run(sources=paths)
+        docs = output["documents"]
+
+        # Verify we got a document for each file
+        assert len(docs) == len(paths)
+        assert all(isinstance(doc, Document) for doc in docs)
+
+    @pytest.mark.integration
+    def test_run_in_pipeline(self, test_files_path, converter):
+        pipeline = Pipeline(max_runs_per_component=1)
+        pipeline.add_component("converter", converter)
+
+        paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "pdf" / "sample_pdf_1.pdf"]
+
+        output = pipeline.run(data={"sources": paths})
+        docs = output["converter"]["documents"]
+
+        assert len(docs) == 2
+        assert all(isinstance(doc, Document) for doc in docs)
+        assert all(doc.content is not None for doc in docs)
diff --git a/test/components/preprocessors/test_document_preprocessor.py b/test/components/preprocessors/test_document_preprocessor.py
new file mode 100644
index 000000000..21d6ee5c8
--- /dev/null
+++ b/test/components/preprocessors/test_document_preprocessor.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from unittest.mock import patch
+
+import pytest
+
+from haystack import Document, Pipeline
+from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor
+
+
+class TestDocumentPreprocessor:
+    @pytest.fixture
+    def preprocessor(self) -> DocumentPreprocessor:
+        return DocumentPreprocessor(
+            # Cleaner parameters
+            remove_empty_lines=True,
+            remove_extra_whitespaces=True,
+            remove_repeated_substrings=False,
+            keep_id=True,
+            # Splitter parameters
+            split_by="word",
+            split_length=3,
+            split_overlap=1,
+            respect_sentence_boundary=False,
+            language="en",
+        )
+
+    def test_init(self, preprocessor: DocumentPreprocessor) -> None:
+        assert isinstance(preprocessor.pipeline, Pipeline)
+        assert preprocessor.input_mapping == {"documents": ["splitter.documents"]}
+        assert preprocessor.output_mapping == {"cleaner.documents": "documents"}
{"cleaner.documents": "documents"} + + cleaner = preprocessor.pipeline.get_component("cleaner") + assert cleaner.remove_empty_lines is True + assert cleaner.remove_extra_whitespaces is True + assert cleaner.remove_repeated_substrings is False + assert cleaner.keep_id is True + + splitter = preprocessor.pipeline.get_component("splitter") + assert splitter.split_by == "word" + assert splitter.split_length == 3 + assert splitter.split_overlap == 1 + assert splitter.respect_sentence_boundary is False + assert splitter.language == "en" + + def test_from_dict(self) -> None: + preprocessor = DocumentPreprocessor.from_dict( + { + "init_parameters": { + "remove_empty_lines": True, + "remove_extra_whitespaces": True, + "remove_repeated_substrings": False, + "keep_id": True, + "split_by": "word", + "split_length": 3, + "split_overlap": 1, + "respect_sentence_boundary": False, + "language": "en", + }, + "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor", + } + ) + assert isinstance(preprocessor, DocumentPreprocessor) + + def test_to_dict(self, preprocessor: DocumentPreprocessor) -> None: + expected = { + "init_parameters": { + "remove_empty_lines": True, + "remove_extra_whitespaces": True, + "remove_repeated_substrings": False, + "keep_id": True, + "remove_substrings": None, + "remove_regex": None, + "unicode_normalization": None, + "ascii_only": False, + "split_by": "word", + "split_length": 3, + "split_overlap": 1, + "split_threshold": 0, + "splitting_function": None, + "respect_sentence_boundary": False, + "language": "en", + "use_split_rules": True, + "extend_abbreviations": True, + }, + "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor", + } + assert preprocessor.to_dict() == expected + + def test_warm_up(self, preprocessor: DocumentPreprocessor) -> None: + with patch.object(preprocessor.pipeline, "warm_up") as mock_warm_up: + preprocessor.warm_up() + mock_warm_up.assert_called_once() + + def test_run(self, preprocessor: DocumentPreprocessor) -> None: + documents = [ + Document(content="This is a test document. It has multiple sentences."), + Document(content="Another test document with some content."), + ] + + preprocessor.warm_up() + result = preprocessor.run(documents=documents) + + # Check that we got processed documents back + assert "documents" in result + processed_docs = result["documents"] + assert len(processed_docs) > len(documents) # Should have more docs due to splitting + + # Check that the content was cleaned and split + for doc in processed_docs: + assert doc.content.strip() == doc.content + assert len(doc.content.split()) <= 3 # Split length of 3 words + assert doc.id is not None + + def test_run_with_custom_splitting_function(self) -> None: + def custom_split(text: str) -> list[str]: + return [t for t in text.split(".") if t.strip() != ""] + + preprocessor = DocumentPreprocessor(split_by="function", splitting_function=custom_split, split_length=1) + + documents = [Document(content="First sentence. Second sentence. Third sentence.")] + preprocessor.warm_up() + result = preprocessor.run(documents=documents) + + processed_docs = result["documents"] + assert len(processed_docs) == 3 # Should be split into 3 sentences + assert all("." 
diff --git a/test/test_files/json/json_conversion_testfile.json b/test/test_files/json/json_conversion_testfile.json
new file mode 100644
index 000000000..78f9b2c5b
--- /dev/null
+++ b/test/test_files/json/json_conversion_testfile.json
@@ -0,0 +1 @@
+{"content": "Content from a json file"}
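Taken together, the two new SuperComponents compose cleanly: `MultiFileConverter` emits `documents` that `DocumentPreprocessor` accepts directly. Below is a minimal indexing-pipeline sketch, not part of this PR: it assumes the optional file-conversion dependencies are installed, the listed file paths are placeholders, and it uses the existing `DocumentWriter` and `InMemoryDocumentStore` components purely for illustration.

```python
from haystack import Pipeline
from haystack.components.converters import MultiFileConverter
from haystack.components.preprocessors import DocumentPreprocessor
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Convert mixed file types, split and clean the resulting documents,
# then persist them in an in-memory store.
document_store = InMemoryDocumentStore()

pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter())
pipeline.add_component("preprocessor", DocumentPreprocessor(split_by="word", split_length=250))
pipeline.add_component("writer", DocumentWriter(document_store=document_store))
pipeline.connect("converter.documents", "preprocessor.documents")
pipeline.connect("preprocessor.documents", "writer.documents")

# "sources" resolves to the converter's input, as in test_run_in_pipeline above.
pipeline.run(data={"sources": ["test.txt", "test.pdf"]})  # hypothetical file paths
print(document_store.count_documents())
```

The mappings defined in each SuperComponent's `__init__` are what make this composition possible: `MultiFileConverter` exposes `joiner.documents` as its `documents` output, and `DocumentPreprocessor` maps its `documents` input onto `splitter.documents`.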