# haystack/e2e/pipelines/test_preprocessing_pipeline.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack import Pipeline
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter, MetadataRouter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore


def test_preprocessing_pipeline(tmp_path):
    """
    End-to-end test of a preprocessing pipeline that is round-tripped through YAML.

    The pipeline chains FileTypeRouter -> TextFileToDocument -> DocumentLanguageClassifier
    -> MetadataRouter -> DocumentCleaner -> DocumentSplitter -> SentenceTransformersDocumentEmbedder
    -> DocumentWriter. It is dumped to a YAML file, loaded back, and the *reloaded* pipeline is
    the one that gets run. The test then expects exactly the six English sentences to end up in
    the document store, each tagged with ``language == "en"``.

    :param tmp_path: directory in which the YAML dump and the input text files are created.
    """
    # Outcome the run is expected to produce; declared first so the count can be derived from it.
    expected_texts = [
        "This is an english sentence.",
        " There is more to it.",
        " It's a long text.",
        "Spans multiple lines.",
        "Even contains empty lines.",
        " And extra whitespaces.",
    ]
    expected_count = len(expected_texts)

    # Components, listed in the order they are registered on the pipeline.
    store = InMemoryDocumentStore()
    components = [
        ("file_type_router", FileTypeRouter(mime_types=["text/plain"])),
        ("text_file_converter", TextFileToDocument()),
        ("language_classifier", DocumentLanguageClassifier()),
        ("router", MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}})),
        ("cleaner", DocumentCleaner()),
        ("splitter", DocumentSplitter(split_by="period", split_length=1)),
        ("embedder", SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")),
        ("writer", DocumentWriter(document_store=store)),
    ]
    # Output socket -> input socket pairs; only the router's "en" output is wired onwards.
    connections = [
        ("file_type_router.text/plain", "text_file_converter.sources"),
        ("text_file_converter.documents", "language_classifier.documents"),
        ("language_classifier.documents", "router.documents"),
        ("router.en", "cleaner.documents"),
        ("cleaner.documents", "splitter.documents"),
        ("splitter.documents", "embedder.documents"),
        ("embedder.documents", "writer.documents"),
    ]

    pipeline = Pipeline()
    for component_name, component in components:
        pipeline.add_component(instance=component, name=component_name)
    for sender, receiver in connections:
        pipeline.connect(sender, receiver)

    # Round-trip through YAML so that the run below exercises the deserialized pipeline.
    yaml_path = tmp_path / "test_preprocessing_pipeline.yaml"
    with open(yaml_path, "w") as yaml_file:
        pipeline.dump(yaml_file)
    with open(yaml_path, "r") as yaml_file:
        pipeline = Pipeline.load(yaml_file)

    # Input fixtures. The English fragments are joined by implicit string concatenation,
    # so the file content is a single line with no newline characters.
    english_path = tmp_path / "test_file_english.txt"
    english_path.write_text(
        "This is an english sentence. There is more to it. It's a long text."
        "Spans multiple lines."
        ""
        "Even contains empty lines.  And extra whitespaces."
    )
    german_path = tmp_path / "test_file_german.txt"
    german_path.write_text("Ein deutscher Satz ohne Verb.")

    # Two text files plus a .json path that this test never creates on disk.
    sources = [english_path, german_path, tmp_path / "test_preprocessing_pipeline.json"]
    result = pipeline.run({"file_type_router": {"sources": sources}})

    # Inspect the store owned by the reloaded writer component, not the `store` created above.
    assert result["writer"]["documents_written"] == expected_count
    filled_store = pipeline.get_component("writer").document_store
    assert filled_store.count_documents() == expected_count

    # Check the preprocessed texts and the language tag on every stored document.
    stored = filled_store.filter_documents()
    assert expected_texts == [doc.content for doc in stored]
    assert all(doc.meta["language"] == "en" for doc in stored)
chore: add license header to all modules (#7675) * add license header to modules * check license header at linting time 2024-05-09 15:40:36 +02:00			`# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>`
			`#`
			`# SPDX-License-Identifier: Apache-2.0`

Fix all tests 2023-11-24 14:48:43 +01:00			`from haystack import Pipeline`
			`from haystack.components.classifiers import DocumentLanguageClassifier`
chore: add license header to all modules (#7675) * add license header to modules * check license header at linting time 2024-05-09 15:40:36 +02:00			`from haystack.components.converters import TextFileToDocument`
			`from haystack.components.embedders import SentenceTransformersDocumentEmbedder`
			`from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter`
Fix all tests 2023-11-24 14:48:43 +01:00			`from haystack.components.routers import FileTypeRouter, MetadataRouter`
			`from haystack.components.writers import DocumentWriter`
refact!: Remove symbols under the `haystack.document_stores` namespace (#6714) * remove symbols under the haystack.document_stores namespace * Update haystack/document_stores/types/protocol.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * fix * same for retrievers * leftovers * more leftovers * add relnote * leftovers * one more * fix examples --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2024-01-10 21:20:42 +01:00			`from haystack.document_stores.in_memory import InMemoryDocumentStore`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00

			`def test_preprocessing_pipeline(tmp_path):`
			`# Create the pipeline and its components`
			`document_store = InMemoryDocumentStore()`
			`preprocessing_pipeline = Pipeline()`
			`preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router")`
			`preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")`
			`preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")`
test: Add MetadataRouter to preprocessing pipeline in e2e test (#6321) * add MetadataRouter to preprocessing pipeline * replace mimetype check with language check 2023-11-16 11:22:37 +01:00			`preprocessing_pipeline.add_component(`
refactor: Add support for new filters declaration (#6397) * Rework filter logic for InMemoryDocumentStore to support new filters declaration * Fix legacy filters tests * Simplify logic and handle dates comparison * Rework MetadataRouter to support new filters * Update docstrings * Add release notes * Fix linting * Avoid duplicating filters specifications * Handle corner case * Simplify docstring * Fix filters logic and tests * Fix Document Store testing legacy filters tests 2023-11-24 11:22:46 +01:00			`instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"`
test: Add MetadataRouter to preprocessing pipeline in e2e test (#6321) * add MetadataRouter to preprocessing pipeline * replace mimetype check with language check 2023-11-16 11:22:37 +01:00			`)`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00			`preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")`
initial import (#8635) 2024-12-13 12:12:40 +01:00			`preprocessing_pipeline.add_component(instance=DocumentSplitter(split_by="period", split_length=1), name="splitter")`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00			`preprocessing_pipeline.add_component(`
feat!: Rename `model_name` or `model_name_or_path` to `model` in all Embedder classes (#6733) * rename model parameter in the openai doc embedder * fix tests for openai doc embedder * rename model parameter in the openai text embedder * fix tests for openai text embedder * rename model parameter in the st doc embedder * fix tests for st doc embedder * rename model parameter in the st backend * fix tests for st backend * rename model parameter in the st text embedder * fix tests for st text embedder * fix docstring * fix pipeline utils * fix e2e * reno * fix the indexing pipeline _create_embedder function * fix e2e eval rag pipeline * pytest 2024-01-12 15:30:17 +01:00			`instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder"`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00			`)`
			`preprocessing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")`
feat: Add DocumentJoiner component 2.0 (#6105) * draft DocumentJoiner * implement merge and rrf * draft end-to-end test with DocumentJoiner in hybrid doc search pipeline * adjust for variadics Canals PR #122 * fix text_embedder input * adapt to the new Document class * adapt to new doc id * specify documents input as Variadic in run method * compare doc ids instead of full docs * rename text_file_converter input to sources * update docstring * Update haystack/preview/components/routers/document_joiner.py Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * Apply suggestions from docstring review Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * capitalize Documents and Retrievers in docstrings * fix log message in test --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> 2023-11-20 10:56:56 +01:00			`preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00			`preprocessing_pipeline.connect("text_file_converter.documents", "language_classifier.documents")`
test: Add MetadataRouter to preprocessing pipeline in e2e test (#6321) * add MetadataRouter to preprocessing pipeline * replace mimetype check with language check 2023-11-16 11:22:37 +01:00			`preprocessing_pipeline.connect("language_classifier.documents", "router.documents")`
			`preprocessing_pipeline.connect("router.en", "cleaner.documents")`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00			`preprocessing_pipeline.connect("cleaner.documents", "splitter.documents")`
			`preprocessing_pipeline.connect("splitter.documents", "embedder.documents")`
			`preprocessing_pipeline.connect("embedder.documents", "writer.documents")`

test: Update E2E tests to use `Pipeline.dump/load` (#6756) 2024-01-17 15:09:27 +01:00			`# Serialize the pipeline to YAML`
			`with open(tmp_path / "test_preprocessing_pipeline.yaml", "w") as f:`
			`preprocessing_pipeline.dump(f)`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00
			`# Load the pipeline back`
test: Update E2E tests to use `Pipeline.dump/load` (#6756) 2024-01-17 15:09:27 +01:00			`with open(tmp_path / "test_preprocessing_pipeline.yaml", "r") as f:`
			`preprocessing_pipeline = Pipeline.load(f)`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00
			`# Write a txt file`
			`with open(tmp_path / "test_file_english.txt", "w") as f:`
			`f.write(`
			`"This is an english sentence. There is more to it. It's a long text."`
			`"Spans multiple lines."`
			`""`
			`"Even contains empty lines. And extra whitespaces."`
			`)`

			`# Write a txt file`
			`with open(tmp_path / "test_file_german.txt", "w") as f:`
			`f.write("Ein deutscher Satz ohne Verb.")`

			`# Add two txt files and one non-txt file`
			`paths = [`
			`tmp_path / "test_file_english.txt",`
			`tmp_path / "test_file_german.txt",`
			`tmp_path / "test_preprocessing_pipeline.json",`
			`]`

			`result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}})`

			`assert result["writer"]["documents_written"] == 6`
			`filled_document_store = preprocessing_pipeline.get_component("writer").document_store`
			`assert filled_document_store.count_documents() == 6`

feat: Add DocumentJoiner component 2.0 (#6105) * draft DocumentJoiner * implement merge and rrf * draft end-to-end test with DocumentJoiner in hybrid doc search pipeline * adjust for variadics Canals PR #122 * fix text_embedder input * adapt to the new Document class * adapt to new doc id * specify documents input as Variadic in run method * compare doc ids instead of full docs * rename text_file_converter input to sources * update docstring * Update haystack/preview/components/routers/document_joiner.py Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * Apply suggestions from docstring review Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * capitalize Documents and Retrievers in docstrings * fix log message in test --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> 2023-11-20 10:56:56 +01:00			`# Check preprocessed texts`
feat: Add DocumentLanguageClassifier 2.0 (#6037) * add DocumentLanguageClassifier and tests * reno * fix import, rename DocumentCleaner * mark example usage as python code * add assertions to e2e test * use deserialized document_store * Apply suggestions from code review Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * remove from/to_dict * use renamed InMemoryDocumentStore * adapt to Document refactoring * improve docstring * fix test for new Document --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> 2023-10-31 15:35:05 +01:00			`stored_documents = filled_document_store.filter_documents()`
			`expected_texts = [`
			`"This is an english sentence.",`
			`" There is more to it.",`
			`" It's a long text.",`
			`"Spans multiple lines.",`
			`"Even contains empty lines.",`
			`" And extra whitespaces.",`
			`]`
			`assert expected_texts == [document.content for document in stored_documents]`
test: Add MetadataRouter to preprocessing pipeline in e2e test (#6321) * add MetadataRouter to preprocessing pipeline * replace mimetype check with language check 2023-11-16 11:22:37 +01:00			`assert all(document.meta["language"] == "en" for document in stored_documents)`