# haystack/e2e/preview/pipelines/test_preprocessing_pipeline.py

import json

from haystack.preview import Pipeline
from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.preview.components.file_converters import TextFileToDocument
from haystack.preview.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.preview.components.classifiers import DocumentLanguageClassifier
from haystack.preview.components.routers import FileTypeRouter, MetadataRouter
from haystack.preview.components.writers import DocumentWriter
from haystack.preview.document_stores import InMemoryDocumentStore


def test_preprocessing_pipeline(tmp_path):
    # Create the pipeline and its components
    document_store = InMemoryDocumentStore()
    preprocessing_pipeline = Pipeline()
    preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router")
    preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
    preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
    preprocessing_pipeline.add_component(
        instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router"
    )
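    # The MetadataRouter rules map an output name to a metadata filter: documents whose
    # meta "language" equals "en" are emitted on the "en" output; everything else leaves
    # on the router's unmatched output and is dropped from this pipeline.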
    preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
    preprocessing_pipeline.add_component(
        instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter"
    )
    preprocessing_pipeline.add_component(
        instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
        name="embedder",
    )
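    # Note: the embedder loads the all-MiniLM-L6-v2 sentence-transformers model
    # (downloading it on first use), which is why this test lives in the e2e suite.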
    preprocessing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
    preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
    preprocessing_pipeline.connect("text_file_converter.documents", "language_classifier.documents")
    preprocessing_pipeline.connect("language_classifier.documents", "router.documents")
    preprocessing_pipeline.connect("router.en", "cleaner.documents")
    preprocessing_pipeline.connect("cleaner.documents", "splitter.documents")
    preprocessing_pipeline.connect("splitter.documents", "embedder.documents")
    preprocessing_pipeline.connect("embedder.documents", "writer.documents")

    # Draw the pipeline
    preprocessing_pipeline.draw(tmp_path / "test_preprocessing_pipeline.png")

    # Serialize the pipeline to JSON
    with open(tmp_path / "test_preprocessing_pipeline.json", "w") as f:
        print(json.dumps(preprocessing_pipeline.to_dict(), indent=4))
        json.dump(preprocessing_pipeline.to_dict(), f)

    # Load the pipeline back
    with open(tmp_path / "test_preprocessing_pipeline.json", "r") as f:
        preprocessing_pipeline = Pipeline.from_dict(json.load(f))
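    # From here on, the test runs against the deserialized copy, so the
    # to_dict/from_dict round trip is exercised end to end, not just asserted on.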

    # Write an English txt file
    with open(tmp_path / "test_file_english.txt", "w") as f:
        f.write(
            "This is an english sentence. There is more to it. It's a long text."
            "Spans multiple lines."
            ""
            "Even contains empty lines. And extra whitespaces."
        )
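    # Note: adjacent string literals are concatenated with no separator, so this writes
    # a single line; during the run, the sentence splitter cuts it on "." into the six
    # chunks listed in expected_texts below.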

    # Write a German txt file
    with open(tmp_path / "test_file_german.txt", "w") as f:
        f.write("Ein deutscher Satz ohne Verb.")

    # Add two txt files and one non-txt file
    paths = [
        tmp_path / "test_file_english.txt",
        tmp_path / "test_file_german.txt",
        tmp_path / "test_preprocessing_pipeline.json",
    ]

    result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}})
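    # Six documents are expected: the JSON file never passes the text/plain router, the
    # German document is filtered out by the MetadataRouter, and the English file splits
    # into six sentences.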
    assert result["writer"]["documents_written"] == 6
    filled_document_store = preprocessing_pipeline.get_component("writer").document_store
    assert filled_document_store.count_documents() == 6

    # Check preprocessed texts
    stored_documents = filled_document_store.filter_documents()
    expected_texts = [
        "This is an english sentence.",
        " There is more to it.",
        " It's a long text.",
        "Spans multiple lines.",
        "Even contains empty lines.",
        " And extra whitespaces.",
    ]
    assert expected_texts == [document.content for document in stored_documents]
    assert all(document.meta["language"] == "en" for document in stored_documents)