Split pipeline tests into three suites (#1755)

* Split pipeline tests into three suites
* Will this trigger the CI?
* Rename duplicate test into test_most_similar_documents_pipeline
* Fixing a bug that was probably never noticed
2026-01-07 12:37:27 +00:00 · 2021-11-15 12:16:27 +01:00 · 2021-11-15 12:16:27 +01:00 · 1a10de506c
commit 1a10de506c
parent 09a462d756
3 changed files with 524 additions and 471 deletions
--- a/test/test_pipeline.py
+++ b/test/test_pipeline.py
@ -1,26 +1,9 @@
 from pathlib import Path

 import os
-import json
-import math
 import pytest

-from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
-from haystack.pipeline import (
-    JoinDocuments,
-    Pipeline,
-    FAQPipeline,
-    DocumentSearchPipeline,
-    RootNode,
-    SklearnQueryClassifier,
-    TransformersQueryClassifier,
-    MostSimilarDocumentsPipeline,
-)
-from haystack.reader import FARMReader
-from haystack.retriever.dense import DensePassageRetriever
-from haystack.retriever.sparse import ElasticsearchRetriever
-from haystack.schema import Document
-
+from haystack.pipelines import Pipeline, RootNode

@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@ -110,141 +93,6 @@ def test_load_tfidfretriever_yaml(tmp_path):
    assert prediction["answers"][0].answer == "haystack"


-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-def test_node_names_validation(document_store_with_docs, tmp_path):
-    pipeline = Pipeline()
-    pipeline.add_node(
-        component=ElasticsearchRetriever(document_store=document_store_with_docs), 
-        name="Retriever", 
-        inputs=["Query"])
-    pipeline.add_node(
-        component=FARMReader(model_name_or_path="deepset/minilm-uncased-squad2"), 
-        name="Reader", 
-        inputs=["Retriever"])
-
-    with pytest.raises(ValueError) as exc_info:
-        pipeline.run(
-            query="Who lives in Berlin?",
-            params={
-                "Reader": {"top_k": 3}, 
-                "non-existing-node": {"top_k": 10}, 
-                "top_k": 5,
-                "non-existing-global_param": "wrong",
-            },
-            debug=True
-        )
-    exception_raised = str(exc_info.value)
-    assert "non-existing-node" in exception_raised
-    assert "non-existing-global_param" in exception_raised
-    assert "Reader" not in exception_raised
-    assert "top_k" not in exception_raised
-
-
-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-def test_debug_attributes_global(document_store_with_docs, tmp_path):
-
-    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
-    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
-
-    pipeline = Pipeline()
-    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
-    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])
-
-    prediction = pipeline.run(
-        query="Who lives in Berlin?",
-        params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}},
-        debug=True
-    )
-    assert "_debug" in prediction.keys()
-    assert "ESRetriever" in prediction["_debug"].keys()
-    assert "Reader" in prediction["_debug"].keys()
-    assert "input" in prediction["_debug"]["ESRetriever"].keys()
-    assert "output" in prediction["_debug"]["ESRetriever"].keys()
-    assert "input" in prediction["_debug"]["Reader"].keys()
-    assert "output" in prediction["_debug"]["Reader"].keys()
-    assert prediction["_debug"]["ESRetriever"]["input"]
-    assert prediction["_debug"]["ESRetriever"]["output"]
-    assert prediction["_debug"]["Reader"]["input"]
-    assert prediction["_debug"]["Reader"]["output"]
-
-    # Avoid circular reference: easiest way to detect those is to use json.dumps
-    json.dumps(prediction, default=str)
-
-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-def test_debug_attributes_per_node(document_store_with_docs, tmp_path):
-
-    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
-    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
-
-    pipeline = Pipeline()
-    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
-    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])
-
-    prediction = pipeline.run(
-        query="Who lives in Berlin?",
-        params={
-            "ESRetriever": {"top_k": 10, "debug": True},
-            "Reader": {"top_k": 3}
-        },
-    )
-    assert "_debug" in prediction.keys()
-    assert "ESRetriever" in prediction["_debug"].keys()
-    assert "Reader" not in prediction["_debug"].keys()
-    assert "input" in prediction["_debug"]["ESRetriever"].keys()
-    assert "output" in prediction["_debug"]["ESRetriever"].keys()
-    assert prediction["_debug"]["ESRetriever"]["input"]
-    assert prediction["_debug"]["ESRetriever"]["output"]
-
-    # Avoid circular reference: easiest way to detect those is to use json.dumps
-    json.dumps(prediction, default=str)
-
-
-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-def test_global_debug_attributes_override_node_ones(document_store_with_docs, tmp_path):
-
-    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
-    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
-
-    pipeline = Pipeline()
-    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
-    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])
-
-    prediction = pipeline.run(
-        query="Who lives in Berlin?",
-        params={
-            "ESRetriever": {"top_k": 10, "debug": True},
-            "Reader": {"top_k": 3, "debug": True}
-        },
-        debug=False
-    )
-    assert "_debug" not in prediction.keys()
-
-    prediction = pipeline.run(
-        query="Who lives in Berlin?",
-        params={
-            "ESRetriever": {"top_k": 10, "debug": False},
-            "Reader": {"top_k": 3, "debug": False}
-        },
-        debug=True
-    )
-    assert "_debug" in prediction.keys()
-    assert "ESRetriever" in prediction["_debug"].keys()
-    assert "Reader" in prediction["_debug"].keys()
-    assert "input" in prediction["_debug"]["ESRetriever"].keys()
-    assert "output" in prediction["_debug"]["ESRetriever"].keys()
-    assert "input" in prediction["_debug"]["Reader"].keys()
-    assert "output" in prediction["_debug"]["Reader"].keys()
-    assert prediction["_debug"]["ESRetriever"]["input"]
-    assert prediction["_debug"]["ESRetriever"]["output"]
-    assert prediction["_debug"]["Reader"]["input"]
-    assert prediction["_debug"]["Reader"]["output"]
-
-
-
 # @pytest.mark.slow
 # @pytest.mark.elasticsearch
 # @pytest.mark.parametrize(
@ -295,181 +143,6 @@ def test_graph_creation(retriever_with_docs, document_store_with_docs):
        )


-def test_invalid_run_args():
-    pipeline = Pipeline.load_from_yaml(
-        Path(__file__).parent/"samples"/"pipeline"/"test_pipeline.yaml", pipeline_name="query_pipeline"
-    )
-    with pytest.raises(Exception) as exc:
-        pipeline.run(params={"ESRetriever": {"top_k": 10}})
-    assert "run() missing 1 required positional argument: 'query'" in str(exc.value)
-
-    with pytest.raises(Exception) as exc:
-        pipeline.run(invalid_query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}})
-    assert "run() got an unexpected keyword argument 'invalid_query'" in str(exc.value)
-
-    with pytest.raises(Exception) as exc:
-        pipeline.run(query="Who made the PDF specification?", params={"ESRetriever": {"invalid": 10}})
-    assert "Invalid parameter 'invalid' for the node 'ESRetriever'" in str(exc.value)
-
-
-@pytest.mark.parametrize(
-    "retriever,document_store",
-    [
-        ("embedding", "memory"),
-        ("embedding", "faiss"),
-        ("embedding", "milvus"),
-        ("embedding", "elasticsearch"),
-    ],
-    indirect=True,
-)
-def test_faq_pipeline(retriever, document_store):
-    documents = [
-        {
-            "content": "How to test module-1?",
-            "meta": {"source": "wiki1", "answer": "Using tests for module-1"},
-        },
-        {
-            "content": "How to test module-2?",
-            "meta": {"source": "wiki2", "answer": "Using tests for module-2"},
-        },
-        {
-            "content": "How to test module-3?",
-            "meta": {"source": "wiki3", "answer": "Using tests for module-3"},
-        },
-        {
-            "content": "How to test module-4?",
-            "meta": {"source": "wiki4", "answer": "Using tests for module-4"},
-        },
-        {
-            "content": "How to test module-5?",
-            "meta": {"source": "wiki5", "answer": "Using tests for module-5"},
-        },
-    ]
-
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    pipeline = FAQPipeline(retriever=retriever)
-
-    output = pipeline.run(query="How to test this?", params={"Retriever": {"top_k": 3}})
-    assert len(output["answers"]) == 3
-    assert output["query"].startswith("How to")
-    assert output["answers"][0].answer.startswith("Using tests")
-
-    if isinstance(document_store, ElasticsearchDocumentStore):
-        output = pipeline.run(query="How to test this?", params={"Retriever": {"filters": {"source": ["wiki2"]}, "top_k": 5}})
-        assert len(output["answers"]) == 1
-
-
-@pytest.mark.parametrize("retriever_with_docs", ["embedding"], indirect=True)
-def test_document_search_pipeline(retriever, document_store):
-    documents = [
-        {"content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    pipeline = DocumentSearchPipeline(retriever=retriever)
-    output = pipeline.run(query="How to test this?", params={"top_k": 4})
-    assert len(output.get("documents", [])) == 4
-
-    if isinstance(document_store, ElasticsearchDocumentStore):
-        output = pipeline.run(query="How to test this?", params={"filters": {"source": ["wiki2"]}, "top_k": 5})
-        assert len(output["documents"]) == 1
-
-
-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_join_document_pipeline(document_store_with_docs, reader):
-    es = ElasticsearchRetriever(document_store=document_store_with_docs)
-    dpr = DensePassageRetriever(
-        document_store=document_store_with_docs,
-        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
-        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
-        use_gpu=False,
-    )
-    document_store_with_docs.update_embeddings(dpr)
-
-    query = "Where does Carla live?"
-
-    # test merge without weights
-    join_node = JoinDocuments(join_mode="merge")
-    p = Pipeline()
-    p.add_node(component=es, name="R1", inputs=["Query"])
-    p.add_node(component=dpr, name="R2", inputs=["Query"])
-    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
-    results = p.run(query=query)
-    assert len(results["documents"]) == 3
-
-    # test merge with weights
-    join_node = JoinDocuments(join_mode="merge", weights=[1000, 1], top_k_join=2)
-    p = Pipeline()
-    p.add_node(component=es, name="R1", inputs=["Query"])
-    p.add_node(component=dpr, name="R2", inputs=["Query"])
-    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
-    results = p.run(query=query)
-    assert math.isclose(results["documents"][0].score, 0.5350644373470798, rel_tol=0.0001)
-    assert len(results["documents"]) == 2
-
-    # test concatenate
-    join_node = JoinDocuments(join_mode="concatenate")
-    p = Pipeline()
-    p.add_node(component=es, name="R1", inputs=["Query"])
-    p.add_node(component=dpr, name="R2", inputs=["Query"])
-    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
-    results = p.run(query=query)
-    assert len(results["documents"]) == 3
-
-    # test join_node with reader
-    join_node = JoinDocuments()
-    p = Pipeline()
-    p.add_node(component=es, name="R1", inputs=["Query"])
-    p.add_node(component=dpr, name="R2", inputs=["Query"])
-    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
-    p.add_node(component=reader, name="Reader", inputs=["Join"])
-    results = p.run(query=query)
-    #check whether correct answer is within top 2 predictions
-    assert results["answers"][0].answer == "Berlin" or results["answers"][1].answer == "Berlin"
-
-
-def test_debug_info_propagation():
-    class A(RootNode):
-        def run(self):
-            test = "A"
-            return {"test": test, "_debug": {"debug_key_a": "debug_value_a"}}, "output_1"
-
-    class B(RootNode):
-        def run(self, test):
-            test += "B"
-            return {"test": test, "_debug": {"debug_key_b": "debug_value_b"}}, "output_1"
-
-    class C(RootNode):
-        def run(self, test):
-            test += "C"
-            return {"test": test}, "output_1"
-
-    class D(RootNode):
-        def run(self, test, _debug):
-            test += "C"
-            assert _debug["B"]["debug_key_b"] == "debug_value_b"
-            return {"test": test}, "output_1"
-
-    pipeline = Pipeline()
-    pipeline.add_node(name="A", component=A(), inputs=["Query"])
-    pipeline.add_node(name="B", component=B(), inputs=["A"])
-    pipeline.add_node(name="C", component=C(), inputs=["B"])
-    pipeline.add_node(name="D", component=D(), inputs=["C"])
-    output = pipeline.run(query="test")
-    assert output["_debug"]["A"]["debug_key_a"] == "debug_value_a"
-    assert output["_debug"]["B"]["debug_key_b"] == "debug_value_b"
-
-
 def test_parallel_paths_in_pipeline_graph():
    class A(RootNode):
        def run(self):
@ -604,149 +277,6 @@ def test_parallel_paths_in_pipeline_graph_with_branching():
    assert output["output"] == "ACABEABD"


-def test_query_keyword_statement_classifier():
-    class KeywordOutput(RootNode):
-        outgoing_edges = 2
-
-        def run(self, **kwargs):
-            kwargs["output"] = "keyword"
-            return kwargs, "output_1"
-
-    class QuestionOutput(RootNode):
-        outgoing_edges = 2
-
-        def run(self, **kwargs):
-            kwargs["output"] = "question"
-            return kwargs, "output_2"
-
-    pipeline = Pipeline()
-    pipeline.add_node(
-        name="SkQueryKeywordQuestionClassifier",
-        component=SklearnQueryClassifier(),
-        inputs=["Query"],
-    )
-    pipeline.add_node(
-        name="KeywordNode",
-        component=KeywordOutput(),
-        inputs=["SkQueryKeywordQuestionClassifier.output_2"],
-    )
-    pipeline.add_node(
-        name="QuestionNode",
-        component=QuestionOutput(),
-        inputs=["SkQueryKeywordQuestionClassifier.output_1"],
-    )
-    output = pipeline.run(query="morse code")
-    assert output["output"] == "keyword"
-
-    output = pipeline.run(query="How old is John?")
-    assert output["output"] == "question"
-
-    pipeline = Pipeline()
-    pipeline.add_node(
-        name="TfQueryKeywordQuestionClassifier",
-        component=TransformersQueryClassifier(),
-        inputs=["Query"],
-    )
-    pipeline.add_node(
-        name="KeywordNode",
-        component=KeywordOutput(),
-        inputs=["TfQueryKeywordQuestionClassifier.output_2"],
-    )
-    pipeline.add_node(
-        name="QuestionNode",
-        component=QuestionOutput(),
-        inputs=["TfQueryKeywordQuestionClassifier.output_1"],
-    )
-    output = pipeline.run(query="morse code")
-    assert output["output"] == "keyword"
-
-    output = pipeline.run(query="How old is John?")
-    assert output["output"] == "question"
-
-
-@pytest.mark.parametrize(
-        "retriever,document_store",
-        [
-            ("embedding", "faiss"),
-            ("embedding", "milvus"),
-            ("embedding", "elasticsearch"),
-        ],
-        indirect=True,
-)
-def test_document_search_pipeline(retriever, document_store):
-    documents = [
-        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    docs_id: list = ["a", "b"]
-    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
-    list_of_documents = pipeline.run(document_ids=docs_id)
-
-    assert len(list_of_documents[0]) > 1
-    assert isinstance(list_of_documents, list)
-    assert len(list_of_documents) == len(docs_id)
-
-    for another_list in list_of_documents:
-        assert isinstance(another_list, list)
-        for document in another_list:
-            assert isinstance(document, Document)
-            assert isinstance(document.id, str)
-            assert isinstance(document.content, str)
-
-
-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
-def test_indexing_pipeline_with_classifier(document_store):
-    # test correct load of indexing pipeline from yaml
-    pipeline = Pipeline.load_from_yaml(
-        Path(__file__).parent/"samples"/"pipeline"/"test_pipeline.yaml", pipeline_name="indexing_pipeline_with_classifier"
-    )
-    pipeline.run(
-        file_paths=Path(__file__).parent/"samples"/"pdf"/"sample_pdf_1.pdf"
-    )
-    # test correct load of query pipeline from yaml
-    pipeline = Pipeline.load_from_yaml(
-        Path(__file__).parent/"samples"/"pipeline"/"test_pipeline.yaml", pipeline_name="query_pipeline"
-    )
-    prediction = pipeline.run(
-        query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}}
-    )
-    assert prediction["query"] == "Who made the PDF specification?"
-    assert prediction["answers"][0].answer == "Adobe Systems"
-    assert prediction["answers"][0].meta["classification"]["label"] == "joy"
-    assert "_debug" not in prediction.keys()
-
-
-@pytest.mark.elasticsearch
-@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
-def test_query_pipeline_with_document_classifier(document_store):
-    # test correct load of indexing pipeline from yaml
-    pipeline = Pipeline.load_from_yaml(
-        Path(__file__).parent/"samples"/"pipeline"/"test_pipeline.yaml", pipeline_name="indexing_pipeline"
-    )
-    pipeline.run(
-        file_paths=Path(__file__).parent/"samples"/"pdf"/"sample_pdf_1.pdf"
-    )
-    # test correct load of query pipeline from yaml
-    pipeline = Pipeline.load_from_yaml(
-        Path(__file__).parent/"samples"/"pipeline"/"test_pipeline.yaml", pipeline_name="query_pipeline_with_document_classifier"
-    )
-    prediction = pipeline.run(
-        query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}}
-    )
-    assert prediction["query"] == "Who made the PDF specification?"
-    assert prediction["answers"][0].answer == "Adobe Systems"
-    assert prediction["answers"][0].meta["classification"]["label"] == "joy"
-    assert "_debug" not in prediction.keys()
-
-
 def test_existing_faiss_document_store():
    clean_faiss_document_store()

--- a/test/test_pipeline_debug_and_validation.py
+++ b/test/test_pipeline_debug_and_validation.py
@ -0,0 +1,197 @@
+from pathlib import Path
+
+import json
+import pytest
+
+from haystack.pipelines import (
+    Pipeline,
+    RootNode,
+)
+from haystack.nodes import (
+    FARMReader,
+    ElasticsearchRetriever,
+)
+
+
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+def test_node_names_validation(document_store_with_docs, tmp_path):
+    pipeline = Pipeline()
+    pipeline.add_node(
+        component=ElasticsearchRetriever(document_store=document_store_with_docs), 
+        name="Retriever", 
+        inputs=["Query"])
+    pipeline.add_node(
+        component=FARMReader(model_name_or_path="deepset/minilm-uncased-squad2"), 
+        name="Reader", 
+        inputs=["Retriever"])
+
+    with pytest.raises(ValueError) as exc_info:
+        pipeline.run(
+            query="Who lives in Berlin?",
+            params={
+                "Reader": {"top_k": 3}, 
+                "non-existing-node": {"top_k": 10}, 
+                "top_k": 5,
+                "non-existing-global_param": "wrong",
+            },
+            debug=True
+        )
+    exception_raised = str(exc_info.value)
+    assert "non-existing-node" in exception_raised
+    assert "non-existing-global_param" in exception_raised
+    assert "Reader" not in exception_raised
+    assert "top_k" not in exception_raised
+
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+def test_debug_attributes_global(document_store_with_docs, tmp_path):
+
+    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
+    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
+
+    pipeline = Pipeline()
+    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
+    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])
+
+    prediction = pipeline.run(
+        query="Who lives in Berlin?",
+        params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}},
+        debug=True
+    )
+    assert "_debug" in prediction.keys()
+    assert "ESRetriever" in prediction["_debug"].keys()
+    assert "Reader" in prediction["_debug"].keys()
+    assert "input" in prediction["_debug"]["ESRetriever"].keys()
+    assert "output" in prediction["_debug"]["ESRetriever"].keys()
+    assert "input" in prediction["_debug"]["Reader"].keys()
+    assert "output" in prediction["_debug"]["Reader"].keys()
+    assert prediction["_debug"]["ESRetriever"]["input"]
+    assert prediction["_debug"]["ESRetriever"]["output"]
+    assert prediction["_debug"]["Reader"]["input"]
+    assert prediction["_debug"]["Reader"]["output"]
+
+    # Avoid circular reference: easiest way to detect those is to use json.dumps
+    json.dumps(prediction, default=str)
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+def test_debug_attributes_per_node(document_store_with_docs, tmp_path):
+
+    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
+    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
+
+    pipeline = Pipeline()
+    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
+    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])
+
+    prediction = pipeline.run(
+        query="Who lives in Berlin?",
+        params={
+            "ESRetriever": {"top_k": 10, "debug": True},
+            "Reader": {"top_k": 3}
+        },
+    )
+    assert "_debug" in prediction.keys()
+    assert "ESRetriever" in prediction["_debug"].keys()
+    assert "Reader" not in prediction["_debug"].keys()
+    assert "input" in prediction["_debug"]["ESRetriever"].keys()
+    assert "output" in prediction["_debug"]["ESRetriever"].keys()
+    assert prediction["_debug"]["ESRetriever"]["input"]
+    assert prediction["_debug"]["ESRetriever"]["output"]
+
+    # Avoid circular reference: easiest way to detect those is to use json.dumps
+    json.dumps(prediction, default=str)
+
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+def test_global_debug_attributes_override_node_ones(document_store_with_docs, tmp_path):
+
+    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
+    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
+
+    pipeline = Pipeline()
+    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
+    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])
+
+    prediction = pipeline.run(
+        query="Who lives in Berlin?",
+        params={
+            "ESRetriever": {"top_k": 10, "debug": True},
+            "Reader": {"top_k": 3, "debug": True}
+        },
+        debug=False
+    )
+    assert "_debug" not in prediction.keys()
+
+    prediction = pipeline.run(
+        query="Who lives in Berlin?",
+        params={
+            "ESRetriever": {"top_k": 10, "debug": False},
+            "Reader": {"top_k": 3, "debug": False}
+        },
+        debug=True
+    )
+    assert "_debug" in prediction.keys()
+    assert "ESRetriever" in prediction["_debug"].keys()
+    assert "Reader" in prediction["_debug"].keys()
+    assert "input" in prediction["_debug"]["ESRetriever"].keys()
+    assert "output" in prediction["_debug"]["ESRetriever"].keys()
+    assert "input" in prediction["_debug"]["Reader"].keys()
+    assert "output" in prediction["_debug"]["Reader"].keys()
+    assert prediction["_debug"]["ESRetriever"]["input"]
+    assert prediction["_debug"]["ESRetriever"]["output"]
+    assert prediction["_debug"]["Reader"]["input"]
+    assert prediction["_debug"]["Reader"]["output"]
+
+
+def test_invalid_run_args():
+    pipeline = Pipeline.load_from_yaml(
+        Path(__file__).parent/"samples"/"pipeline"/"test_pipeline.yaml", pipeline_name="query_pipeline"
+    )
+    with pytest.raises(Exception) as exc:
+        pipeline.run(params={"ESRetriever": {"top_k": 10}})
+    assert "run() missing 1 required positional argument: 'query'" in str(exc.value)
+
+    with pytest.raises(Exception) as exc:
+        pipeline.run(invalid_query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}})
+    assert "run() got an unexpected keyword argument 'invalid_query'" in str(exc.value)
+
+    with pytest.raises(Exception) as exc:
+        pipeline.run(query="Who made the PDF specification?", params={"ESRetriever": {"invalid": 10}})
+    assert "Invalid parameter 'invalid' for the node 'ESRetriever'" in str(exc.value)
+
+
+def test_debug_info_propagation():
+    class A(RootNode):
+        def run(self):
+            test = "A"
+            return {"test": test, "_debug": {"debug_key_a": "debug_value_a"}}, "output_1"
+
+    class B(RootNode):
+        def run(self, test):
+            test += "B"
+            return {"test": test, "_debug": {"debug_key_b": "debug_value_b"}}, "output_1"
+
+    class C(RootNode):
+        def run(self, test):
+            test += "C"
+            return {"test": test}, "output_1"
+
+    class D(RootNode):
+        def run(self, test, _debug):
+            test += "C"
+            assert _debug["B"]["debug_key_b"] == "debug_value_b"
+            return {"test": test}, "output_1"
+
+    pipeline = Pipeline()
+    pipeline.add_node(name="A", component=A(), inputs=["Query"])
+    pipeline.add_node(name="B", component=B(), inputs=["A"])
+    pipeline.add_node(name="C", component=C(), inputs=["B"])
+    pipeline.add_node(name="D", component=D(), inputs=["C"])
+    output = pipeline.run(query="test")
+    assert output["_debug"]["A"]["debug_key_a"] == "debug_value_a"
+    assert output["_debug"]["B"]["debug_key_b"] == "debug_value_b"
--- a/test/test_standard_pipelines.py
+++ b/test/test_standard_pipelines.py
@ -0,0 +1,326 @@
+from pathlib import Path
+
+import os
+import math
+import pytest
+
+from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
+from haystack.pipelines import (
+    Pipeline,
+    FAQPipeline,
+    DocumentSearchPipeline,
+    RootNode,
+    MostSimilarDocumentsPipeline,
+)
+from haystack.nodes import DensePassageRetriever,  ElasticsearchRetriever, SklearnQueryClassifier, TransformersQueryClassifier, JoinDocuments
+from haystack.schema import Document
+
+
+@pytest.mark.parametrize(
+    "retriever,document_store",
+    [("embedding", store) for store in ("memory", "faiss", "milvus", "elasticsearch")],
+    indirect=True,
+)
+def test_faq_pipeline(retriever, document_store):
+    """
+    Index five FAQ-style documents and query them through `FAQPipeline`.
+
+    The filtered query is only exercised when the store is an `ElasticsearchDocumentStore`.
+    """
+    faq_docs = [
+        {
+            "content": f"How to test module-{idx}?",
+            "meta": {"source": f"wiki{idx}", "answer": f"Using tests for module-{idx}"},
+        }
+        for idx in range(1, 6)
+    ]
+
+    document_store.write_documents(faq_docs)
+    document_store.update_embeddings(retriever)
+
+    faq = FAQPipeline(retriever=retriever)
+    question = "How to test this?"
+
+    result = faq.run(query=question, params={"Retriever": {"top_k": 3}})
+    assert len(result["answers"]) == 3
+    assert result["query"].startswith("How to")
+    assert result["answers"][0].answer.startswith("Using tests")
+
+    if not isinstance(document_store, ElasticsearchDocumentStore):
+        return
+
+    # Only one document carries source "wiki2", so the filter must leave a single answer.
+    filtered = faq.run(query=question, params={"Retriever": {"filters": {"source": ["wiki2"]}, "top_k": 5}})
+    assert len(filtered["answers"]) == 1
+
+
+@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
+def test_document_search_pipeline(retriever, document_store):
+    """
+    Index five sample documents and retrieve them through `DocumentSearchPipeline`.
+
+    The filtered query is only exercised when the store is an `ElasticsearchDocumentStore`.
+    """
+    sample_docs = [
+        {"content": f"Sample text for document-{idx}", "meta": {"source": f"wiki{idx}"}}
+        for idx in range(1, 6)
+    ]
+
+    document_store.write_documents(sample_docs)
+    document_store.update_embeddings(retriever)
+
+    search = DocumentSearchPipeline(retriever=retriever)
+    question = "How to test this?"
+
+    unfiltered = search.run(query=question, params={"top_k": 4})
+    assert len(unfiltered.get("documents", [])) == 4
+
+    if not isinstance(document_store, ElasticsearchDocumentStore):
+        return
+
+    # Only one document carries source "wiki2", so the filter must leave a single hit.
+    filtered = search.run(query=question, params={"filters": {"source": ["wiki2"]}, "top_k": 5})
+    assert len(filtered["documents"]) == 1
+
+
+@pytest.mark.parametrize(
+    "retriever,document_store",
+    [("embedding", store) for store in ("faiss", "milvus", "elasticsearch")],
+    indirect=True,
+)
+def test_most_similar_documents_pipeline(retriever, document_store):
+    """
+    Run `MostSimilarDocumentsPipeline` for two known document ids and check the
+    shape of the result: one list of `Document` objects per requested id.
+    """
+    # Only the first two documents get explicit ids; these are the ones queried below.
+    documents = [
+        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    query_ids = ["a", "b"]
+    similarity_pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
+    similar = similarity_pipeline.run(document_ids=query_ids)
+
+    # Outer shape: one result list per requested id, the first holding several documents.
+    assert isinstance(similar, list)
+    assert len(similar) == len(query_ids)
+    assert len(similar[0]) > 1
+
+    for hits in similar:
+        assert isinstance(hits, list)
+        for hit in hits:
+            assert isinstance(hit, Document)
+            assert isinstance(hit.id, str)
+            assert isinstance(hit.content, str)
+
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
+def test_join_document_pipeline(document_store_with_docs, reader):
+    """
+    Feed an Elasticsearch retriever and a DPR retriever into `JoinDocuments` and check
+    the joined output for the merge (with and without weights), concatenate and
+    default modes, the last one followed by a reader.
+    """
+    es = ElasticsearchRetriever(document_store=document_store_with_docs)
+    dpr = DensePassageRetriever(
+        document_store=document_store_with_docs,
+        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+        use_gpu=False,
+    )
+    document_store_with_docs.update_embeddings(dpr)
+
+    query = "Where does Carla live?"
+
+    def run_with_join(join_node, reader_node=None):
+        # Both retrievers read the query and feed the join node under test;
+        # a reader is appended after the join only when one is given.
+        joined = Pipeline()
+        joined.add_node(component=es, name="R1", inputs=["Query"])
+        joined.add_node(component=dpr, name="R2", inputs=["Query"])
+        joined.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
+        if reader_node is not None:
+            joined.add_node(component=reader_node, name="Reader", inputs=["Join"])
+        return joined.run(query=query)
+
+    # test merge without weights
+    results = run_with_join(JoinDocuments(join_mode="merge"))
+    assert len(results["documents"]) == 3
+
+    # test merge with weights
+    results = run_with_join(JoinDocuments(join_mode="merge", weights=[1000, 1], top_k_join=2))
+    assert math.isclose(results["documents"][0].score, 0.5350644373470798, rel_tol=0.0001)
+    assert len(results["documents"]) == 2
+
+    # test concatenate
+    results = run_with_join(JoinDocuments(join_mode="concatenate"))
+    assert len(results["documents"]) == 3
+
+    # test join_node with reader
+    results = run_with_join(JoinDocuments(), reader_node=reader)
+    # check whether the correct answer is within the top 2 predictions
+    assert any(results["answers"][rank].answer == "Berlin" for rank in (0, 1))
+
+
+def test_query_keyword_statement_classifier():
+    """
+    Route queries through `SklearnQueryClassifier` and then `TransformersQueryClassifier`
+    and check that "morse code" ends up in the keyword branch and "How old is John?"
+    in the question branch.
+    """
+    class KeywordOutput(RootNode):
+        outgoing_edges = 2
+
+        def run(self, **kwargs):
+            return {**kwargs, "output": "keyword"}, "output_1"
+
+    class QuestionOutput(RootNode):
+        outgoing_edges = 2
+
+        def run(self, **kwargs):
+            return {**kwargs, "output": "question"}, "output_2"
+
+    # Classes rather than instances, so each classifier is only created once its pipeline is built.
+    classifiers = (
+        ("SkQueryKeywordQuestionClassifier", SklearnQueryClassifier),
+        ("TfQueryKeywordQuestionClassifier", TransformersQueryClassifier),
+    )
+
+    for classifier_name, classifier_cls in classifiers:
+        pipeline = Pipeline()
+        pipeline.add_node(
+            name=classifier_name,
+            component=classifier_cls(),
+            inputs=["Query"],
+        )
+        # The keyword branch hangs off output_2, the question branch off output_1.
+        pipeline.add_node(
+            name="KeywordNode",
+            component=KeywordOutput(),
+            inputs=[f"{classifier_name}.output_2"],
+        )
+        pipeline.add_node(
+            name="QuestionNode",
+            component=QuestionOutput(),
+            inputs=[f"{classifier_name}.output_1"],
+        )
+
+        keyword_result = pipeline.run(query="morse code")
+        assert keyword_result["output"] == "keyword"
+
+        question_result = pipeline.run(query="How old is John?")
+        assert question_result["output"] == "question"
+
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
+def test_indexing_pipeline_with_classifier(document_store):
+    """
+    Index a sample PDF with the YAML-defined `indexing_pipeline_with_classifier`, then
+    query it with `query_pipeline` and check the answer and its classification label.
+    """
+    samples_dir = Path(__file__).parent/"samples"
+    pipeline_yaml = samples_dir/"pipeline"/"test_pipeline.yaml"
+
+    # test correct load of indexing pipeline from yaml
+    indexing = Pipeline.load_from_yaml(pipeline_yaml, pipeline_name="indexing_pipeline_with_classifier")
+    indexing.run(file_paths=samples_dir/"pdf"/"sample_pdf_1.pdf")
+
+    # test correct load of query pipeline from yaml
+    querying = Pipeline.load_from_yaml(pipeline_yaml, pipeline_name="query_pipeline")
+    question = "Who made the PDF specification?"
+    prediction = querying.run(query=question, params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}})
+
+    assert prediction["query"] == "Who made the PDF specification?"
+    top_answer = prediction["answers"][0]
+    assert top_answer.answer == "Adobe Systems"
+    assert top_answer.meta["classification"]["label"] == "joy"
+    assert "_debug" not in prediction.keys()
+
+
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
+def test_query_pipeline_with_document_classifier(document_store):
+    """
+    Index a sample PDF with the YAML-defined `indexing_pipeline`, then query it with
+    `query_pipeline_with_document_classifier` and check the answer and its
+    classification label.
+    """
+    samples_dir = Path(__file__).parent/"samples"
+    pipeline_yaml = samples_dir/"pipeline"/"test_pipeline.yaml"
+
+    # test correct load of indexing pipeline from yaml
+    indexing = Pipeline.load_from_yaml(pipeline_yaml, pipeline_name="indexing_pipeline")
+    indexing.run(file_paths=samples_dir/"pdf"/"sample_pdf_1.pdf")
+
+    # test correct load of query pipeline from yaml
+    querying = Pipeline.load_from_yaml(pipeline_yaml, pipeline_name="query_pipeline_with_document_classifier")
+    question = "Who made the PDF specification?"
+    prediction = querying.run(query=question, params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}})
+
+    assert prediction["query"] == "Who made the PDF specification?"
+    top_answer = prediction["answers"][0]
+    assert top_answer.answer == "Adobe Systems"
+    assert top_answer.meta["classification"]["label"] == "joy"
+    assert "_debug" not in prediction.keys()
+
+
+def test_existing_faiss_document_store():
+    """
+    Index a sample PDF into a FAISS document store, save the store to disk, then load
+    a retrieval pipeline from YAML and check that it sees the indexed documents.
+
+    The files written to the working directory are removed before the test and,
+    via `finally`, afterwards as well, so a failing assertion does not leave them behind.
+    """
+    clean_faiss_document_store()
+
+    try:
+        pipeline = Pipeline.load_from_yaml(
+            Path(__file__).parent/"samples"/"pipeline"/"test_pipeline_faiss_indexing.yaml", pipeline_name="indexing_pipeline"
+        )
+        pipeline.run(
+            file_paths=Path(__file__).parent/"samples"/"pdf"/"sample_pdf_1.pdf"
+        )
+
+        new_document_store = pipeline.get_document_store()
+        new_document_store.save('existing_faiss_document_store')
+
+        # test correct load of query pipeline from yaml
+        # NOTE(review): presumably the retrieval YAML points at the store saved above - confirm against the YAML.
+        pipeline = Pipeline.load_from_yaml(
+            Path(__file__).parent/"samples"/"pipeline"/"test_pipeline_faiss_retrieval.yaml", pipeline_name="query_pipeline"
+        )
+
+        retriever = pipeline.get_node("DPRRetriever")
+        existing_document_store = retriever.document_store
+        faiss_index = existing_document_store.faiss_indexes['document']
+        assert faiss_index.ntotal == 2
+
+        prediction = pipeline.run(
+            query="Who made the PDF specification?", params={"DPRRetriever": {"top_k": 10}}
+        )
+
+        assert prediction["query"] == "Who made the PDF specification?"
+        assert len(prediction["documents"]) == 2
+    finally:
+        # Runs on success and on failure alike.
+        clean_faiss_document_store()
+
+
+def clean_faiss_document_store():
+    """Remove the FAISS index file, its JSON companion and the SQLite db from the working directory, if present."""
+    leftovers = (
+        'existing_faiss_document_store',
+        'existing_faiss_document_store.json',
+        'faiss_document_store.db',
+    )
+    for leftover in leftovers:
+        if Path(leftover).exists():
+            os.remove(leftover)