chore: refactor pipeline tests for e2e testing (#5576)

* enable pipeline filder in e2e * merge standard pipeline tests with stanrdard pipeline batch tests * merge summarization tests into standard pipelines tests * Update test_standard_pipelines.py * black
2026-01-06 03:57:19 +00:00 · 2023-08-29 11:22:39 +02:00 · 2023-08-29 11:22:39 +02:00 · 5985b6d358
commit 5985b6d358
parent f13b37db24
4 changed files with 183 additions and 192 deletions
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@ -17,6 +17,7 @@ jobs:
      matrix:
        folder:
          - "document_search"
+          - "pipelines"

    runs-on: ubuntu-latest

--- a/e2e/pipelines/test_standard_pipelines.py
+++ b/e2e/pipelines/test_standard_pipelines.py
@ -10,18 +10,16 @@ from haystack.pipelines import (
    DocumentSearchPipeline,
    MostSimilarDocumentsPipeline,
    WebQAPipeline,
+    SearchSummarizationPipeline,
 )
-from haystack.nodes import EmbeddingRetriever, PromptNode
+from haystack.nodes import EmbeddingRetriever, PromptNode, BM25Retriever, TransformersSummarizer
 from haystack.schema import Document


 def test_faq_pipeline():
    documents = [
-        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
-        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
-        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
-        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
-        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
+        {"content": f"How to test module-{i}?", "meta": {"source": f"wiki{i}", "answer": f"Using tests for module-{i}"}}
+        for i in range(1, 6)
    ]
    document_store = InMemoryDocumentStore()
    retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert")
@ -165,3 +163,181 @@ def test_webqa_pipeline():
    assert len(result["results"]) == 1
    answer = result["results"][0]
    assert "Stark" in answer or "NED" in answer
+
+
+def test_faq_pipeline_batch():
+    documents = [
+        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
+        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
+        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
+        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
+        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    pipeline = FAQPipeline(retriever=retriever)
+
+    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"Retriever": {"top_k": 3}})
+    assert len(output["answers"]) == 2  # 2 queries
+    assert len(output["answers"][0]) == 3  # 3 answers per query
+    assert output["queries"][0].startswith("How to")
+    assert output["answers"][0][0].answer.startswith("Using tests")
+
+
+def test_document_search_pipeline_batch():
+    documents = [
+        {"content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    pipeline = DocumentSearchPipeline(retriever=retriever)
+    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"top_k": 4})
+    assert len(output["documents"]) == 2  # 2 queries
+    assert len(output["documents"][0]) == 4  # 4 docs per query
+
+
+def test_most_similar_documents_pipeline_batch():
+    documents = [
+        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    docs_id: list = ["a", "b"]
+    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
+    list_of_documents = pipeline.run_batch(document_ids=docs_id)
+
+    assert len(list_of_documents[0]) > 1
+    assert isinstance(list_of_documents, list)
+    assert len(list_of_documents) == len(docs_id)
+
+    for another_list in list_of_documents:
+        assert isinstance(another_list, list)
+        for document in another_list:
+            assert isinstance(document, Document)
+            assert isinstance(document.id, str)
+            assert isinstance(document.content, str)
+
+
+def test_most_similar_documents_pipeline_with_filters_batch():
+    documents = [
+        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    docs_id: list = ["a", "b"]
+    filters = {"source": ["wiki3", "wiki4", "wiki5"]}
+    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
+    list_of_documents = pipeline.run_batch(document_ids=docs_id, filters=filters)
+
+    assert len(list_of_documents[0]) > 1
+    assert isinstance(list_of_documents, list)
+    assert len(list_of_documents) == len(docs_id)
+
+    for another_list in list_of_documents:
+        assert isinstance(another_list, list)
+        for document in another_list:
+            assert isinstance(document, Document)
+            assert isinstance(document.id, str)
+            assert isinstance(document.content, str)
+            assert document.meta["source"] in ["wiki3", "wiki4", "wiki5"]
+
+
+def test_summarization_pipeline():
+    docs = [
+        Document(
+            content="""
+    PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
+    The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
+    by the shutoffs which were expected to last through at least midday tomorrow.
+    """
+        ),
+        Document(
+            content="""
+    The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest
+    structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction,
+    the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a
+    title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first
+    structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower
+    in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel
+    Tower is the second tallest free-standing structure in France after the Millau Viaduct.
+    """
+        ),
+    ]
+    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
+
+    ds = InMemoryDocumentStore(use_bm25=True)
+    retriever = BM25Retriever(document_store=ds)
+    ds.write_documents(docs)
+
+    query = "Eiffel Tower"
+    pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True)
+    output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}})
+    answers = output["answers"]
+    assert len(answers) == 1
+    assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip()
+
+
+def test_summarization_pipeline_one_summary():
+    split_docs = [
+        Document(
+            content="""
+    The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris.
+    Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the
+    Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler
+    Building in New York City was finished in 1930.
+    """
+        ),
+        Document(
+            content="""
+    It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the
+    top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters,
+    the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.
+    """
+        ),
+    ]
+    ds = InMemoryDocumentStore(use_bm25=True)
+    retriever = BM25Retriever(document_store=ds)
+    ds.write_documents(split_docs)
+    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
+
+    query = "Eiffel Tower"
+    pipeline = SearchSummarizationPipeline(
+        retriever=retriever, summarizer=summarizer, generate_single_summary=True, return_in_answer_format=True
+    )
+    output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}})
+    answers = output["answers"]
+    assert len(answers) == 1
+    assert answers[0]["answer"].strip() == "The Eiffel Tower was built in 1924 in Paris, France."
--- a/e2e/pipelines/test_standard_pipelines_batch.py
+++ b/e2e/pipelines/test_standard_pipelines_batch.py
@ -1,114 +0,0 @@
-from haystack.document_stores import InMemoryDocumentStore
-from haystack.pipelines import FAQPipeline, DocumentSearchPipeline, MostSimilarDocumentsPipeline
-from haystack.nodes import EmbeddingRetriever
-from haystack.schema import Document
-
-
-def test_faq_pipeline_batch():
-    documents = [
-        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
-        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
-        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
-        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
-        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    pipeline = FAQPipeline(retriever=retriever)
-
-    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"Retriever": {"top_k": 3}})
-    assert len(output["answers"]) == 2  # 2 queries
-    assert len(output["answers"][0]) == 3  # 3 answers per query
-    assert output["queries"][0].startswith("How to")
-    assert output["answers"][0][0].answer.startswith("Using tests")
-
-
-def test_document_search_pipeline_batch():
-    documents = [
-        {"content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    pipeline = DocumentSearchPipeline(retriever=retriever)
-    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"top_k": 4})
-    assert len(output["documents"]) == 2  # 2 queries
-    assert len(output["documents"][0]) == 4  # 4 docs per query
-
-
-def test_most_similar_documents_pipeline_batch():
-    documents = [
-        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    docs_id: list = ["a", "b"]
-    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
-    list_of_documents = pipeline.run_batch(document_ids=docs_id)
-
-    assert len(list_of_documents[0]) > 1
-    assert isinstance(list_of_documents, list)
-    assert len(list_of_documents) == len(docs_id)
-
-    for another_list in list_of_documents:
-        assert isinstance(another_list, list)
-        for document in another_list:
-            assert isinstance(document, Document)
-            assert isinstance(document.id, str)
-            assert isinstance(document.content, str)
-
-
-def test_most_similar_documents_pipeline_with_filters_batch():
-    documents = [
-        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store = InMemoryDocumentStore()
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    docs_id: list = ["a", "b"]
-    filters = {"source": ["wiki3", "wiki4", "wiki5"]}
-    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
-    list_of_documents = pipeline.run_batch(document_ids=docs_id, filters=filters)
-
-    assert len(list_of_documents[0]) > 1
-    assert isinstance(list_of_documents, list)
-    assert len(list_of_documents) == len(docs_id)
-
-    for another_list in list_of_documents:
-        assert isinstance(another_list, list)
-        for document in another_list:
-            assert isinstance(document, Document)
-            assert isinstance(document.id, str)
-            assert isinstance(document.content, str)
-            assert document.meta["source"] in ["wiki3", "wiki4", "wiki5"]
--- a/e2e/pipelines/test_summarization_pipelines.py
+++ b/e2e/pipelines/test_summarization_pipelines.py
@ -1,72 +0,0 @@
-from haystack import Document
-from haystack.document_stores import InMemoryDocumentStore
-from haystack.pipelines import SearchSummarizationPipeline
-from haystack.nodes import BM25Retriever, TransformersSummarizer
-
-
-def test_summarization_pipeline():
-    docs = [
-        Document(
-            content="""
-    PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
-    The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
-    by the shutoffs which were expected to last through at least midday tomorrow.
-    """
-        ),
-        Document(
-            content="""
-    The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest
-    structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction,
-    the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a
-    title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first
-    structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower
-    in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel
-    Tower is the second tallest free-standing structure in France after the Millau Viaduct.
-    """
-        ),
-    ]
-    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
-
-    ds = InMemoryDocumentStore(use_bm25=True)
-    retriever = BM25Retriever(document_store=ds)
-    ds.write_documents(docs)
-
-    query = "Eiffel Tower"
-    pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True)
-    output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}})
-    answers = output["answers"]
-    assert len(answers) == 1
-    assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip()
-
-
-def test_summarization_pipeline_one_summary():
-    split_docs = [
-        Document(
-            content="""
-    The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris.
-    Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the
-    Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler
-    Building in New York City was finished in 1930.
-    """
-        ),
-        Document(
-            content="""
-    It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the
-    top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters,
-    the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.
-    """
-        ),
-    ]
-    ds = InMemoryDocumentStore(use_bm25=True)
-    retriever = BM25Retriever(document_store=ds)
-    ds.write_documents(split_docs)
-    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
-
-    query = "Eiffel Tower"
-    pipeline = SearchSummarizationPipeline(
-        retriever=retriever, summarizer=summarizer, generate_single_summary=True, return_in_answer_format=True
-    )
-    output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}})
-    answers = output["answers"]
-    assert len(answers) == 1
-    assert answers[0]["answer"].strip() == "The Eiffel Tower was built in 1924 in Paris, France."