mirror of https://github.com/deepset-ai/haystack.git (synced 2025-11-03 19:29:32 +00:00)
	test: Add end-to-end test for dense doc search 2.0 (#6102)
* draft e2e test for dense doc search
* fix import path
* add DocumentJoiner
* update converter import; fix getting filled doc store
* add text embedder
* add sample txt and pdf for preview e2e tests
* run the query pipeline before serializing
* define samples path

---------

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
parent c44e2cf49b
commit 67780a62d5
@@ -1,4 +1,11 @@
+from pathlib import Path
+
+import pytest
+
 from haystack.preview.testing.test_utils import set_all_seeds
 
 set_all_seeds(0)
+
+
+@pytest.fixture
+def samples_path():
+    return Path(__file__).parent / "samples"
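The samples_path fixture added above is resolved through pytest's name-based injection, so any test in this e2e suite can request the shared samples directory simply by declaring the argument. A minimal sketch (the test name below is hypothetical, for illustration only):

def test_samples_dir_has_fixture_files(samples_path):
    # Hypothetical check: the two sample files added in this commit live under e2e/preview/samples/.
    assert (samples_path / "doc_1.txt").exists()
    assert (samples_path / "sample_pdf_1.pdf").exists()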
							
								
								
									
84  e2e/preview/pipelines/test_dense_doc_search.py  Normal file

@@ -0,0 +1,84 @@
+import json
+from pathlib import Path
+
+from haystack.preview import Pipeline
+from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.preview.components.converters import PyPDFToDocument, TextFileToDocument
+from haystack.preview.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.preview.components.routers import FileTypeRouter, DocumentJoiner
+from haystack.preview.components.writers import DocumentWriter
+from haystack.preview.document_stores import InMemoryDocumentStore
+from haystack.preview.components.retrievers import InMemoryEmbeddingRetriever
+
+
+def test_dense_doc_search_pipeline(tmp_path, samples_path):
+    # Create the indexing pipeline
+    indexing_pipeline = Pipeline()
+    indexing_pipeline.add_component(
+        instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router"
+    )
+    indexing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
+    indexing_pipeline.add_component(instance=PyPDFToDocument(), name="pdf_file_converter")
+    indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
+    indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
+    indexing_pipeline.add_component(
+        instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter"
+    )
+    indexing_pipeline.add_component(
+        instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="embedder",
+    )
+    indexing_pipeline.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="writer")
+
+    indexing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
+    indexing_pipeline.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
+    indexing_pipeline.connect("text_file_converter.documents", "joiner.documents")
+    indexing_pipeline.connect("pdf_file_converter.documents", "joiner.documents")
+    indexing_pipeline.connect("joiner.documents", "cleaner.documents")
+    indexing_pipeline.connect("cleaner.documents", "splitter.documents")
+    indexing_pipeline.connect("splitter.documents", "embedder.documents")
+    indexing_pipeline.connect("embedder.documents", "writer.documents")
+
+    # Draw the indexing pipeline
+    indexing_pipeline.draw(tmp_path / "test_dense_doc_search_indexing_pipeline.png")
+
+    # Serialize the indexing pipeline to JSON
+    with open(tmp_path / "test_dense_doc_search_indexing_pipeline.json", "w") as f:
+        print(json.dumps(indexing_pipeline.to_dict(), indent=4))
+        json.dump(indexing_pipeline.to_dict(), f)
+
+    # Load the indexing pipeline back
+    with open(tmp_path / "test_dense_doc_search_indexing_pipeline.json", "r") as f:
+        indexing_pipeline = Pipeline.from_dict(json.load(f))
+
+    indexing_result = indexing_pipeline.run({"file_type_router": {"sources": samples_path.iterdir()}})
+    filled_document_store = indexing_pipeline.get_component("writer").document_store
+
+    assert indexing_result["writer"]["documents_written"] == 2
+    assert filled_document_store.count_documents() == 2
+
+    # Create the querying pipeline
+    query_pipeline = Pipeline()
+    query_pipeline.add_component(
+        instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="text_embedder",
+    )
+    query_pipeline.add_component(
+        instance=InMemoryEmbeddingRetriever(document_store=filled_document_store, top_k=20), name="embedding_retriever"
+    )
+    query_pipeline.connect("text_embedder", "embedding_retriever")
+
+    querying_result = query_pipeline.run({"text_embedder": {"text": "Who lives in Rome?"}})
+    assert querying_result["embedding_retriever"]["documents"][0].content == "My name is Giorgio and I live in Rome."
+
+    # Draw the querying pipeline
+    query_pipeline.draw(tmp_path / "test_dense_doc_search_query_pipeline.png")
+
+    # Serialize the querying pipeline to JSON
+    with open(tmp_path / "test_dense_doc_search_query_pipeline.json", "w") as f:
+        print(json.dumps(query_pipeline.to_dict(), indent=4))
+        json.dump(query_pipeline.to_dict(), f)
+
+    # Load the querying pipeline back
+    with open(tmp_path / "test_dense_doc_search_query_pipeline.json", "r") as f:
+        query_pipeline = Pipeline.from_dict(json.load(f))
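To run the new end-to-end test in isolation, pytest can be pointed at the file directly. A minimal sketch, assuming the repository root as the working directory and the preview/e2e dependencies installed:

import pytest

# Invoke pytest programmatically on the new test module; the return value is the exit code.
raise SystemExit(pytest.main(["e2e/preview/pipelines/test_dense_doc_search.py", "-v"]))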
@@ -115,11 +115,11 @@ def test_embedding_retrieval_rag_pipeline(tmp_path):
     rag_pipeline.draw(tmp_path / "test_embedding_rag_pipeline.png")
 
     # Serialize the pipeline to JSON
-    with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:
+    with open(tmp_path / "test_embedding_rag_pipeline.json", "w") as f:
         json.dump(rag_pipeline.to_dict(), f)
 
     # Load the pipeline back
-    with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:
+    with open(tmp_path / "test_embedding_rag_pipeline.json", "r") as f:
         rag_pipeline = Pipeline.from_dict(json.load(f))
 
     # Populate the document store
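The hunk above corrects the JSON file name used by the embedding RAG test's serialization round trip, which previously reused the BM25 test's file name. The round trip itself is the same to_dict/from_dict pattern used in the new dense doc search test. A minimal sketch of that pattern, assuming only the preview Pipeline import shown earlier:

import json

from haystack.preview import Pipeline

# Serialize a pipeline to JSON and rebuild an equivalent one from it (illustrative only).
pipeline = Pipeline()
restored = Pipeline.from_dict(json.loads(json.dumps(pipeline.to_dict())))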
				
1  e2e/preview/samples/doc_1.txt  Normal file

@@ -0,0 +1 @@
+My name is Giorgio and I live in Rome.
BIN  e2e/preview/samples/sample_pdf_1.pdf  Normal file

Binary file not shown.