mirror of https://github.com/deepset-ai/haystack.git (synced 2025-11-03 19:29:32 +00:00)
	test: Add end-to-end test for dense doc search 2.0 (#6102)
* draft e2e test for dense doc search
* fix import path
* add DocumentJoiner
* update converter import; fix getting filled doc store
* add text embedder
* add sample txt and pdf for preview e2e tests
* run the query pipeline before serializing
* define samples path

---------

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
parent c44e2cf49b
commit 67780a62d5
@@ -1,4 +1,11 @@
+from pathlib import Path
+
+import pytest
+
 from haystack.preview.testing.test_utils import set_all_seeds
 
 set_all_seeds(0)
+
+
+@pytest.fixture
+def samples_path():
+    return Path(__file__).parent / "samples"
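The samples_path fixture added above is resolved through pytest's name-based injection, so any test in this e2e suite can request the shared samples directory simply by declaring the argument. A minimal sketch (the test name below is hypothetical, for illustration only):

def test_samples_dir_has_fixture_files(samples_path):
    # Hypothetical check: the two sample files added in this commit live under e2e/preview/samples/.
    assert (samples_path / "doc_1.txt").exists()
    assert (samples_path / "sample_pdf_1.pdf").exists()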
							
								
								
									
84  e2e/preview/pipelines/test_dense_doc_search.py  Normal file

@@ -0,0 +1,84 @@
+import json
+from pathlib import Path
+
+from haystack.preview import Pipeline
+from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.preview.components.converters import PyPDFToDocument, TextFileToDocument
+from haystack.preview.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.preview.components.routers import FileTypeRouter, DocumentJoiner
+from haystack.preview.components.writers import DocumentWriter
+from haystack.preview.document_stores import InMemoryDocumentStore
+from haystack.preview.components.retrievers import InMemoryEmbeddingRetriever
+
+
+def test_dense_doc_search_pipeline(tmp_path, samples_path):
+    # Create the indexing pipeline
+    indexing_pipeline = Pipeline()
+    indexing_pipeline.add_component(
+        instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router"
+    )
+    indexing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
+    indexing_pipeline.add_component(instance=PyPDFToDocument(), name="pdf_file_converter")
+    indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
+    indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
+    indexing_pipeline.add_component(
+        instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter"
+    )
+    indexing_pipeline.add_component(
+        instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="embedder",
+    )
+    indexing_pipeline.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="writer")
+
+    indexing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
+    indexing_pipeline.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
+    indexing_pipeline.connect("text_file_converter.documents", "joiner.documents")
+    indexing_pipeline.connect("pdf_file_converter.documents", "joiner.documents")
+    indexing_pipeline.connect("joiner.documents", "cleaner.documents")
+    indexing_pipeline.connect("cleaner.documents", "splitter.documents")
+    indexing_pipeline.connect("splitter.documents", "embedder.documents")
+    indexing_pipeline.connect("embedder.documents", "writer.documents")
+
+    # Draw the indexing pipeline
+    indexing_pipeline.draw(tmp_path / "test_dense_doc_search_indexing_pipeline.png")
+
+    # Serialize the indexing pipeline to JSON
+    with open(tmp_path / "test_dense_doc_search_indexing_pipeline.json", "w") as f:
+        print(json.dumps(indexing_pipeline.to_dict(), indent=4))
+        json.dump(indexing_pipeline.to_dict(), f)
+
+    # Load the indexing pipeline back
+    with open(tmp_path / "test_dense_doc_search_indexing_pipeline.json", "r") as f:
+        indexing_pipeline = Pipeline.from_dict(json.load(f))
+
+    indexing_result = indexing_pipeline.run({"file_type_router": {"sources": samples_path.iterdir()}})
+    filled_document_store = indexing_pipeline.get_component("writer").document_store
+
+    assert indexing_result["writer"]["documents_written"] == 2
+    assert filled_document_store.count_documents() == 2
+
+    # Create the querying pipeline
+    query_pipeline = Pipeline()
+    query_pipeline.add_component(
+        instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="text_embedder",
+    )
+    query_pipeline.add_component(
+        instance=InMemoryEmbeddingRetriever(document_store=filled_document_store, top_k=20), name="embedding_retriever"
+    )
+    query_pipeline.connect("text_embedder", "embedding_retriever")
+
+    querying_result = query_pipeline.run({"text_embedder": {"text": "Who lives in Rome?"}})
+    assert querying_result["embedding_retriever"]["documents"][0].content == "My name is Giorgio and I live in Rome."
+
+    # Draw the querying pipeline
+    query_pipeline.draw(tmp_path / "test_dense_doc_search_query_pipeline.png")
+
+    # Serialize the querying pipeline to JSON
+    with open(tmp_path / "test_dense_doc_search_query_pipeline.json", "w") as f:
+        print(json.dumps(query_pipeline.to_dict(), indent=4))
+        json.dump(query_pipeline.to_dict(), f)
+
+    # Load the querying pipeline back
+    with open(tmp_path / "test_dense_doc_search_query_pipeline.json", "r") as f:
+        query_pipeline = Pipeline.from_dict(json.load(f))
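To run the new end-to-end test in isolation, pytest can be pointed at the file directly. A minimal sketch, assuming the repository root as the working directory and the preview/e2e dependencies installed:

import pytest

# Invoke pytest programmatically on the new test module; the return value is the exit code.
raise SystemExit(pytest.main(["e2e/preview/pipelines/test_dense_doc_search.py", "-v"]))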
@@ -115,11 +115,11 @@ def test_embedding_retrieval_rag_pipeline(tmp_path):
     rag_pipeline.draw(tmp_path / "test_embedding_rag_pipeline.png")
 
     # Serialize the pipeline to JSON
-    with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:
+    with open(tmp_path / "test_embedding_rag_pipeline.json", "w") as f:
         json.dump(rag_pipeline.to_dict(), f)
 
     # Load the pipeline back
-    with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:
+    with open(tmp_path / "test_embedding_rag_pipeline.json", "r") as f:
         rag_pipeline = Pipeline.from_dict(json.load(f))
 
     # Populate the document store
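The hunk above corrects the JSON file name used by the embedding RAG test's serialization round trip, which previously reused the BM25 test's file name. The round trip itself is the same to_dict/from_dict pattern used in the new dense doc search test. A minimal sketch of that pattern, assuming only the preview Pipeline import shown earlier:

import json

from haystack.preview import Pipeline

# Serialize a pipeline to JSON and rebuild an equivalent one from it (illustrative only).
pipeline = Pipeline()
restored = Pipeline.from_dict(json.loads(json.dumps(pipeline.to_dict())))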
				
1  e2e/preview/samples/doc_1.txt  Normal file

@@ -0,0 +1 @@
+My name is Giorgio and I live in Rome.
BIN  e2e/preview/samples/sample_pdf_1.pdf  Normal file

Binary file not shown.