mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-20 20:39:04 +00:00 
			
		
		
		
	fix: Allowing InMemStore and FAISSDocStore for indexing using single worker (#3868)
* Allowing InMemStore and FAISSDocStore for indexing using single worker YAML config * unified pipeline & doc store loading * fix pylint warning * separated tests * removed unnecessay caplog
This commit is contained in:
		
							parent
							
								
									12e057837b
								
							
						
					
					
						commit
						dad7b12874
					
				| @ -13,9 +13,32 @@ from rest_api.controller.utils import RequestLimiter | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would | ||||
| # end up with different indices. The same applies for InMemoryDocumentStore. | ||||
| UNSUPPORTED_DOC_STORES = (FAISSDocumentStore, InMemoryDocumentStore) | ||||
| # Each instance of FAISSDocumentStore creates an in-memory FAISS index, | ||||
| # the Indexing & Query Pipelines will end up with different indices for each worker. | ||||
| # The same applies for InMemoryDocumentStore. | ||||
| SINGLE_PROCESS_DOC_STORES = (FAISSDocumentStore, InMemoryDocumentStore) | ||||
| 
 | ||||
| 
 | ||||
| def _load_pipeline(pipeline_yaml_path, indexing_pipeline_name): | ||||
|     # Load pipeline (if available) | ||||
|     try: | ||||
|         pipeline = Pipeline.load_from_yaml(Path(pipeline_yaml_path), pipeline_name=indexing_pipeline_name) | ||||
|         logging.info("Loaded pipeline nodes: %s", pipeline.graph.nodes.keys()) | ||||
|         document_store = _get_pipeline_doc_store(pipeline) | ||||
|     except PipelineConfigError as e: | ||||
|         pipeline, document_store = None, None | ||||
|         logger.error( | ||||
|             "Error loading %s pipeline from %s. \n %s\n", indexing_pipeline_name, pipeline_yaml_path, e.message | ||||
|         ) | ||||
|     return pipeline, document_store | ||||
| 
 | ||||
| 
 | ||||
| def _get_pipeline_doc_store(pipeline): | ||||
|     document_store = pipeline.get_document_store() | ||||
|     logging.info("Loaded docstore: %s", document_store) | ||||
|     if isinstance(document_store, SINGLE_PROCESS_DOC_STORES): | ||||
|         logger.warning("FAISSDocumentStore or InMemoryDocumentStore should only be used with 1 worker.") | ||||
|     return document_store | ||||
| 
 | ||||
| 
 | ||||
| def setup_pipelines() -> Dict[str, Any]: | ||||
| @ -24,14 +47,9 @@ def setup_pipelines() -> Dict[str, Any]: | ||||
| 
 | ||||
|     pipelines = {} | ||||
| 
 | ||||
|     # Load query pipeline | ||||
|     query_pipeline = Pipeline.load_from_yaml(Path(config.PIPELINE_YAML_PATH), pipeline_name=config.QUERY_PIPELINE_NAME) | ||||
|     logging.info("Loaded pipeline nodes: %s", query_pipeline.graph.nodes.keys()) | ||||
|     # Load query pipeline & document store | ||||
|     query_pipeline, document_store = _load_pipeline(config.PIPELINE_YAML_PATH, config.QUERY_PIPELINE_NAME) | ||||
|     pipelines["query_pipeline"] = query_pipeline | ||||
| 
 | ||||
|     # Find document store | ||||
|     document_store = query_pipeline.get_document_store() | ||||
|     logging.info("Loaded docstore: %s", document_store) | ||||
|     pipelines["document_store"] = document_store | ||||
| 
 | ||||
|     # Setup concurrency limiter | ||||
| @ -39,24 +57,11 @@ def setup_pipelines() -> Dict[str, Any]: | ||||
|     logging.info("Concurrent requests per worker: %s", config.CONCURRENT_REQUEST_PER_WORKER) | ||||
|     pipelines["concurrency_limiter"] = concurrency_limiter | ||||
| 
 | ||||
|     # Load indexing pipeline (if available) | ||||
|     try: | ||||
|         indexing_pipeline = Pipeline.load_from_yaml( | ||||
|             Path(config.PIPELINE_YAML_PATH), pipeline_name=config.INDEXING_PIPELINE_NAME | ||||
|         ) | ||||
|         docstore = indexing_pipeline.get_document_store() | ||||
|         if isinstance(docstore, UNSUPPORTED_DOC_STORES): | ||||
|             indexing_pipeline = None | ||||
|             raise PipelineConfigError( | ||||
|                 "Indexing pipelines with FAISSDocumentStore or InMemoryDocumentStore are not supported by the REST APIs." | ||||
|             ) | ||||
| 
 | ||||
|     except PipelineConfigError as e: | ||||
|         indexing_pipeline = None | ||||
|         logger.error("%s\nFile Upload API will not be available.", e.message) | ||||
| 
 | ||||
|     finally: | ||||
|         pipelines["indexing_pipeline"] = indexing_pipeline | ||||
|     # Load indexing pipeline | ||||
|     index_pipeline, _ = _load_pipeline(config.PIPELINE_YAML_PATH, config.INDEXING_PIPELINE_NAME) | ||||
|     if not index_pipeline: | ||||
|         logger.warning("Indexing Pipeline is not setup. File Upload API will not be available.") | ||||
|     pipelines["indexing_pipeline"] = index_pipeline | ||||
| 
 | ||||
|     # Create directory for uploaded files | ||||
|     os.makedirs(config.FILE_UPLOAD_PATH, exist_ok=True) | ||||
|  | ||||
							
								
								
									
										17
									
								
								rest_api/test/samples/test.bogus_pipeline.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								rest_api/test/samples/test.bogus_pipeline.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,17 @@ | ||||
| version: 'ignore' | ||||
| 
 | ||||
| components: | ||||
|   - name: InMemoryDocumentStore | ||||
|     params: {} | ||||
|     # Wrong DocumentStore Type | ||||
|     type: MyOwnDocumentStore | ||||
|   - name: tfidf_ret | ||||
|     params: | ||||
|       document_store: InMemoryDocumentStore | ||||
|     type: TfidfRetriever | ||||
| pipelines: | ||||
|   - name: query | ||||
|     nodes: | ||||
|       - inputs: | ||||
|           - Query | ||||
|         name: tfidf_ret | ||||
							
								
								
									
										16
									
								
								rest_api/test/samples/test.in-memory-haystack-pipeline.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								rest_api/test/samples/test.in-memory-haystack-pipeline.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,16 @@ | ||||
| version: 'ignore' | ||||
| 
 | ||||
| components: | ||||
|   - name: InMemoryDocumentStore | ||||
|     params: {} | ||||
|     type: InMemoryDocumentStore | ||||
|   - name: tfidf_ret | ||||
|     params: | ||||
|       document_store: InMemoryDocumentStore | ||||
|     type: TfidfRetriever | ||||
| pipelines: | ||||
|   - name: query | ||||
|     nodes: | ||||
|       - inputs: | ||||
|           - Query | ||||
|         name: tfidf_ret | ||||
| @ -5,25 +5,49 @@ from pathlib import Path | ||||
| from textwrap import dedent | ||||
| from unittest import mock | ||||
| from unittest.mock import MagicMock, Mock | ||||
| import functools | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| 
 | ||||
| import pytest | ||||
| from fastapi.testclient import TestClient | ||||
| from haystack import Document, Answer | ||||
| from haystack import Document, Answer, Pipeline | ||||
| import haystack | ||||
| from haystack.nodes import BaseReader, BaseRetriever | ||||
| from haystack.document_stores import BaseDocumentStore | ||||
| from haystack.errors import PipelineSchemaError | ||||
| from haystack.schema import Label, FilterType | ||||
| from haystack.nodes.file_converter import BaseConverter | ||||
| 
 | ||||
| from rest_api.pipeline import _load_pipeline | ||||
| from rest_api.utils import get_app | ||||
| 
 | ||||
| 
 | ||||
| TEST_QUERY = "Who made the PDF specification?" | ||||
| 
 | ||||
| 
 | ||||
| def test_single_worker_warning_for_indexing_pipelines(caplog): | ||||
|     yaml_pipeline_path = Path(__file__).parent.resolve() / "samples" / "test.in-memory-haystack-pipeline.yml" | ||||
|     p, _ = _load_pipeline(yaml_pipeline_path, None) | ||||
| 
 | ||||
|     assert isinstance(p, Pipeline) | ||||
|     assert "used with 1 worker" in caplog.text | ||||
| 
 | ||||
| 
 | ||||
| def test_check_error_for_pipeline_not_found(): | ||||
| 
 | ||||
|     yaml_pipeline_path = Path(__file__).parent.resolve() / "samples" / "test.in-memory-haystack-pipeline.yml" | ||||
|     p, _ = _load_pipeline(yaml_pipeline_path, "ThisPipelineDoesntExist") | ||||
|     assert p is None | ||||
| 
 | ||||
| 
 | ||||
| def test_bad_yaml_pipeline_configuration_error(): | ||||
| 
 | ||||
|     yaml_pipeline_path = Path(__file__).parent.resolve() / "samples" / "test.bogus_pipeline.yml" | ||||
|     with pytest.raises(PipelineSchemaError) as excinfo: | ||||
|         _load_pipeline(yaml_pipeline_path, None) | ||||
|     assert "MyOwnDocumentStore" in str(excinfo.value) | ||||
| 
 | ||||
| 
 | ||||
| class MockReader(BaseReader): | ||||
|     outgoing_edges = 1 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Mayank Jobanputra
						Mayank Jobanputra