Add support for Dense Retrievers in REST API Indexing Pipeline (#1430)

2025-12-28 07:29:06 +00:00 · 2021-09-10 11:53:32 +02:00 · 2021-09-10 11:53:32 +02:00 · 1f859694f1
commit 1f859694f1
parent 9dd7c74f4f
3 changed files with 76 additions and 10 deletions
--- a/rest_api/controller/file_upload.py
+++ b/rest_api/controller/file_upload.py
@@ -17,10 +17,24 @@ logger = logging.getLogger(__name__)
 router = APIRouter()

 try:
-    INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
+    _, pipeline_config, definitions = Pipeline._read_yaml(
+        path=Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME, overwrite_with_env_variables=True
+    )
+    # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
+    # end up with different indices. The check below prevents creation of Indexing Pipelines with FAISSDocumentStore.
+    is_faiss_present = False
+    for node in pipeline_config["nodes"]:
+        if definitions[node["name"]]["type"] == "FAISSDocumentStore":
+            is_faiss_present = True
+            break
+    if is_faiss_present:
+        logger.warning("Indexing Pipeline with FAISSDocumentStore is not supported with the REST APIs.")
+        INDEXING_PIPELINE = None
+    else:
+        INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
 except KeyError:
    INDEXING_PIPELINE = None
-    logger.info("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
+    logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")


 os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)  # create directory for uploading files
--- a/rest_api/pipeline/pipelines.yaml
+++ b/rest_api/pipeline/pipelines.yaml
@@ -1,14 +1,14 @@
-version: '0.7'
+version: '0.9'

 components:    # define all the building-blocks for Pipeline
-  - name: ElasticsearchDocumentStore
+  - name: DocumentStore
    type: ElasticsearchDocumentStore
    params:
      host: localhost
-  - name: ESRetriever
+  - name: Retriever
    type: ElasticsearchRetriever
    params:
-      document_store: ElasticsearchDocumentStore    # params can reference other components defined in the YAML
+      document_store: DocumentStore    # params can reference other components defined in the YAML
      top_k: 5
  - name: Reader       # custom-name for the component; helpful for visualization & debugging
    type: FARMReader    # Haystack Class name for the component
@@ -30,11 +30,10 @@ pipelines:
  - name: query    # a sample extractive-qa Pipeline
    type: Query
    nodes:
-      - name: ESRetriever
+      - name: Retriever
        inputs: [Query]
      - name: Reader
-        inputs: [ESRetriever]
-
+        inputs: [Retriever]
  - name: indexing
    type: Indexing
    nodes:
@@ -46,5 +45,7 @@ pipelines:
        inputs: [FileTypeClassifier.output_2]
      - name: Preprocessor
        inputs: [PDFFileConverter, TextFileConverter]
-      - name: ElasticsearchDocumentStore
+      - name: Retriever
        inputs: [Preprocessor]
+      - name: DocumentStore
+        inputs: [Retriever]
--- a/rest_api/pipeline/pipelines_dpr.yaml
+++ b/rest_api/pipeline/pipelines_dpr.yaml
@@ -0,0 +1,51 @@
+version: '0.9'
+
+components:    # define all the building-blocks for Pipeline
+  - name: DocumentStore
+    type: ElasticsearchDocumentStore  # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to a large number of documents
+    params:
+      host: localhost
+  - name: Retriever
+    type: DensePassageRetriever
+    params:
+      document_store: DocumentStore    # params can reference other components defined in the YAML
+      top_k: 5
+  - name: Reader       # custom-name for the component; helpful for visualization & debugging
+    type: FARMReader    # Haystack Class name for the component
+    params:
+      model_name_or_path: deepset/roberta-base-squad2
+  - name: TextFileConverter
+    type: TextConverter
+  - name: PDFFileConverter
+    type: PDFToTextConverter
+  - name: Preprocessor
+    type: PreProcessor
+    params:
+      split_by: word
+      split_length: 1000
+  - name: FileTypeClassifier
+    type: FileTypeClassifier
+
+pipelines:
+  - name: query    # a sample extractive-qa Pipeline
+    type: Query
+    nodes:
+      - name: Retriever
+        inputs: [Query]
+      - name: Reader
+        inputs: [Retriever]
+  - name: indexing
+    type: Indexing
+    nodes:
+      - name: FileTypeClassifier
+        inputs: [File]
+      - name: TextFileConverter
+        inputs: [FileTypeClassifier.output_1]
+      - name: PDFFileConverter
+        inputs: [FileTypeClassifier.output_2]
+      - name: Preprocessor
+        inputs: [PDFFileConverter, TextFileConverter]
+      - name: Retriever
+        inputs: [Preprocessor]
+      - name: DocumentStore
+        inputs: [Retriever]