diff --git a/rest_api/controller/file_upload.py b/rest_api/controller/file_upload.py
index 6944b4751..56735cba2 100644
--- a/rest_api/controller/file_upload.py
+++ b/rest_api/controller/file_upload.py
@@ -17,10 +17,24 @@ logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
 try:
-    INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
+    _, pipeline_config, definitions = Pipeline._read_yaml(
+        path=Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME, overwrite_with_env_variables=True
+    )
+    # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
+    # end up with different indices. The check below prevents creation of Indexing Pipelines with FAISSDocumentStore.
+    is_faiss_present = False
+    for node in pipeline_config["nodes"]:
+        if definitions[node["name"]]["type"] == "FAISSDocumentStore":
+            is_faiss_present = True
+            break
+    if is_faiss_present:
+        logger.warning("Indexing Pipeline with FAISSDocumentStore is not supported with the REST APIs.")
+        INDEXING_PIPELINE = None
+    else:
+        INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
 except KeyError:
     INDEXING_PIPELINE = None
-    logger.info("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
+    logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
 
 os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)  # create directory for uploading files
diff --git a/rest_api/pipeline/pipelines.yaml b/rest_api/pipeline/pipelines.yaml
index 1a22e1bc8..2e820e3d8 100644
--- a/rest_api/pipeline/pipelines.yaml
+++ b/rest_api/pipeline/pipelines.yaml
@@ -1,14 +1,14 @@
-version: '0.7'
+version: '0.9'
 
 components: # define all the building-blocks for Pipeline
-  - name: ElasticsearchDocumentStore
+  - name: DocumentStore
     type: ElasticsearchDocumentStore
     params:
       host: localhost
-  - name: ESRetriever
+  - name: Retriever
     type: ElasticsearchRetriever
     params:
-      document_store: ElasticsearchDocumentStore # params can reference other components defined in the YAML
+      document_store: DocumentStore # params can reference other components defined in the YAML
       top_k: 5
   - name: Reader # custom-name for the component; helpful for visualization & debugging
     type: FARMReader # Haystack Class name for the component
@@ -30,11 +30,10 @@ pipelines:
   - name: query # a sample extractive-qa Pipeline
     type: Query
     nodes:
-      - name: ESRetriever
+      - name: Retriever
         inputs: [Query]
       - name: Reader
-        inputs: [ESRetriever]
-
+        inputs: [Retriever]
   - name: indexing
     type: Indexing
     nodes:
@@ -46,5 +45,7 @@
         inputs: [FileTypeClassifier.output_2]
       - name: Preprocessor
         inputs: [PDFFileConverter, TextFileConverter]
-      - name: ElasticsearchDocumentStore
+      - name: Retriever
         inputs: [Preprocessor]
+      - name: DocumentStore
+        inputs: [Retriever]
diff --git a/rest_api/pipeline/pipelines_dpr.yaml b/rest_api/pipeline/pipelines_dpr.yaml
new file mode 100644
index 000000000..23ec38da3
--- /dev/null
+++ b/rest_api/pipeline/pipelines_dpr.yaml
@@ -0,0 +1,51 @@
+version: '0.9'
+
+components: # define all the building-blocks for Pipeline
+  - name: DocumentStore
+    type: ElasticsearchDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents
+    params:
+      host: localhost
+  - name: Retriever
+    type: DensePassageRetriever
+    params:
+      document_store: DocumentStore # params can reference other components defined in the YAML
+      top_k: 5
+  - name: Reader # custom-name for the component; helpful for visualization & debugging
+    type: FARMReader # Haystack Class name for the component
+    params:
+      model_name_or_path: deepset/roberta-base-squad2
+  - name: TextFileConverter
+    type: TextConverter
+  - name: PDFFileConverter
+    type: PDFToTextConverter
+  - name: Preprocessor
+    type: PreProcessor
+    params:
+      split_by: word
+      split_length: 1000
+  - name: FileTypeClassifier
+    type: FileTypeClassifier
+
+pipelines:
+  - name: query # a sample extractive-qa Pipeline
+    type: Query
+    nodes:
+      - name: Retriever
+        inputs: [Query]
+      - name: Reader
+        inputs: [Retriever]
+  - name: indexing
+    type: Indexing
+    nodes:
+      - name: FileTypeClassifier
+        inputs: [File]
+      - name: TextFileConverter
+        inputs: [FileTypeClassifier.output_1]
+      - name: PDFFileConverter
+        inputs: [FileTypeClassifier.output_2]
+      - name: Preprocessor
+        inputs: [PDFFileConverter, TextFileConverter]
+      - name: Retriever
+        inputs: [Preprocessor]
+      - name: DocumentStore
+        inputs: [Retriever]
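
For reference, the FAISS guard added in `file_upload.py` can be reproduced as a standalone check over a pipeline YAML. The sketch below is a minimal illustration using plain PyYAML rather than Haystack's internal `Pipeline._read_yaml`; the function name `indexing_pipeline_uses_faiss` and its default `pipeline_name` are illustrative assumptions, not part of the patch.

```python
# Minimal sketch (not part of the patch): detect a FAISSDocumentStore in an
# Indexing Pipeline definition, mirroring the guard added above. Uses plain
# PyYAML instead of Haystack's internal Pipeline._read_yaml helper.
from pathlib import Path

import yaml


def indexing_pipeline_uses_faiss(yaml_path: Path, pipeline_name: str = "indexing") -> bool:
    config = yaml.safe_load(yaml_path.read_text())
    # Map each component name to its definition, like the `definitions` dict in the patch.
    definitions = {c["name"]: c for c in config["components"]}
    pipeline = next(p for p in config["pipelines"] if p["name"] == pipeline_name)
    # Flag the pipeline as soon as any node resolves to a FAISSDocumentStore.
    return any(definitions[node["name"]]["type"] == "FAISSDocumentStore" for node in pipeline["nodes"])


if __name__ == "__main__":
    # Both sample configs in this patch use ElasticsearchDocumentStore, so this prints False.
    print(indexing_pipeline_uses_faiss(Path("rest_api/pipeline/pipelines.yaml")))
```

The guard exists because each `FAISSDocumentStore` instance owns its own in-memory FAISS index: an Indexing Pipeline and a Query Pipeline loaded separately from the same YAML would write to and read from different indices, so the REST API now refuses to build the Indexing Pipeline instead of silently indexing into a store the Query Pipeline never sees. To try the new DPR configuration, the REST API would be pointed at `rest_api/pipeline/pipelines_dpr.yaml` via the `PIPELINE_YAML_PATH` setting the patch imports, assuming it is environment-overridable as usual for the `rest_api` config.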