| 
									
										
										
										
											2022-02-09 18:27:12 +01:00
										 |  |  | from typing import Optional, List, Union | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-07 17:53:32 +02:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | import logging | 
					
						
							| 
									
										
										
										
											2020-06-24 15:05:30 +02:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | import shutil | 
					
						
							|  |  |  | import uuid | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  | from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends | 
					
						
							|  |  |  | from pydantic import BaseModel | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  | from haystack.pipelines.base import Pipeline | 
					
						
							| 
									
										
										
										
											2021-04-07 17:53:32 +02:00
										 |  |  | from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  | from rest_api.controller.utils import as_form | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-26 18:12:55 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router on which this module registers its endpoints (see the POST /file-upload route below).
router = APIRouter()
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-07 17:53:32 +02:00
										 |  |  | try: | 
					
						
							| 
									
										
										
										
											2022-01-28 17:32:56 +01:00
										 |  |  |     pipeline_config = Pipeline._read_pipeline_config_from_yaml(Path(PIPELINE_YAML_PATH)) | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |     pipeline_definition = Pipeline._get_pipeline_definition( | 
					
						
							|  |  |  |         pipeline_config=pipeline_config, pipeline_name=INDEXING_PIPELINE_NAME | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2022-01-28 17:32:56 +01:00
										 |  |  |     definitions = Pipeline._get_component_definitions( | 
					
						
							|  |  |  |         pipeline_config=pipeline_config, overwrite_with_env_variables=True | 
					
						
							| 
									
										
										
										
											2021-09-10 11:53:32 +02:00
										 |  |  |     ) | 
					
						
							|  |  |  |     # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |     # end up with different indices. The same applies for InMemoryDocumentStore. The check below prevents creation of | 
					
						
							|  |  |  |     # Indexing Pipelines with FAISSDocumentStore or InMemoryDocumentStore. | 
					
						
							| 
									
										
										
										
											2021-11-01 10:39:13 +01:00
										 |  |  |     is_faiss_or_inmemory_present = False | 
					
						
							| 
									
										
										
										
											2022-01-28 17:32:56 +01:00
										 |  |  |     for node in pipeline_definition["nodes"]: | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |         if ( | 
					
						
							|  |  |  |             definitions[node["name"]]["type"] == "FAISSDocumentStore" | 
					
						
							|  |  |  |             or definitions[node["name"]]["type"] == "InMemoryDocumentStore" | 
					
						
							|  |  |  |         ): | 
					
						
							| 
									
										
										
										
											2021-11-01 10:39:13 +01:00
										 |  |  |             is_faiss_or_inmemory_present = True | 
					
						
							| 
									
										
										
										
											2021-09-10 11:53:32 +02:00
										 |  |  |             break | 
					
						
							| 
									
										
										
										
											2021-11-01 10:39:13 +01:00
										 |  |  |     if is_faiss_or_inmemory_present: | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |         logger.warning( | 
					
						
							|  |  |  |             "Indexing Pipeline with FAISSDocumentStore or InMemoryDocumentStore is not supported with the REST APIs." | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2021-09-10 11:53:32 +02:00
										 |  |  |         INDEXING_PIPELINE = None | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) | 
					
						
							| 
									
										
										
										
											2021-04-07 17:53:32 +02:00
										 |  |  | except KeyError: | 
					
						
							|  |  |  |     INDEXING_PIPELINE = None | 
					
						
							| 
									
										
										
										
											2021-09-10 11:53:32 +02:00
										 |  |  |     logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.") | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-09 18:27:12 +01:00
# Create the directory for uploaded files at import time; the upload endpoint writes each
# received file into it. exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
					
						
							| 
									
										
										
										
											2020-06-24 15:05:30 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
# Optional per-request settings for the file converter nodes. The upload endpoint passes
# these values, via .dict(), under both the "TextFileConverter" and "PDFFileConverter" keys
# of the pipeline's `params`.
# NOTE(review): @as_form comes from rest_api.controller.utils and presumably adds the
# `as_form` classmethod used with Depends() in the endpoint signature -- confirm there.
@as_form
class FileConverterParams(BaseModel):
    # Both fields default to None.
    remove_numeric_tables: Optional[bool] = None
    valid_languages: Optional[List[str]] = None
					
						
							| 
									
										
										
										
											2021-10-19 15:22:44 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Optional per-request settings for the preprocessing node. The upload endpoint passes
# these values, via .dict(), under the "Preprocessor" key of the pipeline's `params`.
# NOTE(review): @as_form comes from rest_api.controller.utils and presumably adds the
# `as_form` classmethod used with Depends() in the endpoint signature -- confirm there.
@as_form
class PreprocessorParams(BaseModel):
    # Cleaning switches; all default to None.
    clean_whitespace: Optional[bool] = None
    clean_empty_lines: Optional[bool] = None
    clean_header_footer: Optional[bool] = None
    # Splitting options; all default to None.
    split_by: Optional[str] = None
    split_length: Optional[int] = None
    split_overlap: Optional[int] = None
    split_respect_sentence_boundary: Optional[bool] = None
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Model with a single required string field, `file_id`.
# NOTE(review): no endpoint visible in this part of the file references this model.
class Response(BaseModel):
    file_id: str
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-17 16:28:26 +02:00
										 |  |  | @router.post("/file-upload") | 
					
						
							| 
									
										
										
										
											2021-11-11 09:40:58 +01:00
										 |  |  | def upload_file( | 
					
						
							| 
									
										
										
										
											2021-06-30 17:13:46 +05:00
										 |  |  |     files: List[UploadFile] = File(...), | 
					
						
							| 
									
										
										
										
											2022-02-09 18:27:12 +01:00
										 |  |  |     # JSON serialized string | 
					
						
							|  |  |  |     meta: Optional[str] = Form("null"),  # type: ignore | 
					
						
							|  |  |  |     fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form),  # type: ignore | 
					
						
							|  |  |  |     preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form),  # type: ignore | 
					
						
							| 
									
										
										
										
											2020-07-06 17:35:47 +02:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2022-01-27 13:06:01 +01:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |     You can use this endpoint to upload a file for indexing | 
					
						
							| 
									
										
										
										
											2022-02-16 13:06:05 +01:00
										 |  |  |     (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store). | 
					
						
							| 
									
										
										
										
											2022-01-27 13:06:01 +01:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-04-07 17:53:32 +02:00
										 |  |  |     if not INDEXING_PIPELINE: | 
					
						
							|  |  |  |         raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.") | 
					
						
							| 
									
										
										
										
											2021-04-30 14:16:30 +05:30
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-30 17:13:46 +05:00
										 |  |  |     file_paths: list = [] | 
					
						
							|  |  |  |     file_metas: list = [] | 
					
						
							| 
									
										
										
										
											2022-02-16 10:32:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     meta_form = json.loads(meta) or {}  # type: ignore | 
					
						
							|  |  |  |     if not isinstance(meta_form, dict): | 
					
						
							|  |  |  |         raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}") | 
					
						
							| 
									
										
										
										
											2021-04-30 14:16:30 +05:30
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-30 17:13:46 +05:00
										 |  |  |     for file in files: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}" | 
					
						
							|  |  |  |             with file_path.open("wb") as buffer: | 
					
						
							|  |  |  |                 shutil.copyfileobj(file.file, buffer) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             file_paths.append(file_path) | 
					
						
							| 
									
										
										
										
											2022-02-09 18:27:12 +01:00
										 |  |  |             meta_form["name"] = file.filename | 
					
						
							|  |  |  |             file_metas.append(meta_form) | 
					
						
							| 
									
										
										
										
											2021-06-30 17:13:46 +05:00
										 |  |  |         finally: | 
					
						
							|  |  |  |             file.file.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     INDEXING_PIPELINE.run( | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |         file_paths=file_paths, | 
					
						
							|  |  |  |         meta=file_metas, | 
					
						
							|  |  |  |         params={ | 
					
						
							|  |  |  |             "TextFileConverter": fileconverter_params.dict(), | 
					
						
							|  |  |  |             "PDFFileConverter": fileconverter_params.dict(), | 
					
						
							|  |  |  |             "Preprocessor": preprocessor_params.dict(), | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2021-06-30 17:13:46 +05:00
										 |  |  |     ) |