mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 09:49:48 +00:00 
			
		
		
		
	 96a538b182
			
		
	
	
		96a538b182
		
			
		
	
	
	
	
		
			
			* remove duplicate imports * fix ungrouped-imports * Fix wrong-import-position * Fix unused-import * pyproject.toml * Working on wrong-import-order * Solve wrong-import-order * fix Pool import * Move open_search_index_to_document_store and elasticsearch_index_to_document_store in elasticsearch.py * remove Converter from modeling * Fix mypy issues on adaptive_model.py * create es_converter.py * remove converter import * change import path in tests * Restructure REST API to not rely on global vars from search.apy and improve tests * Fix openapi generator * Move variable initialization * Change type of FilterRequest.filters Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
		
			
				
	
	
		
			89 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			89 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Optional, List
 | |
| 
 | |
| import json
 | |
| import shutil
 | |
| import uuid
 | |
| from pathlib import Path
 | |
| 
 | |
| from fastapi import FastAPI, APIRouter, UploadFile, File, Form, HTTPException, Depends
 | |
| from pydantic import BaseModel
 | |
| from haystack import Pipeline
 | |
| from haystack.nodes import BaseConverter, PreProcessor
 | |
| 
 | |
| from rest_api.utils import get_app, get_pipelines
 | |
| from rest_api.config import FILE_UPLOAD_PATH
 | |
| from rest_api.controller.utils import as_form
 | |
| 
 | |
| 
 | |
# Router on which this module's endpoints are registered.
router = APIRouter()
# Application object as returned by the shared `get_app` helper.
app: FastAPI = get_app()
# The "indexing_pipeline" entry of the loaded pipelines. The lookup falls back
# to None when that entry is missing, so the annotation has to be Optional;
# `upload_file` checks for this case and answers with HTTP 501.
indexing_pipeline: Optional[Pipeline] = get_pipelines().get("indexing_pipeline", None)
 | |
| 
 | |
| 
 | |
# Optional per-request settings for the file-converter nodes of the indexing
# pipeline. `upload_file` receives an instance through
# `Depends(FileConverterParams.as_form)` and forwards its `.dict()` as the run
# parameters of every `BaseConverter` node.
#
# Both fields default to None when the client does not send them.
# NOTE(review): the exact meaning of each field is defined by the converter
# nodes that consume them, not by this module - confirm against those nodes.
#
# Documented with comments rather than a docstring so the model's generated
# schema stays unchanged.
@as_form
class FileConverterParams(BaseModel):
    remove_numeric_tables: Optional[bool] = None
    valid_languages: Optional[List[str]] = None
 | |
| 
 | |
| 
 | |
# Optional per-request settings for the `PreProcessor` nodes of the indexing
# pipeline. `upload_file` receives an instance through
# `Depends(PreprocessorParams.as_form)` and forwards its `.dict()` as the run
# parameters of every `PreProcessor` node.
#
# Every field defaults to None when the client does not send it.
# NOTE(review): the semantics of the cleaning and splitting options are
# defined by `PreProcessor` itself - confirm against that class.
#
# Documented with comments rather than a docstring so the model's generated
# schema stays unchanged.
@as_form
class PreprocessorParams(BaseModel):
    # Cleaning switches.
    clean_whitespace: Optional[bool] = None
    clean_empty_lines: Optional[bool] = None
    clean_header_footer: Optional[bool] = None
    # Splitting options.
    split_by: Optional[str] = None
    split_length: Optional[int] = None
    split_overlap: Optional[int] = None
    split_respect_sentence_boundary: Optional[bool] = None
 | |
| 
 | |
| 
 | |
# Response model holding a single string field, `file_id`.
# NOTE(review): nothing in the visible part of this module references this
# model - check for users elsewhere before changing or removing it.
class Response(BaseModel):
    file_id: str
 | |
| 
 | |
| 
 | |
# Endpoint: store the uploaded files on disk and run the indexing pipeline on them.
#
# Parameters (all sent as multipart form data):
#   files                 one or more uploaded files.
#   meta                  JSON-serialized string; must decode to a dict or to
#                         null. The decoded dict is attached to every file,
#                         with "name" set to that file's original filename.
#   fileconverter_params  forwarded to every `BaseConverter` node.
#   preprocessor_params   forwarded to every `PreProcessor` node.
#
# Raises:
#   HTTPException(501)    no indexing pipeline is configured.
#   HTTPException(500)    `meta` decodes to something that is not a dict/None.
#   Malformed JSON in `meta` makes `json.loads` raise; that is not caught here.
#
# The docstring below is kept verbatim because FastAPI publishes it as the
# endpoint description.
@router.post("/file-upload")
def upload_file(
    files: List[UploadFile] = File(...),
    # JSON serialized string
    meta: Optional[str] = Form("null"),  # type: ignore
    fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form),  # type: ignore
    preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form),  # type: ignore
):
    """
    You can use this endpoint to upload a file for indexing
    (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
    """
    if not indexing_pipeline:
        raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")

    # A JSON `null` (the form default) decodes to None and is treated as "no metadata".
    meta_form = json.loads(meta) or {}  # type: ignore
    if not isinstance(meta_form, dict):
        raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}")

    file_paths: list = []
    file_metas: list = []

    for file in files:
        try:
            # The filename comes from the client. Only its last path component
            # is used on disk, so a name containing separators cannot point
            # outside (or into a missing subdirectory of) the upload folder.
            stored_name = Path(str(file.filename)).name
            file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{stored_name}"
            with file_path.open("wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            file_paths.append(file_path)
            # Build a fresh dict per file. Appending the one shared `meta_form`
            # and mutating it would leave every entry aliased to the same
            # object, carrying the name of the last uploaded file.
            file_metas.append({**meta_form, "name": file.filename})
        finally:
            file.file.close()

    # Address the request parameters to the pipeline nodes by node name:
    # converters get the file-converter settings, preprocessors theirs.
    params = {}
    for converter in indexing_pipeline.get_nodes_by_class(BaseConverter):
        params[converter.name] = fileconverter_params.dict()
    for preprocessor in indexing_pipeline.get_nodes_by_class(PreProcessor):
        params[preprocessor.name] = preprocessor_params.dict()

    indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
 |