2022-04-12 16:41:05 +02:00
|
|
|
from typing import Optional, List
|
2022-02-09 18:27:12 +01:00
|
|
|
|
2021-04-07 17:53:32 +02:00
|
|
|
import json
|
2020-06-17 16:28:26 +02:00
|
|
|
import shutil
|
|
|
|
import uuid
|
|
|
|
from pathlib import Path
|
|
|
|
|
2022-04-12 16:41:05 +02:00
|
|
|
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, HTTPException, Depends
|
2021-09-10 11:41:16 +02:00
|
|
|
from pydantic import BaseModel
|
2022-04-12 16:41:05 +02:00
|
|
|
from haystack import Pipeline
|
|
|
|
from haystack.nodes import BaseConverter, PreProcessor
|
2020-06-17 16:28:26 +02:00
|
|
|
|
2022-04-12 16:41:05 +02:00
|
|
|
from rest_api.utils import get_app, get_pipelines
|
|
|
|
from rest_api.config import FILE_UPLOAD_PATH
|
2021-09-10 11:41:16 +02:00
|
|
|
from rest_api.controller.utils import as_form
|
2020-06-17 16:28:26 +02:00
|
|
|
|
2022-01-26 18:12:55 +01:00
|
|
|
|
2020-06-17 16:28:26 +02:00
|
|
|
# Router that collects the endpoints declared in this module.
router = APIRouter()

# Application instance as returned by rest_api.utils.get_app().
app: FastAPI = get_app()

# Looked up once at import time. The lookup falls back to None when no
# "indexing_pipeline" entry exists, so users of this name must handle the
# "not configured" case.
indexing_pipeline: Pipeline = get_pipelines().get("indexing_pipeline", None)
|
2020-06-24 15:05:30 +02:00
|
|
|
|
2020-06-17 16:28:26 +02:00
|
|
|
|
2021-09-10 11:41:16 +02:00
|
|
|
@as_form
class FileConverterParams(BaseModel):
    # Optional per-request settings for file converter nodes.
    #
    # Every field defaults to None. The upload endpoint in this module passes
    # `.dict()` of this model as the params of each BaseConverter node found in
    # the indexing pipeline.
    # NOTE(review): `as_form` is defined in rest_api.controller.utils; it
    # presumably provides the `as_form` attribute used with `Depends(...)` so
    # the model can be filled from form fields - confirm there.
    remove_numeric_tables: Optional[bool] = None
    valid_languages: Optional[List[str]] = None
|
2021-10-19 15:22:44 +02:00
|
|
|
|
|
|
|
|
|
|
|
@as_form
class PreprocessorParams(BaseModel):
    # Optional per-request settings for preprocessor nodes.
    #
    # Every field defaults to None. The upload endpoint in this module passes
    # `.dict()` of this model as the params of each PreProcessor node found in
    # the indexing pipeline.
    # NOTE(review): `as_form` is defined in rest_api.controller.utils; it
    # presumably provides the `as_form` attribute used with `Depends(...)` so
    # the model can be filled from form fields - confirm there.

    # Cleaning switches.
    clean_whitespace: Optional[bool] = None
    clean_empty_lines: Optional[bool] = None
    clean_header_footer: Optional[bool] = None

    # Splitting settings.
    split_by: Optional[str] = None
    split_length: Optional[int] = None
    split_overlap: Optional[int] = None
    split_respect_sentence_boundary: Optional[bool] = None
|
|
|
|
|
|
|
|
|
|
|
|
class Response(BaseModel):
    # Response schema carrying a single required string field.
    # NOTE(review): not referenced by the code visible in this module;
    # check for external users before changing it.
    file_id: str
|
|
|
|
|
|
|
|
|
2020-06-17 16:28:26 +02:00
|
|
|
@router.post("/file-upload")
def upload_file(
    files: List[UploadFile] = File(...),
    # JSON serialized string
    meta: Optional[str] = Form("null"),  # type: ignore
    fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form),  # type: ignore
    preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form),  # type: ignore
):
    """
    You can use this endpoint to upload a file for indexing
    (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
    """
    # Developer notes (kept out of the docstring so the endpoint description
    # stays unchanged):
    #
    # Parameters
    #   files: one or more uploaded files; each is copied into FILE_UPLOAD_PATH.
    #   meta: JSON-serialized string. It must decode to a dict or to a falsy
    #       value (e.g. null), which is treated as an empty dict. The decoded
    #       dict is attached to every file, extended with the file's "name".
    #   fileconverter_params: passed as params to every BaseConverter node.
    #   preprocessor_params: passed as params to every PreProcessor node.
    #
    # Raises
    #   HTTPException(501): no indexing pipeline is configured.
    #   HTTPException(500): `meta` decodes to a truthy non-dict value.
    if not indexing_pipeline:
        raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")

    file_paths: list = []
    file_metas: list = []

    meta_form = json.loads(meta) or {}  # type: ignore
    if not isinstance(meta_form, dict):
        raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}")

    for file in files:
        try:
            # A random hex prefix keeps uploads with the same filename apart.
            # NOTE(review): `file.filename` is presumably client-supplied and is
            # used in the path unsanitized - consider reducing it to a basename.
            file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
            with file_path.open("wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            file_paths.append(file_path)
            # Build a fresh dict per file. Appending the shared `meta_form`
            # would make every entry alias one object, so all files would end
            # up with the name of the last uploaded file.
            file_metas.append({**meta_form, "name": file.filename})
        finally:
            # Always release the upload's underlying file object, even if the
            # copy failed.
            file.file.close()

    # Find nodes names
    converters = indexing_pipeline.get_nodes_by_class(BaseConverter)
    preprocessors = indexing_pipeline.get_nodes_by_class(PreProcessor)

    # Per-node params, keyed by node name.
    params = {}
    for converter in converters:
        params[converter.name] = fileconverter_params.dict()
    for preprocessor in preprocessors:
        params[preprocessor.name] = preprocessor_params.dict()

    indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
|