mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-20 23:41:36 +00:00

* remove duplicate imports * fix ungrouped-imports * Fix wrong-import-position * Fix unused-import * pyproject.toml * Working on wrong-import-order * Solve wrong-import-order * fix Pool import * Move open_search_index_to_document_store and elasticsearch_index_to_document_store in elasticsearch.py * remove Converter from modeling * Fix mypy issues on adaptive_model.py * create es_converter.py * remove converter import * change import path in tests * Restructure REST API to not rely on global vars from search.apy and improve tests * Fix openapi generator * Move variable initialization * Change type of FilterRequest.filters Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
89 lines
2.9 KiB
Python
89 lines
2.9 KiB
Python
from typing import Optional, List
|
|
|
|
import json
|
|
import shutil
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, HTTPException, Depends
|
|
from pydantic import BaseModel
|
|
from haystack import Pipeline
|
|
from haystack.nodes import BaseConverter, PreProcessor
|
|
|
|
from rest_api.utils import get_app, get_pipelines
|
|
from rest_api.config import FILE_UPLOAD_PATH
|
|
from rest_api.controller.utils import as_form
|
|
|
|
|
|
# Router on which this module's endpoints are registered (see the
# `@router.post("/file-upload")` decorator on `upload_file`).
router = APIRouter()
# Application object returned by `rest_api.utils.get_app()`; it is not
# referenced again in the visible part of this module.
app: FastAPI = get_app()
# Entry stored under the "indexing_pipeline" key of `get_pipelines()`.
# The explicit `None` default means the value may be None when no such entry
# exists; `upload_file` checks for that and answers with HTTP 501.
indexing_pipeline: Optional[Pipeline] = get_pipelines().get("indexing_pipeline", None)
|
|
|
|
|
|
# Optional per-request settings that `upload_file` passes, via `.dict()`, as run
# parameters to every `BaseConverter` node of the indexing pipeline.
#
# `@as_form` must provide the `FileConverterParams.as_form` attribute that
# `upload_file` hands to `Depends(...)`; presumably it builds the model from
# form fields -- confirm in `rest_api.controller.utils`.
#
# Every field defaults to None. Note that `.dict()` still includes such fields,
# so the nodes receive explicit None values for anything the client omitted.
@as_form
class FileConverterParams(BaseModel):
    # NOTE(review): presumably mirrors the converter option of the same name
    # (drop table-like numeric content) -- confirm against the converter nodes.
    remove_numeric_tables: Optional[bool] = None
    # NOTE(review): presumably a list of language codes the converter accepts
    # -- confirm against the converter nodes.
    valid_languages: Optional[List[str]] = None
|
|
|
|
|
|
# Optional per-request settings that `upload_file` passes, via `.dict()`, as run
# parameters to every `PreProcessor` node of the indexing pipeline.
#
# `@as_form` must provide the `PreprocessorParams.as_form` attribute that
# `upload_file` hands to `Depends(...)`; presumably it builds the model from
# form fields -- confirm in `rest_api.controller.utils`.
#
# Every field defaults to None. The field names presumably mirror the
# `PreProcessor` options of the same name -- confirm against `haystack.nodes`.
@as_form
class PreprocessorParams(BaseModel):
    # Cleaning switches.
    clean_whitespace: Optional[bool] = None
    clean_empty_lines: Optional[bool] = None
    clean_header_footer: Optional[bool] = None
    # Splitting options: the unit to split by, the length of each split and the
    # overlap between consecutive splits (units are defined by the PreProcessor
    # node, not by this module).
    split_by: Optional[str] = None
    split_length: Optional[int] = None
    split_overlap: Optional[int] = None
    split_respect_sentence_boundary: Optional[bool] = None
|
|
|
|
|
|
# Model with a single required string field, `file_id`.
# NOTE(review): nothing in the visible part of this module references it
# (`upload_file` declares no response model and returns nothing) -- check for
# users elsewhere before changing or removing it.
class Response(BaseModel):
    file_id: str
|
|
|
|
|
|
@router.post("/file-upload")
def upload_file(
    files: List[UploadFile] = File(...),
    # JSON serialized string
    meta: Optional[str] = Form("null"),  # type: ignore
    fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form),  # type: ignore
    preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form),  # type: ignore
):
    """
    You can use this endpoint to upload a file for indexing
    (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
    """
    # Maintainer notes (kept out of the docstring on purpose: FastAPI publishes
    # the docstring as the endpoint description).
    #
    # Parameters:
    #   files: the uploaded files; each one is stored under FILE_UPLOAD_PATH.
    #   meta: JSON-serialised object (or "null") attached to every file; its
    #       "name" key is always set to the file's original filename.
    #   fileconverter_params: run parameters for every BaseConverter node.
    #   preprocessor_params: run parameters for every PreProcessor node.
    #
    # Raises:
    #   HTTPException 501 when no indexing pipeline is configured.
    #   HTTPException 500 when `meta` is not valid JSON or does not decode to a
    #       dict/None (500 is kept for compatibility with existing clients).
    if not indexing_pipeline:
        raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")

    # A missing or empty field is treated like the default "null": no extra metadata.
    try:
        meta_form = json.loads(meta or "null") or {}
    except json.JSONDecodeError as error:
        # Report malformed JSON explicitly instead of letting it escape as an unhandled exception.
        raise HTTPException(
            status_code=500, detail=f"The meta field must be a valid JSON string: {error}"
        ) from error
    if not isinstance(meta_form, dict):
        raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}")

    file_paths: List[Path] = []
    file_metas: List[dict] = []

    for file in files:
        try:
            # The filename is client-controlled: keep only its final path component
            # so the stored file is always a direct child of FILE_UPLOAD_PATH.
            safe_name = Path(file.filename or "").name
            # The random prefix keeps uploads with the same name from overwriting each other.
            file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
            with file_path.open("wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            file_paths.append(file_path)
            # Each file gets its own dict: appending one shared dict would make
            # every entry report the name of the last uploaded file.
            file_metas.append({**meta_form, "name": file.filename})
        finally:
            file.file.close()

    # Address the per-request parameters to the matching nodes by name.
    params = {}
    for converter in indexing_pipeline.get_nodes_by_class(BaseConverter):
        params[converter.name] = fileconverter_params.dict()
    for preprocessor in indexing_pipeline.get_nodes_by_class(PreProcessor):
        params[preprocessor.name] = preprocessor_params.dict()

    indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
|