haystack/rest_api/controller/file_upload.py

100 lines
3.5 KiB
Python
Raw Normal View History

import json
import logging
import os
import shutil
import uuid
from pathlib import Path
from typing import Optional, List
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
from pydantic import BaseModel
Refactoring of the `haystack` package (#1624) * Files moved, imports all broken * Fix most imports and docstrings into * Fix the paths to the modules in the API docs * Add latest docstring and tutorial changes * Add a few pipelines that were lost in the inports * Fix a bunch of mypy warnings * Add latest docstring and tutorial changes * Create a file_classifier module * Add docs for file_classifier * Fixed most circular imports, now the REST API can start * Add latest docstring and tutorial changes * Tackling more mypy issues * Reintroduce from FARM and fix last mypy issues hopefully * Re-enable old-style imports * Fix some more import from the top-level package in an attempt to sort out circular imports * Fix some imports in tests to new-style to prevent failed class equalities from breaking tests * Change document_store into document_stores * Update imports in tutorials * Add latest docstring and tutorial changes * Probably fixes summarizer tests * Improve the old-style import allowing module imports (should work) * Try to fix the docs * Remove dedicated KnowledgeGraph page from autodocs * Remove dedicated GraphRetriever page from autodocs * Fix generate_docstrings.sh with an updated list of yaml files to look for * Fix some more modules in the docs * Fix the document stores docs too * Fix a small issue on Tutorial14 * Add latest docstring and tutorial changes * Add deprecation warning to old-style imports * Remove stray folder and import Dict into dense.py * Change import path for MLFlowLogger * Add old loggers path to the import path aliases * Fix debug output of convert_ipynb.py * Fix circular import on BaseRetriever * Missed one merge block * re-run tutorial 5 * Fix imports in tutorial 5 * Re-enable squad_to_dpr CLI from the root package and move get_batches_from_generator into document_stores.base * Add latest docstring and tutorial changes * Fix typo in utils __init__ * Fix a few more imports * Fix benchmarks too * New-style imports in test_knowledge_graph * Rollback setup.py * Rollback squad_to_dpr too Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2021-10-25 15:50:23 +02:00
from haystack.pipelines.base import Pipeline
from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME
from rest_api.controller.utils import as_form
logger = logging.getLogger(__name__)
router = APIRouter()
try:
_, pipeline_config, definitions = Pipeline._read_yaml(
path=Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME, overwrite_with_env_variables=True
)
# Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
# end up with different indices. The same applies for InMemoryDocumentStore. The check below prevents creation of
# Indexing Pipelines with FAISSDocumentStore or InMemoryDocumentStore.
is_faiss_or_inmemory_present = False
for node in pipeline_config["nodes"]:
if definitions[node["name"]]["type"] == "FAISSDocumentStore" or definitions[node["name"]]["type"] == "InMemoryDocumentStore":
is_faiss_or_inmemory_present = True
break
if is_faiss_or_inmemory_present:
logger.warning("Indexing Pipeline with FAISSDocumentStore or InMemoryDocumentStore is not supported with the REST APIs.")
INDEXING_PIPELINE = None
else:
INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
except KeyError:
INDEXING_PIPELINE = None
logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True) # create directory for uploading files
@as_form
class FileConverterParams(BaseModel):
remove_numeric_tables: Optional[bool] = None
valid_languages: Optional[List[str]] = None
@as_form
class PreprocessorParams(BaseModel):
clean_whitespace: Optional[bool] = None
clean_empty_lines: Optional[bool] = None
clean_header_footer: Optional[bool] = None
split_by: Optional[str] = None
split_length: Optional[int] = None
split_overlap: Optional[int] = None
split_respect_sentence_boundary: Optional[bool] = None
class Response(BaseModel):
file_id: str
@router.post("/file-upload")
def file_upload(
files: List[UploadFile] = File(...),
meta: Optional[str] = Form("null"), # JSON serialized string
fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form),
preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form)
):
if not INDEXING_PIPELINE:
raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")
file_paths: list = []
file_metas: list = []
meta = json.loads(meta) or {}
for file in files:
try:
file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
with file_path.open("wb") as buffer:
shutil.copyfileobj(file.file, buffer)
file_paths.append(file_path)
meta["name"] = file.filename
file_metas.append(meta)
finally:
file.file.close()
INDEXING_PIPELINE.run(
file_paths=file_paths,
meta=file_metas,
params={
"TextFileConverter": fileconverter_params.dict(),
"PDFFileConverter": fileconverter_params.dict(),
"Preprocessor": preprocessor_params.dict()
},
)