haystack/rest_api/controller/file_upload.py

126 lines
4.4 KiB
Python
Raw Normal View History

from typing import Optional, List, Union
import json
import logging
import os
import shutil
import uuid
from pathlib import Path
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
from pydantic import BaseModel
Refactoring of the `haystack` package (#1624) * Files moved, imports all broken * Fix most imports and docstrings into * Fix the paths to the modules in the API docs * Add latest docstring and tutorial changes * Add a few pipelines that were lost in the inports * Fix a bunch of mypy warnings * Add latest docstring and tutorial changes * Create a file_classifier module * Add docs for file_classifier * Fixed most circular imports, now the REST API can start * Add latest docstring and tutorial changes * Tackling more mypy issues * Reintroduce from FARM and fix last mypy issues hopefully * Re-enable old-style imports * Fix some more import from the top-level package in an attempt to sort out circular imports * Fix some imports in tests to new-style to prevent failed class equalities from breaking tests * Change document_store into document_stores * Update imports in tutorials * Add latest docstring and tutorial changes * Probably fixes summarizer tests * Improve the old-style import allowing module imports (should work) * Try to fix the docs * Remove dedicated KnowledgeGraph page from autodocs * Remove dedicated GraphRetriever page from autodocs * Fix generate_docstrings.sh with an updated list of yaml files to look for * Fix some more modules in the docs * Fix the document stores docs too * Fix a small issue on Tutorial14 * Add latest docstring and tutorial changes * Add deprecation warning to old-style imports * Remove stray folder and import Dict into dense.py * Change import path for MLFlowLogger * Add old loggers path to the import path aliases * Fix debug output of convert_ipynb.py * Fix circular import on BaseRetriever * Missed one merge block * re-run tutorial 5 * Fix imports in tutorial 5 * Re-enable squad_to_dpr CLI from the root package and move get_batches_from_generator into document_stores.base * Add latest docstring and tutorial changes * Fix typo in utils __init__ * Fix a few more imports * Fix benchmarks too * New-style imports in test_knowledge_graph * Rollback setup.py * Rollback squad_to_dpr too Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2021-10-25 15:50:23 +02:00
from haystack.pipelines.base import Pipeline
Pipeline's YAML: syntax validation (#2226) * Add BasePipeline.validate_config, BasePipeline.validate_yaml, and some new custom exception classes * Make error composition work properly * Clarify typing * Help mypy a bit more * Update Documentation & Code Style * Enable autogenerated docs for Milvus1 and 2 separately * Revert "Enable autogenerated docs for Milvus1 and 2 separately" This reverts commit 282be4a78a6e95862a9b4c924fc3dea5ca71e28d. * Update Documentation & Code Style * Re-enable 'additionalProperties: False' * Add pipeline.type to JSON Schema, was somehow forgotten * Disable additionalProperties on the pipeline properties too * Fix json-schemas for 1.1.0 and 1.2.0 (should not do it again in the future) * Cal super in PipelineValidationError * Improve _read_pipeline_config_from_yaml's error handling * Fix generate_json_schema.py to include document stores * Fix json schemas (retro-fix 1.1.0 again) * Improve custom errors printing, add link to docs * Add function in BaseComponent to list its subclasses in a module * Make some document stores base classes abstract * Add marker 'integration' in pytest flags * Slighly improve validation of pipelines at load * Adding tests for YAML loading and validation * Make custom_query Optional for validation issues * Fix bug in _read_pipeline_config_from_yaml * Improve error handling in BasePipeline and Pipeline and add DAG check * Move json schema generation into haystack/nodes/_json_schema.py (useful for tests) * Simplify errors slightly * Add some YAML validation tests * Remove load_from_config from BasePipeline, it was never used anyway * Improve tests * Include json-schemas in package * Fix conftest imports * Make BasePipeline abstract * Improve mocking by making the test independent from the YAML version * Add exportable_to_yaml decorator to forget about set_config on mock nodes * Fix mypy errors * Comment out one monkeypatch * Fix typing again * Improve error message for validation * Add required properties to pipelines * Fix YAML version for REST API YAMLs to 1.2.0 * Fix load_from_yaml call in load_from_deepset_cloud * fix HaystackError.__getattr__ * Add super().__init__()in most nodes and docstore, comment set_config * Remove type from REST API pipelines * Remove useless init from doc2answers * Call super in Seq3SeqGenerator * Typo in deepsetcloud.py * Fix rest api indexing error mismatch and mock version of JSON schema in all tests * Working on pipeline tests * Improve errors printing slightly * Add back test_pipeline.yaml * _json_schema.py supports different versions with identical schemas * Add type to 0.7 schema for backwards compatibility * Fix small bug in _json_schema.py * Try alternative to generate json schemas on the CI * Update Documentation & Code Style * Make linux CI match autoformat CI * Fix super-init-not-called * Accidentally committed file * Update Documentation & Code Style * fix test_summarizer_translation.py's import * Mock YAML in a few suites, split and simplify test_pipeline_debug_and_validation.py::test_invalid_run_args * Fix json schema for ray tests too * Update Documentation & Code Style * Reintroduce validation * Usa unstable version in tests and rest api * Make unstable support the latest versions * Update Documentation & Code Style * Remove needless fixture * Make type in pipeline optional in the strings validation * Fix schemas * Fix string validation for pipeline type * Improve validate_config_strings * Remove type from test p[ipelines * Update Documentation & Code Style * Fix test_pipeline * Removing more type from pipelines * Temporary CI patc * Fix issue with exportable_to_yaml never invoking the wrapped init * rm stray file * pipeline tests are green again * Linux CI now needs .[all] to generate the schema * Bugfixes, pipeline tests seems to be green * Typo in version after merge * Implement missing methods in Weaviate * Trying to avoid FAISS tests from running in the Milvus1 test suite * Fix some stray test paths and faiss index dumping * Fix pytest markers list * Temporarily disable cache to be able to see tests failures * Fix pyproject.toml syntax * Use only tmp_path * Fix preprocessor signature after merge * Fix faiss bug * Fix Ray test * Fix documentation issue by removing quotes from faiss type * Update Documentation & Code Style * use document properly in preprocessor tests * Update Documentation & Code Style * make preprocessor capable of handling documents * import document * Revert support for documents in preprocessor, do later * Fix bug in _json_schema.py that was breaking validation * re-enable cache * Update Documentation & Code Style * Simplify calling _json_schema.py from the CI * Remove redundant ABC inheritance * Ensure exportable_to_yaml works only on implementations * Rename subclass to class_ in Meta * Make run() and get_config() abstract in BasePipeline * Revert unintended change in preprocessor * Move outgoing_edges_input_node check inside try block * Rename VALID_CODE_GEN_INPUT_REGEX into VALID_INPUT_REGEX * Add check for a RecursionError on validate_config_strings * Address usages of _pipeline_config in data silo and elasticsearch * Rename _pipeline_config into _init_parameters * Fix pytest marker and remove unused imports * Remove most redundant ABCs * Rename _init_parameters into _component_configuration * Remove set_config and type from _component_configuration's dict * Remove last instances of set_config and replace with super().__init__() * Implement __init_subclass__ approach * Simplify checks on the existence of _component_configuration * Fix faiss issue * Dynamic generation of node schemas & weed out old schemas * Add debatable test * Add docstring to debatable test * Positive diff between schemas implemented * Improve diff printing * Rename REST API YAML files to trigger IDE validation * Fix typing issues * Fix more typing * Typo in YAML filename * Remove needless type:ignore * Add tests * Fix tests & validation feedback for accessory classes in custom nodes * Refactor RAGeneratorType out * Fix broken import in conftest * Improve source error handling * Remove unused import in test_eval.py breaking tests * Fix changed error message in tests matches too * Normalize generate_openapi_specs.py and generate_json_schema.py in the actions * Fix path to generate_openapi_specs.py in autoformat.yml * Update Documentation & Code Style * Add test for FAISSDocumentStore-like situations (superclass with init params) * Update Documentation & Code Style * Fix indentation * Remove commented set_config * Store model_name_or_path in FARMReader to use in DistillationDataSilo * Rename _component_configuration into _component_config * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-03-15 11:17:26 +01:00
from haystack.errors import PipelineConfigError
from haystack.pipelines.config import (
get_component_definitions,
get_pipeline_definition,
read_pipeline_config_from_yaml,
validate_config,
)
from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME
from rest_api.controller.utils import as_form
logger = logging.getLogger(__name__)
router = APIRouter()
try:
pipeline_config = read_pipeline_config_from_yaml(Path(PIPELINE_YAML_PATH))
validate_config(pipeline_config)
pipeline_definition = get_pipeline_definition(pipeline_config=pipeline_config, pipeline_name=INDEXING_PIPELINE_NAME)
component_definitions = get_component_definitions(
DC SDK - load pipeline from deepset cloud (#2013) * initial load_from_dc * typo * adjusted api endpoint * removed kwargs * added _load_from_dict * refactor pipeline loading mechanism * renaming load_from_dc api * renaming * fixed errors * fix comments and environment variable overrides * Add latest docstring and tutorial changes * fix outdated YAML examples * Add latest docstring and tutorial changes * Introduce readonly DCDocumentStore (without labels support) (#1991) * minimal DCDocumentStore * support filters * implement get_documents_by_id * handle not existing documents * add docstrings * auth added * add tests * generate docs * Add latest docstring and tutorial changes * add responses to dev dependencies * fix tests * support query() and quey_by_embedding() * Add latest docstring and tutorial changes * query tests added * read api_key and api_endpoint from env * Add latest docstring and tutorial changes * support query() and quey_by_embedding() * query tests added * Add latest docstring and tutorial changes * Add latest docstring and tutorial changes * support dynamic similarity and return_embedding values * Add latest docstring and tutorial changes * adjust KeywordDocumentStore description * refactoring * Add latest docstring and tutorial changes * implement get_document_count and raise on all not implemented methods * Add latest docstring and tutorial changes * don't use abbreviation DC in comments and errors * Add latest docstring and tutorial changes * docstring added to KeywordDocumentStore * Add latest docstring and tutorial changes * enhanced api key set * split tests into two parts * change setup.py in order to work around build cache * added link * Add latest docstring and tutorial changes * rename DCDocumentStore to DeepsetCloudDocumentStore * Add latest docstring and tutorial changes * remove dc.py * reinsert link to docs * fix imports * Add latest docstring and tutorial changes * better test structure Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: ArzelaAscoIi <kristof.herrmann@rwth-aachen.de> * introduce DeepsetCloudAdapter * Add latest docstring and tutorial changes * introduce DeepsetCloudClient * Add latest docstring and tutorial changes * use json api for pipeline_config * indexing pipeline test added * pseudo change to force cache eviction * revert pseudo change to force cache eviction * remove conftest duplicates * minor formatting and docstring fixes * fix tests when MOCK_DC=False Co-authored-by: Thomas Stadelmann <thomas.stadelmann@deepset.ai> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: tstadel <60758086+tstadel@users.noreply.github.com>
2022-01-28 17:32:56 +01:00
pipeline_config=pipeline_config, overwrite_with_env_variables=True
)
# Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
# end up with different indices. The same applies for InMemoryDocumentStore. The check below prevents creation of
# Indexing Pipelines with FAISSDocumentStore or InMemoryDocumentStore.
is_faiss_or_inmemory_present = False
DC SDK - load pipeline from deepset cloud (#2013) * initial load_from_dc * typo * adjusted api endpoint * removed kwargs * added _load_from_dict * refactor pipeline loading mechanism * renaming load_from_dc api * renaming * fixed errors * fix comments and environment variable overrides * Add latest docstring and tutorial changes * fix outdated YAML examples * Add latest docstring and tutorial changes * Introduce readonly DCDocumentStore (without labels support) (#1991) * minimal DCDocumentStore * support filters * implement get_documents_by_id * handle not existing documents * add docstrings * auth added * add tests * generate docs * Add latest docstring and tutorial changes * add responses to dev dependencies * fix tests * support query() and quey_by_embedding() * Add latest docstring and tutorial changes * query tests added * read api_key and api_endpoint from env * Add latest docstring and tutorial changes * support query() and quey_by_embedding() * query tests added * Add latest docstring and tutorial changes * Add latest docstring and tutorial changes * support dynamic similarity and return_embedding values * Add latest docstring and tutorial changes * adjust KeywordDocumentStore description * refactoring * Add latest docstring and tutorial changes * implement get_document_count and raise on all not implemented methods * Add latest docstring and tutorial changes * don't use abbreviation DC in comments and errors * Add latest docstring and tutorial changes * docstring added to KeywordDocumentStore * Add latest docstring and tutorial changes * enhanced api key set * split tests into two parts * change setup.py in order to work around build cache * added link * Add latest docstring and tutorial changes * rename DCDocumentStore to DeepsetCloudDocumentStore * Add latest docstring and tutorial changes * remove dc.py * reinsert link to docs * fix imports * Add latest docstring and tutorial changes * better test structure Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: ArzelaAscoIi <kristof.herrmann@rwth-aachen.de> * introduce DeepsetCloudAdapter * Add latest docstring and tutorial changes * introduce DeepsetCloudClient * Add latest docstring and tutorial changes * use json api for pipeline_config * indexing pipeline test added * pseudo change to force cache eviction * revert pseudo change to force cache eviction * remove conftest duplicates * minor formatting and docstring fixes * fix tests when MOCK_DC=False Co-authored-by: Thomas Stadelmann <thomas.stadelmann@deepset.ai> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: tstadel <60758086+tstadel@users.noreply.github.com>
2022-01-28 17:32:56 +01:00
for node in pipeline_definition["nodes"]:
if (
component_definitions[node["name"]]["type"] == "FAISSDocumentStore"
or component_definitions[node["name"]]["type"] == "InMemoryDocumentStore"
):
is_faiss_or_inmemory_present = True
break
if is_faiss_or_inmemory_present:
logger.warning(
"Indexing Pipeline with FAISSDocumentStore or InMemoryDocumentStore is not supported with the REST APIs."
)
INDEXING_PIPELINE = None
else:
INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
Pipeline's YAML: syntax validation (#2226) * Add BasePipeline.validate_config, BasePipeline.validate_yaml, and some new custom exception classes * Make error composition work properly * Clarify typing * Help mypy a bit more * Update Documentation & Code Style * Enable autogenerated docs for Milvus1 and 2 separately * Revert "Enable autogenerated docs for Milvus1 and 2 separately" This reverts commit 282be4a78a6e95862a9b4c924fc3dea5ca71e28d. * Update Documentation & Code Style * Re-enable 'additionalProperties: False' * Add pipeline.type to JSON Schema, was somehow forgotten * Disable additionalProperties on the pipeline properties too * Fix json-schemas for 1.1.0 and 1.2.0 (should not do it again in the future) * Cal super in PipelineValidationError * Improve _read_pipeline_config_from_yaml's error handling * Fix generate_json_schema.py to include document stores * Fix json schemas (retro-fix 1.1.0 again) * Improve custom errors printing, add link to docs * Add function in BaseComponent to list its subclasses in a module * Make some document stores base classes abstract * Add marker 'integration' in pytest flags * Slighly improve validation of pipelines at load * Adding tests for YAML loading and validation * Make custom_query Optional for validation issues * Fix bug in _read_pipeline_config_from_yaml * Improve error handling in BasePipeline and Pipeline and add DAG check * Move json schema generation into haystack/nodes/_json_schema.py (useful for tests) * Simplify errors slightly * Add some YAML validation tests * Remove load_from_config from BasePipeline, it was never used anyway * Improve tests * Include json-schemas in package * Fix conftest imports * Make BasePipeline abstract * Improve mocking by making the test independent from the YAML version * Add exportable_to_yaml decorator to forget about set_config on mock nodes * Fix mypy errors * Comment out one monkeypatch * Fix typing again * Improve error message for validation * Add required properties to pipelines * Fix YAML version for REST API YAMLs to 1.2.0 * Fix load_from_yaml call in load_from_deepset_cloud * fix HaystackError.__getattr__ * Add super().__init__()in most nodes and docstore, comment set_config * Remove type from REST API pipelines * Remove useless init from doc2answers * Call super in Seq3SeqGenerator * Typo in deepsetcloud.py * Fix rest api indexing error mismatch and mock version of JSON schema in all tests * Working on pipeline tests * Improve errors printing slightly * Add back test_pipeline.yaml * _json_schema.py supports different versions with identical schemas * Add type to 0.7 schema for backwards compatibility * Fix small bug in _json_schema.py * Try alternative to generate json schemas on the CI * Update Documentation & Code Style * Make linux CI match autoformat CI * Fix super-init-not-called * Accidentally committed file * Update Documentation & Code Style * fix test_summarizer_translation.py's import * Mock YAML in a few suites, split and simplify test_pipeline_debug_and_validation.py::test_invalid_run_args * Fix json schema for ray tests too * Update Documentation & Code Style * Reintroduce validation * Usa unstable version in tests and rest api * Make unstable support the latest versions * Update Documentation & Code Style * Remove needless fixture * Make type in pipeline optional in the strings validation * Fix schemas * Fix string validation for pipeline type * Improve validate_config_strings * Remove type from test p[ipelines * Update Documentation & Code Style * Fix test_pipeline * Removing more type from pipelines * Temporary CI patc * Fix issue with exportable_to_yaml never invoking the wrapped init * rm stray file * pipeline tests are green again * Linux CI now needs .[all] to generate the schema * Bugfixes, pipeline tests seems to be green * Typo in version after merge * Implement missing methods in Weaviate * Trying to avoid FAISS tests from running in the Milvus1 test suite * Fix some stray test paths and faiss index dumping * Fix pytest markers list * Temporarily disable cache to be able to see tests failures * Fix pyproject.toml syntax * Use only tmp_path * Fix preprocessor signature after merge * Fix faiss bug * Fix Ray test * Fix documentation issue by removing quotes from faiss type * Update Documentation & Code Style * use document properly in preprocessor tests * Update Documentation & Code Style * make preprocessor capable of handling documents * import document * Revert support for documents in preprocessor, do later * Fix bug in _json_schema.py that was breaking validation * re-enable cache * Update Documentation & Code Style * Simplify calling _json_schema.py from the CI * Remove redundant ABC inheritance * Ensure exportable_to_yaml works only on implementations * Rename subclass to class_ in Meta * Make run() and get_config() abstract in BasePipeline * Revert unintended change in preprocessor * Move outgoing_edges_input_node check inside try block * Rename VALID_CODE_GEN_INPUT_REGEX into VALID_INPUT_REGEX * Add check for a RecursionError on validate_config_strings * Address usages of _pipeline_config in data silo and elasticsearch * Rename _pipeline_config into _init_parameters * Fix pytest marker and remove unused imports * Remove most redundant ABCs * Rename _init_parameters into _component_configuration * Remove set_config and type from _component_configuration's dict * Remove last instances of set_config and replace with super().__init__() * Implement __init_subclass__ approach * Simplify checks on the existence of _component_configuration * Fix faiss issue * Dynamic generation of node schemas & weed out old schemas * Add debatable test * Add docstring to debatable test * Positive diff between schemas implemented * Improve diff printing * Rename REST API YAML files to trigger IDE validation * Fix typing issues * Fix more typing * Typo in YAML filename * Remove needless type:ignore * Add tests * Fix tests & validation feedback for accessory classes in custom nodes * Refactor RAGeneratorType out * Fix broken import in conftest * Improve source error handling * Remove unused import in test_eval.py breaking tests * Fix changed error message in tests matches too * Normalize generate_openapi_specs.py and generate_json_schema.py in the actions * Fix path to generate_openapi_specs.py in autoformat.yml * Update Documentation & Code Style * Add test for FAISSDocumentStore-like situations (superclass with init params) * Update Documentation & Code Style * Fix indentation * Remove commented set_config * Store model_name_or_path in FARMReader to use in DistillationDataSilo * Rename _component_configuration into _component_config * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-03-15 11:17:26 +01:00
except PipelineConfigError as e:
INDEXING_PIPELINE = None
logger.error(f"{e.message}. File Upload API will not be available.")
# create directory for uploading files
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
@as_form
class FileConverterParams(BaseModel):
remove_numeric_tables: Optional[bool] = None
valid_languages: Optional[List[str]] = None
@as_form
class PreprocessorParams(BaseModel):
clean_whitespace: Optional[bool] = None
clean_empty_lines: Optional[bool] = None
clean_header_footer: Optional[bool] = None
split_by: Optional[str] = None
split_length: Optional[int] = None
split_overlap: Optional[int] = None
split_respect_sentence_boundary: Optional[bool] = None
class Response(BaseModel):
file_id: str
@router.post("/file-upload")
def upload_file(
files: List[UploadFile] = File(...),
# JSON serialized string
meta: Optional[str] = Form("null"), # type: ignore
fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form), # type: ignore
preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form), # type: ignore
):
"""
You can use this endpoint to upload a file for indexing
(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
"""
if not INDEXING_PIPELINE:
raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")
file_paths: list = []
file_metas: list = []
meta_form = json.loads(meta) or {} # type: ignore
if not isinstance(meta_form, dict):
raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}")
for file in files:
try:
file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
with file_path.open("wb") as buffer:
shutil.copyfileobj(file.file, buffer)
file_paths.append(file_path)
meta_form["name"] = file.filename
file_metas.append(meta_form)
finally:
file.file.close()
INDEXING_PIPELINE.run(
file_paths=file_paths,
meta=file_metas,
params={
"TextFileConverter": fileconverter_params.dict(),
"PDFFileConverter": fileconverter_params.dict(),
"Preprocessor": preprocessor_params.dict(),
},
)