diff --git a/rest_api/rest_api/controller/file_upload.py b/rest_api/rest_api/controller/file_upload.py
index f9ca06a45..97b29994e 100644
--- a/rest_api/rest_api/controller/file_upload.py
+++ b/rest_api/rest_api/controller/file_upload.py
@@ -49,10 +49,14 @@ def upload_file(
     additional_params: Optional[str] = Form("null"),  # type: ignore
     fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form),  # type: ignore
     preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form),  # type: ignore
+    keep_files: Optional[bool] = False,
 ):
     """
     You can use this endpoint to upload a file for indexing
-    (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
+    (see https://docs.haystack.deepset.ai/docs/rest_api#indexing-documents-in-the-haystack-rest-api-documentstore).
+
+    Note: uploaded files are removed as soon as indexing finishes, even if it fails. If you want to
+    keep them, pass `keep_files=true` as a query parameter of the request.
     """
     if not indexing_pipeline:
         raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")
@@ -88,3 +92,9 @@ def upload_file(
         params[preprocessor.name] = preprocessor_params.dict()
 
-    indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
+    try:
+        indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
+    finally:
+        # Clean up uploaded files even when indexing raises, so failed requests leave nothing behind
+        if not keep_files:
+            for p in file_paths:
+                p.unlink()
diff --git a/rest_api/test/test_rest_api.py b/rest_api/test/test_rest_api.py
index e0e7a3b83..ee212bf44 100644
--- a/rest_api/test/test_rest_api.py
+++ b/rest_api/test/test_rest_api.py
@@ -240,11 +240,12 @@ def feedback():
 
 
 @pytest.fixture
-def client():
+def client(tmp_path):
     yaml_pipeline_path = Path(__file__).parent.resolve() / "samples" / "test.haystack-pipeline.yml"
     os.environ["PIPELINE_YAML_PATH"] = str(yaml_pipeline_path)
     os.environ["INDEXING_PIPELINE_NAME"] = "test-indexing"
     os.environ["QUERY_PIPELINE_NAME"] = "test-query"
+    os.environ["FILE_UPLOAD_PATH"] = str(tmp_path)
 
     app = get_app()
     client = TestClient(app)
@@ -324,6 +325,32 @@ def test_file_upload_with_wrong_meta(client):
     MockPDFToTextConverter.mocker.convert.assert_not_called()
 
 
+def test_file_upload_cleanup_after_indexing(client):
+    sample_pdf = Path(__file__).parent / "samples" / "pdf" / "sample_pdf_1.pdf"
+    # mock the upload path to use a dedicated temp folder
+    with mock.patch("rest_api.controller.file_upload.FILE_UPLOAD_PATH", os.environ.get("FILE_UPLOAD_PATH")):
+        # open the sample inside a context manager so the handle is closed after the request
+        with sample_pdf.open("rb") as pdf_file:
+            response = client.post(url="/file-upload", files={"files": pdf_file}, data={})
+        assert 200 == response.status_code
+        # ensure upload folder is empty
+        uploaded_files = os.listdir(os.environ.get("FILE_UPLOAD_PATH"))
+        assert len(uploaded_files) == 0
+
+
+def test_file_upload_keep_files_after_indexing(client):
+    sample_pdf = Path(__file__).parent / "samples" / "pdf" / "sample_pdf_1.pdf"
+    # mock the upload path to use a dedicated temp folder
+    with mock.patch("rest_api.controller.file_upload.FILE_UPLOAD_PATH", os.environ.get("FILE_UPLOAD_PATH")):
+        # open the sample inside a context manager so the handle is closed after the request
+        with sample_pdf.open("rb") as pdf_file:
+            response = client.post(url="/file-upload", files={"files": pdf_file}, params={"keep_files": "true"})
+        assert 200 == response.status_code
+        # ensure original file was kept
+        uploaded_files = os.listdir(os.environ.get("FILE_UPLOAD_PATH"))
+        assert len(uploaded_files) == 1
+
+
 def test_query_with_no_filter(client):
     with mock.patch("rest_api.controller.search.query_pipeline") as mocked_pipeline:
         # `run` must return a dictionary containing a `query` key