feat!: remove original files after indexing (#5459)

* remove original files after indexing

* fix tests
This commit is contained in:
Massimiliano Pippi 2023-07-31 13:07:16 +02:00 committed by GitHub
parent 5f01391827
commit d9fd1ab7bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 2 deletions

View File

@@ -49,10 +49,14 @@ def upload_file(
additional_params: Optional[str] = Form("null"), # type: ignore
fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form), # type: ignore
preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form), # type: ignore
keep_files: Optional[bool] = False,
):
"""
You can use this endpoint to upload a file for indexing
(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
(see https://docs.haystack.deepset.ai/docs/rest_api#indexing-documents-in-the-haystack-rest-api-documentstore).
Note: files are removed immediately after being indexed. If you want to keep them, pass
`keep_files=true` as a query parameter in the request.
"""
if not indexing_pipeline:
raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")
@@ -88,3 +92,8 @@ def upload_file(
params[preprocessor.name] = preprocessor_params.dict()
indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
# Clean up indexed files
if not keep_files:
for p in file_paths:
p.unlink()

View File

@@ -240,11 +240,12 @@ def feedback():
@pytest.fixture
def client():
def client(tmp_path):
yaml_pipeline_path = Path(__file__).parent.resolve() / "samples" / "test.haystack-pipeline.yml"
os.environ["PIPELINE_YAML_PATH"] = str(yaml_pipeline_path)
os.environ["INDEXING_PIPELINE_NAME"] = "test-indexing"
os.environ["QUERY_PIPELINE_NAME"] = "test-query"
os.environ["FILE_UPLOAD_PATH"] = str(tmp_path)
app = get_app()
client = TestClient(app)
@@ -324,6 +325,28 @@ def test_file_upload_with_wrong_meta(client):
MockPDFToTextConverter.mocker.convert.assert_not_called()
def test_file_upload_cleanup_after_indexing(client):
    """Check that an uploaded file is removed from the upload folder after indexing.

    Posts a sample PDF to `/file-upload` without `keep_files` and asserts that
    the folder named by the `FILE_UPLOAD_PATH` environment variable is empty
    once the request has returned successfully.
    """
    upload_dir = os.environ.get("FILE_UPLOAD_PATH")
    sample_pdf = Path(__file__).parent / "samples" / "pdf" / "sample_pdf_1.pdf"

    # mock the upload path to use a dedicated temp folder
    with mock.patch("rest_api.controller.file_upload.FILE_UPLOAD_PATH", upload_dir):
        # Open the sample inside a context manager so the handle is closed
        # as soon as the request completes instead of leaking until GC.
        with sample_pdf.open("rb") as pdf_file:
            response = client.post(url="/file-upload", files={"files": pdf_file}, data={})
        assert 200 == response.status_code

        # ensure upload folder is empty; comparing against [] makes a failure
        # report list the leftover file names
        uploaded_files = os.listdir(upload_dir)
        assert uploaded_files == []
def test_file_upload_keep_files_after_indexing(client):
    """Check that an uploaded file is kept when `keep_files=true` is requested.

    Posts a sample PDF to `/file-upload` with the `keep_files` query parameter
    set to `"true"` and asserts that exactly one file is present in the folder
    named by the `FILE_UPLOAD_PATH` environment variable afterwards.
    """
    upload_dir = os.environ.get("FILE_UPLOAD_PATH")
    sample_pdf = Path(__file__).parent / "samples" / "pdf" / "sample_pdf_1.pdf"

    # mock the upload path to use a dedicated temp folder
    with mock.patch("rest_api.controller.file_upload.FILE_UPLOAD_PATH", upload_dir):
        # Open the sample inside a context manager so the handle is closed
        # as soon as the request completes instead of leaking until GC.
        with sample_pdf.open("rb") as pdf_file:
            response = client.post(
                url="/file-upload",
                files={"files": pdf_file},
                params={"keep_files": "true"},
            )
        assert 200 == response.status_code

        # ensure original file was kept
        uploaded_files = os.listdir(upload_dir)
        assert len(uploaded_files) == 1
def test_query_with_no_filter(client):
with mock.patch("rest_api.controller.search.query_pipeline") as mocked_pipeline:
# `run` must return a dictionary containing a `query` key