mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-08 04:56:45 +00:00
feat!: remove original files after indexing (#5459)
* remove original files after indexing * fix tests
This commit is contained in:
parent
5f01391827
commit
d9fd1ab7bc
@ -49,10 +49,14 @@ def upload_file(
|
||||
additional_params: Optional[str] = Form("null"), # type: ignore
|
||||
fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form), # type: ignore
|
||||
preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form), # type: ignore
|
||||
keep_files: Optional[bool] = False,
|
||||
):
|
||||
"""
|
||||
You can use this endpoint to upload a file for indexing
|
||||
(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).
|
||||
(see https://docs.haystack.deepset.ai/docs/rest_api#indexing-documents-in-the-haystack-rest-api-documentstore).
|
||||
|
||||
Note: files are removed immediately after being indexed. If you want to keep them, pass the
|
||||
`keep_files=true` query parameter in the request URL.
|
||||
"""
|
||||
if not indexing_pipeline:
|
||||
raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")
|
||||
@ -88,3 +92,8 @@ def upload_file(
|
||||
params[preprocessor.name] = preprocessor_params.dict()
|
||||
|
||||
indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params)
|
||||
|
||||
# Clean up indexed files
|
||||
if not keep_files:
|
||||
for p in file_paths:
|
||||
p.unlink()
|
||||
|
||||
@ -240,11 +240,12 @@ def feedback():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client():
|
||||
def client(tmp_path):
|
||||
yaml_pipeline_path = Path(__file__).parent.resolve() / "samples" / "test.haystack-pipeline.yml"
|
||||
os.environ["PIPELINE_YAML_PATH"] = str(yaml_pipeline_path)
|
||||
os.environ["INDEXING_PIPELINE_NAME"] = "test-indexing"
|
||||
os.environ["QUERY_PIPELINE_NAME"] = "test-query"
|
||||
os.environ["FILE_UPLOAD_PATH"] = str(tmp_path)
|
||||
|
||||
app = get_app()
|
||||
client = TestClient(app)
|
||||
@ -324,6 +325,28 @@ def test_file_upload_with_wrong_meta(client):
|
||||
MockPDFToTextConverter.mocker.convert.assert_not_called()
|
||||
|
||||
|
||||
def test_file_upload_cleanup_after_indexing(client):
    """Uploaded files are deleted from the upload folder once indexing is done.

    Posts the sample PDF to `/file-upload` without `keep_files` and checks that
    the request succeeds and the upload folder is empty afterwards.
    """
    upload_dir = os.environ.get("FILE_UPLOAD_PATH")
    sample_pdf = Path(__file__).parent / "samples" / "pdf" / "sample_pdf_1.pdf"

    # mock the upload path to use a dedicated temp folder; open the sample in a
    # context manager so the handle is closed even when an assertion fails
    with mock.patch("rest_api.controller.file_upload.FILE_UPLOAD_PATH", upload_dir), sample_pdf.open(
        "rb"
    ) as pdf_handle:
        response = client.post(url="/file-upload", files={"files": pdf_handle}, data={})
        assert response.status_code == 200

        # ensure upload folder is empty (comparing to [] shows leftovers on failure)
        uploaded_files = os.listdir(upload_dir)
        assert uploaded_files == []
|
||||
|
||||
|
||||
def test_file_upload_keep_files_after_indexing(client):
    """Uploaded files stay in the upload folder when `keep_files=true` is passed.

    Posts the sample PDF to `/file-upload` with the `keep_files` query parameter
    set and checks that exactly one file remains in the upload folder.
    """
    upload_dir = os.environ.get("FILE_UPLOAD_PATH")
    sample_pdf = Path(__file__).parent / "samples" / "pdf" / "sample_pdf_1.pdf"

    # mock the upload path to use a dedicated temp folder; open the sample in a
    # context manager so the handle is closed even when an assertion fails
    with mock.patch("rest_api.controller.file_upload.FILE_UPLOAD_PATH", upload_dir), sample_pdf.open(
        "rb"
    ) as pdf_handle:
        response = client.post(url="/file-upload", files={"files": pdf_handle}, params={"keep_files": "true"})
        assert response.status_code == 200

        # ensure original file was kept
        uploaded_files = os.listdir(upload_dir)
        assert len(uploaded_files) == 1
|
||||
|
||||
|
||||
def test_query_with_no_filter(client):
|
||||
with mock.patch("rest_api.controller.search.query_pipeline") as mocked_pipeline:
|
||||
# `run` must return a dictionary containing a `query` key
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user