mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-27 09:56:37 +00:00
Fix file upload API (#808)
This commit is contained in:
parent
7b18e324f2
commit
f95b70df38
@ -53,13 +53,20 @@ EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
|
||||
# File uploads
|
||||
FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")
|
||||
REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
|
||||
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
|
||||
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
|
||||
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
|
||||
VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
|
||||
if VALID_LANGUAGES:
|
||||
VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)
|
||||
|
||||
# Preprocessing
|
||||
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
|
||||
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
|
||||
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
|
||||
SPLIT_BY = os.getenv("SPLIT_BY", "word")
|
||||
# Environment values always arrive as strings; cast so the setting is an int
# whether it comes from the environment or from the built-in default.
SPLIT_LENGTH = int(os.getenv("SPLIT_LENGTH", 1_000))
|
||||
# Environment values always arrive as strings; convert to int when the
# variable is set and keep None (no overlap configured) when it is absent or empty.
_split_overlap_raw = os.getenv("SPLIT_OVERLAP")
SPLIT_OVERLAP = int(_split_overlap_raw) if _split_overlap_raw else None
|
||||
# Parse the flag like the other boolean settings in this file: a raw string
# from the environment (even "False") would otherwise always be truthy.
SPLIT_RESPECT_SENTENCE_BOUNDARY = os.getenv("SPLIT_RESPECT_SENTENCE_BOUNDARY", "True").lower() == "true"
|
||||
|
||||
|
||||
# Monitoring
|
||||
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
||||
APM_SERVER = os.getenv("APM_SERVER", None)
|
||||
|
@ -12,10 +12,12 @@ from fastapi import UploadFile, File, Form
|
||||
from rest_api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, DB_INDEX_FEEDBACK, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
|
||||
SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
|
||||
FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER, \
|
||||
CREATE_INDEX, UPDATE_EXISTING_DOCUMENTS, VECTOR_SIMILARITY_METRIC
|
||||
CREATE_INDEX, UPDATE_EXISTING_DOCUMENTS, VECTOR_SIMILARITY_METRIC, SPLIT_BY, SPLIT_LENGTH, SPLIT_OVERLAP, \
|
||||
SPLIT_RESPECT_SENTENCE_BOUNDARY
|
||||
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
|
||||
from haystack.file_converter.pdf import PDFToTextConverter
|
||||
from haystack.file_converter.txt import TextConverter
|
||||
from haystack.preprocessor.preprocessor import PreProcessor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -30,7 +32,7 @@ document_store = ElasticsearchDocumentStore(
|
||||
index=DB_INDEX,
|
||||
label_index=DB_INDEX_FEEDBACK,
|
||||
scheme=ES_CONN_SCHEME,
|
||||
ca_certs=False,
|
||||
ca_certs=None,
|
||||
verify_certs=False,
|
||||
text_field=TEXT_FIELD_NAME,
|
||||
search_fields=SEARCH_FIELD_NAME,
|
||||
@ -54,6 +56,10 @@ def upload_file_to_document_store(
|
||||
remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
|
||||
remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
|
||||
valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
|
||||
split_by: Optional[str] = Form(SPLIT_BY),
|
||||
split_length: Optional[int] = Form(SPLIT_LENGTH),
|
||||
split_overlap: Optional[int] = Form(SPLIT_OVERLAP),
|
||||
split_respect_sentence_boundary: Optional[bool] = Form(SPLIT_RESPECT_SENTENCE_BOUNDARY),
|
||||
):
|
||||
try:
|
||||
file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
|
||||
@ -62,27 +68,31 @@ def upload_file_to_document_store(
|
||||
|
||||
if file.filename.split(".")[-1].lower() == "pdf":
|
||||
pdf_converter = PDFToTextConverter(
|
||||
remove_numeric_tables=remove_numeric_tables,
|
||||
remove_whitespace=remove_whitespace,
|
||||
remove_empty_lines=remove_empty_lines,
|
||||
remove_header_footer=remove_header_footer,
|
||||
valid_languages=valid_languages,
|
||||
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
|
||||
)
|
||||
document = pdf_converter.convert(file_path)
|
||||
elif file.filename.split(".")[-1].lower() == "txt":
|
||||
txt_converter = TextConverter(
|
||||
remove_numeric_tables=remove_numeric_tables,
|
||||
remove_whitespace=remove_whitespace,
|
||||
remove_empty_lines=remove_empty_lines,
|
||||
remove_header_footer=remove_header_footer,
|
||||
valid_languages=valid_languages,
|
||||
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages,
|
||||
)
|
||||
document = txt_converter.convert(file_path)
|
||||
else:
|
||||
raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
|
||||
|
||||
document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
|
||||
document_store.write_documents([document_to_write])
|
||||
document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
|
||||
|
||||
preprocessor = PreProcessor(
|
||||
clean_whitespace=remove_whitespace,
|
||||
clean_header_footer=remove_header_footer,
|
||||
clean_empty_lines=remove_empty_lines,
|
||||
split_by=split_by,
|
||||
split_length=split_length,
|
||||
split_overlap=split_overlap,
|
||||
split_respect_sentence_boundary=split_respect_sentence_boundary,
|
||||
)
|
||||
|
||||
documents = preprocessor.process(document)
|
||||
document_store.write_documents(documents)
|
||||
return "File upload was successful."
|
||||
finally:
|
||||
file.file.close()
|
||||
|
@ -1,19 +1,20 @@
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from pathlib import Path
|
||||
from haystack import Finder
|
||||
from haystack.retriever.sparse import ElasticsearchRetriever
|
||||
|
||||
# TODO: Add integration tests for other APIs
|
||||
|
||||
|
||||
def get_test_client_and_override_dependencies(reader, document_store_with_docs):
|
||||
def get_test_client_and_override_dependencies(reader, document_store):
|
||||
from rest_api.application import app
|
||||
from rest_api.controller import search
|
||||
from rest_api.controller import search, file_upload
|
||||
|
||||
search.document_store = document_store_with_docs
|
||||
search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
|
||||
search.document_store = document_store
|
||||
search.retriever = ElasticsearchRetriever(document_store=document_store)
|
||||
search.FINDERS = {1: Finder(reader=reader, retriever=search.retriever)}
|
||||
file_upload.document_store = document_store
|
||||
|
||||
return TestClient(app)
|
||||
|
||||
@ -96,3 +97,14 @@ def test_query_api_filters(reader, document_store_with_docs):
|
||||
assert "New York" == response_json['hits']['hits'][0]["_source"]["answer"]
|
||||
assert "My name is Paul and I live in New York" == response_json['hits']['hits'][0]["_source"]["context"]
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_file_upload(document_store):
    """Upload a sample PDF through ``/file-upload`` and check the store grows.

    The test requires the document store to be empty at the start, posts the
    sample PDF through the test client, and expects an HTTP 200 response and
    a non-zero document count afterwards.
    """
    assert document_store.get_document_count() == 0
    client = get_test_client_and_override_dependencies(reader=None, document_store=document_store)
    # Open the sample inside a context manager so the file handle is closed
    # even if the request or a later assertion fails.
    with Path("samples/pdf/sample_pdf_1.pdf").open('rb') as pdf_file:
        response = client.post(url="/file-upload", files={'file': pdf_file})
    assert 200 == response.status_code
    assert document_store.get_document_count() > 0
|
||||
|
Loading…
x
Reference in New Issue
Block a user