diff --git a/rest_api/config.py b/rest_api/config.py
index 197917904..df8ea3e00 100644
--- a/rest_api/config.py
+++ b/rest_api/config.py
@@ -53,13 +53,20 @@ EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
 # File uploads
 FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")
 REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
-REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
-REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
-REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
 VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
 if VALID_LANGUAGES:
     VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)
 
+# Preprocessing (os.getenv returns strings when a variable is set, so ints and booleans are parsed explicitly)
+REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
+REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
+REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
+SPLIT_BY = os.getenv("SPLIT_BY", "word")
+SPLIT_LENGTH = int(os.getenv("SPLIT_LENGTH", 1_000))
+SPLIT_OVERLAP = int(os.getenv("SPLIT_OVERLAP")) if os.getenv("SPLIT_OVERLAP") else None
+SPLIT_RESPECT_SENTENCE_BOUNDARY = os.getenv("SPLIT_RESPECT_SENTENCE_BOUNDARY", "True").lower() == "true"
+
+
 # Monitoring
 LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
 APM_SERVER = os.getenv("APM_SERVER", None)
diff --git a/rest_api/controller/file_upload.py b/rest_api/controller/file_upload.py
index 9fe781177..8051aed4a 100644
--- a/rest_api/controller/file_upload.py
+++ b/rest_api/controller/file_upload.py
@@ -12,10 +12,12 @@ from fastapi import UploadFile, File, Form
 from rest_api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, DB_INDEX_FEEDBACK, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
     SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
     FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER, \
-    CREATE_INDEX, UPDATE_EXISTING_DOCUMENTS, VECTOR_SIMILARITY_METRIC
+    CREATE_INDEX, UPDATE_EXISTING_DOCUMENTS, VECTOR_SIMILARITY_METRIC, SPLIT_BY, SPLIT_LENGTH, SPLIT_OVERLAP, \
+    SPLIT_RESPECT_SENTENCE_BOUNDARY
 from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
 from haystack.file_converter.pdf import PDFToTextConverter
 from haystack.file_converter.txt import TextConverter
+from haystack.preprocessor.preprocessor import PreProcessor
 
 logger = logging.getLogger(__name__)
 
@@ -30,7 +32,7 @@ document_store = ElasticsearchDocumentStore(
     index=DB_INDEX,
     label_index=DB_INDEX_FEEDBACK,
     scheme=ES_CONN_SCHEME,
-    ca_certs=False,
+    ca_certs=None,
     verify_certs=False,
     text_field=TEXT_FIELD_NAME,
     search_fields=SEARCH_FIELD_NAME,
@@ -54,6 +56,10 @@ def upload_file_to_document_store(
     remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
     remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
     valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
+    split_by: Optional[str] = Form(SPLIT_BY),
+    split_length: Optional[int] = Form(SPLIT_LENGTH),
+    split_overlap: Optional[int] = Form(SPLIT_OVERLAP),
+    split_respect_sentence_boundary: Optional[bool] = Form(SPLIT_RESPECT_SENTENCE_BOUNDARY),
 ):
     try:
         file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
@@ -62,27 +68,31 @@ def upload_file_to_document_store(
 
         if file.filename.split(".")[-1].lower() == "pdf":
             pdf_converter = PDFToTextConverter(
-                remove_numeric_tables=remove_numeric_tables,
-                remove_whitespace=remove_whitespace,
-                remove_empty_lines=remove_empty_lines,
-                remove_header_footer=remove_header_footer,
-                valid_languages=valid_languages,
+                remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
             )
             document = pdf_converter.convert(file_path)
         elif file.filename.split(".")[-1].lower() == "txt":
             txt_converter = TextConverter(
-                remove_numeric_tables=remove_numeric_tables,
-                remove_whitespace=remove_whitespace,
-                remove_empty_lines=remove_empty_lines,
-                remove_header_footer=remove_header_footer,
-                valid_languages=valid_languages,
+                remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages,
             )
             document = txt_converter.convert(file_path)
         else:
             raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
 
-        document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
-        document_store.write_documents([document_to_write])
+        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
+
+        preprocessor = PreProcessor(
+            clean_whitespace=remove_whitespace,
+            clean_header_footer=remove_header_footer,
+            clean_empty_lines=remove_empty_lines,
+            split_by=split_by,
+            split_length=split_length,
+            split_overlap=split_overlap,
+            split_respect_sentence_boundary=split_respect_sentence_boundary,
+        )
+
+        documents = preprocessor.process(document)
+        document_store.write_documents(documents)
         return "File upload was successful."
     finally:
         file.file.close()
diff --git a/test/test_rest_api.py b/test/test_rest_api.py
index 35f4433b0..0b63b3d5e 100644
--- a/test/test_rest_api.py
+++ b/test/test_rest_api.py
@@ -1,19 +1,20 @@
 import pytest
 from fastapi.testclient import TestClient
-
+from pathlib import Path
 from haystack import Finder
 from haystack.retriever.sparse import ElasticsearchRetriever
 
 # TODO: Add integration tests for other APIs
 
 
-def get_test_client_and_override_dependencies(reader, document_store_with_docs):
+def get_test_client_and_override_dependencies(reader, document_store):
     from rest_api.application import app
-    from rest_api.controller import search
+    from rest_api.controller import search, file_upload
 
-    search.document_store = document_store_with_docs
-    search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
+    search.document_store = document_store
+    search.retriever = ElasticsearchRetriever(document_store=document_store)
     search.FINDERS = {1: Finder(reader=reader, retriever=search.retriever)}
+    file_upload.document_store = document_store
 
     return TestClient(app)
 
@@ -96,3 +97,15 @@ def test_query_api_filters(reader, document_store_with_docs):
     assert "New York" == response_json['hits']['hits'][0]["_source"]["answer"]
     assert "My name is Paul and I live in New York" == response_json['hits']['hits'][0]["_source"]["context"]
 
+
+@pytest.mark.slow
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
+def test_file_upload(document_store):
+    """Posting a PDF to /file-upload must leave at least one document in the store."""
+    assert document_store.get_document_count() == 0
+    client = get_test_client_and_override_dependencies(reader=None, document_store=document_store)
+    with Path("samples/pdf/sample_pdf_1.pdf").open('rb') as pdf_file:
+        response = client.post(url="/file-upload", files={'file': pdf_file})
+    assert 200 == response.status_code
+    assert document_store.get_document_count() > 0