2020-04-15 14:04:30 +02:00
import ast
import os
2020-06-09 04:56:56 -03:00
# FastAPI
# Project name; falls back to "FastAPI" when the PROJECT_NAME environment
# variable is not set.
PROJECT_NAME = os.environ.get("PROJECT_NAME", "FastAPI")
2020-04-15 14:04:30 +02:00
# Resources / Computation
# USE_GPU is True only when the variable's value equals "true" (compared
# case-insensitively); any other value yields False.
USE_GPU = os.environ.get("USE_GPU", "True").lower() == "true"

# Integer settings: the environment value (or the default) is passed through int().
GPU_NUMBER = int(os.environ.get("GPU_NUMBER", 1))
MAX_PROCESSES = int(os.environ.get("MAX_PROCESSES", 0))
BATCHSIZE = int(os.environ.get("BATCHSIZE", 50))
CONCURRENT_REQUEST_PER_WORKER = int(
    os.environ.get("CONCURRENT_REQUEST_PER_WORKER", 4)
)
2020-04-15 14:04:30 +02:00
# DB
# Connection settings.
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = int(os.environ.get("DB_PORT", 9200))
DB_USER = os.environ.get("DB_USER", "")
DB_PW = os.environ.get("DB_PW", "")
ES_CONN_SCHEME = os.environ.get("ES_CONN_SCHEME", "http")

# Index names.
DB_INDEX = os.environ.get("DB_INDEX", "document")
DB_INDEX_FEEDBACK = os.environ.get("DB_INDEX_FEEDBACK", "label")

# Names of the fields used inside an index.
TEXT_FIELD_NAME = os.environ.get("TEXT_FIELD_NAME", "text")
NAME_FIELD_NAME = os.environ.get("NAME_FIELD_NAME", "name")
SEARCH_FIELD_NAME = os.environ.get("SEARCH_FIELD_NAME", "text")
FAQ_QUESTION_FIELD_NAME = os.environ.get("FAQ_QUESTION_FIELD_NAME", "question")
EMBEDDING_FIELD_NAME = os.environ.get("EMBEDDING_FIELD_NAME", "embedding")

# Embedding vector settings.
EMBEDDING_DIM = int(os.environ.get("EMBEDDING_DIM", 768))
VECTOR_SIMILARITY_METRIC = os.environ.get("VECTOR_SIMILARITY_METRIC", "dot_product")

# Boolean flags: True only when the value equals "true" (case-insensitive).
CREATE_INDEX = os.environ.get("CREATE_INDEX", "True").lower() == "true"
UPDATE_EXISTING_DOCUMENTS = (
    os.environ.get("UPDATE_EXISTING_DOCUMENTS", "False").lower() == "true"
)
2020-04-15 14:04:30 +02:00
# Reader
READER_MODEL_PATH = os.environ.get("READER_MODEL_PATH", "deepset/roberta-base-squad2")
# alternative: 'TransformersReader'
READER_TYPE = os.environ.get("READER_TYPE", "FARMReader")
# Stays None when the variable is not set.
READER_TOKENIZER = os.environ.get("READER_TOKENIZER")
CONTEXT_WINDOW_SIZE = int(os.environ.get("CONTEXT_WINDOW_SIZE", 500))

# How many answers to return in total
DEFAULT_TOP_K_READER = int(os.environ.get("DEFAULT_TOP_K_READER", 5))
# How many answers can come from one indexed doc
TOP_K_PER_CANDIDATE = int(os.environ.get("TOP_K_PER_CANDIDATE", 3))
# How many answers can come from one passage that the reader processes at
# once (i.e. text of max_seq_len from the doc)
TOP_K_PER_SAMPLE = int(os.environ.get("TOP_K_PER_SAMPLE", 1))

NO_ANS_BOOST = int(os.environ.get("NO_ANS_BOOST", -10))
# True only when the value equals "true" (case-insensitive).
READER_CAN_HAVE_NO_ANSWER = (
    os.environ.get("READER_CAN_HAVE_NO_ANSWER", "True").lower() == "true"
)
DOC_STRIDE = int(os.environ.get("DOC_STRIDE", 128))
MAX_SEQ_LEN = int(os.environ.get("MAX_SEQ_LEN", 256))
# Retriever
# alternatives: 'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None
# NOTE(review): os.getenv only ever returns a string, so setting the variable
# to None yields the text "None" here, not the None object -- confirm the
# consumer of RETRIEVER_TYPE accounts for that.
RETRIEVER_TYPE = os.getenv("RETRIEVER_TYPE", "ElasticsearchRetriever")
DEFAULT_TOP_K_RETRIEVER = int(os.getenv("DEFAULT_TOP_K_RETRIEVER", 5))

# The value is the text of a Python literal (by default a list of two field
# names) and is parsed with ast.literal_eval. The default is a plain string:
# nothing is interpolated, so no f-prefix is needed. An empty value is left
# as-is and not parsed.
EXCLUDE_META_DATA_FIELDS = os.getenv("EXCLUDE_META_DATA_FIELDS", "['question_emb', 'embedding']")
if EXCLUDE_META_DATA_FIELDS:
    EXCLUDE_META_DATA_FIELDS = ast.literal_eval(EXCLUDE_META_DATA_FIELDS)

EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "deepset/sentence_bert")
EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
2020-04-15 14:04:30 +02:00
2020-06-17 16:28:26 +02:00
# File uploads
FILE_UPLOAD_PATH = os.environ.get("FILE_UPLOAD_PATH", "file-uploads")
# True only when the value equals "true" (case-insensitive).
REMOVE_NUMERIC_TABLES = os.environ.get("REMOVE_NUMERIC_TABLES", "True").lower() == "true"

# Optional: when set to a non-empty value, the text is parsed as a Python
# literal with ast.literal_eval; otherwise it stays as read (None when unset).
VALID_LANGUAGES = os.environ.get("VALID_LANGUAGES")
if VALID_LANGUAGES:
    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)
2021-02-05 12:17:38 +01:00
# Preprocessing
# Boolean flags: True only when the value equals "true" (case-insensitive).
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
SPLIT_BY = os.getenv("SPLIT_BY", "word")

# Environment values always arrive as strings, so cast explicitly; otherwise
# the setting would be an int by default but a str as soon as it is overridden.
SPLIT_LENGTH = int(os.getenv("SPLIT_LENGTH", 1_000))

# Optional integer: None when the variable is unset or empty.
SPLIT_OVERLAP = os.getenv("SPLIT_OVERLAP") or None
if SPLIT_OVERLAP is not None:
    SPLIT_OVERLAP = int(SPLIT_OVERLAP)

# Parsed like the other flags: a raw getenv() result would be a non-empty
# (hence truthy) string even for "False".
SPLIT_RESPECT_SENTENCE_BOUNDARY = os.getenv("SPLIT_RESPECT_SENTENCE_BOUNDARY", "True").lower() == "true"
2020-04-15 14:04:30 +02:00
# Monitoring
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# Stays None when the APM_SERVER variable is not set.
APM_SERVER = os.environ.get("APM_SERVER")
APM_SERVICE_NAME = os.environ.get("APM_SERVICE_NAME", "haystack-backend")