# haystack/rest_api/config.py

import ast
import os

# FastAPI
# Project name, taken from the PROJECT_NAME environment variable (default: "FastAPI").
PROJECT_NAME = os.environ.get("PROJECT_NAME", "FastAPI")

# Resources / Computation
# USE_GPU is True only when the variable spells "true" (case-insensitive);
# any other value turns it off.
USE_GPU = os.environ.get("USE_GPU", "True").lower() == "true"
# Integer settings: a non-numeric value raises ValueError when this module is imported.
GPU_NUMBER = int(os.environ.get("GPU_NUMBER", "1"))
MAX_PROCESSES = int(os.environ.get("MAX_PROCESSES", "0"))
BATCHSIZE = int(os.environ.get("BATCHSIZE", "50"))
CONCURRENT_REQUEST_PER_WORKER = int(os.environ.get("CONCURRENT_REQUEST_PER_WORKER", "4"))

# DB
# Connection settings (strings unless noted; DB_PORT is converted to int).
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = int(os.environ.get("DB_PORT", "9200"))
DB_USER = os.environ.get("DB_USER", "")
DB_PW = os.environ.get("DB_PW", "")
ES_CONN_SCHEME = os.environ.get("ES_CONN_SCHEME", "http")

# Index names.
DB_INDEX = os.environ.get("DB_INDEX", "document")
DB_INDEX_FEEDBACK = os.environ.get("DB_INDEX_FEEDBACK", "label")

# Field names.
TEXT_FIELD_NAME = os.environ.get("TEXT_FIELD_NAME", "text")
NAME_FIELD_NAME = os.environ.get("NAME_FIELD_NAME", "name")
SEARCH_FIELD_NAME = os.environ.get("SEARCH_FIELD_NAME", "text")
FAQ_QUESTION_FIELD_NAME = os.environ.get("FAQ_QUESTION_FIELD_NAME", "question")
EMBEDDING_FIELD_NAME = os.environ.get("EMBEDDING_FIELD_NAME", "embedding")

# Embedding settings (EMBEDDING_DIM is converted to int).
EMBEDDING_DIM = int(os.environ.get("EMBEDDING_DIM", "768"))
VECTOR_SIMILARITY_METRIC = os.environ.get("VECTOR_SIMILARITY_METRIC", "dot_product")

# Boolean flags: True only when the variable spells "true" (case-insensitive).
CREATE_INDEX = os.environ.get("CREATE_INDEX", "True").lower() == "true"
UPDATE_EXISTING_DOCUMENTS = os.environ.get("UPDATE_EXISTING_DOCUMENTS", "False").lower() == "true"

# Reader
READER_MODEL_PATH = os.environ.get("READER_MODEL_PATH", "deepset/roberta-base-squad2")
# alternative: 'TransformersReader'
READER_TYPE = os.environ.get("READER_TYPE", "FARMReader")
# Stays None when the variable is not set.
READER_TOKENIZER = os.environ.get("READER_TOKENIZER")
CONTEXT_WINDOW_SIZE = int(os.environ.get("CONTEXT_WINDOW_SIZE", "500"))
# How many answers to return in total
DEFAULT_TOP_K_READER = int(os.environ.get("DEFAULT_TOP_K_READER", "5"))
# How many answers can come from one indexed doc
TOP_K_PER_CANDIDATE = int(os.environ.get("TOP_K_PER_CANDIDATE", "3"))
# How many answers can come from one passage that the reader processes at once
# (i.e. text of max_seq_len from the doc)
TOP_K_PER_SAMPLE = int(os.environ.get("TOP_K_PER_SAMPLE", "1"))
NO_ANS_BOOST = int(os.environ.get("NO_ANS_BOOST", "-10"))
# True only when the variable spells "true" (case-insensitive).
READER_CAN_HAVE_NO_ANSWER = os.environ.get("READER_CAN_HAVE_NO_ANSWER", "True").lower() == "true"
DOC_STRIDE = int(os.environ.get("DOC_STRIDE", "128"))
MAX_SEQ_LEN = int(os.environ.get("MAX_SEQ_LEN", "256"))

# Retriever
# alternatives: 'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None
RETRIEVER_TYPE = os.environ.get("RETRIEVER_TYPE", "ElasticsearchRetriever")
DEFAULT_TOP_K_RETRIEVER = int(os.environ.get("DEFAULT_TOP_K_RETRIEVER", "5"))
# Written as a Python literal (e.g. "['a','b']") and parsed with ast.literal_eval;
# an empty string is left as-is and not parsed.
EXCLUDE_META_DATA_FIELDS = os.environ.get("EXCLUDE_META_DATA_FIELDS", "['question_emb','embedding']")
if EXCLUDE_META_DATA_FIELDS:
    EXCLUDE_META_DATA_FIELDS = ast.literal_eval(EXCLUDE_META_DATA_FIELDS)
EMBEDDING_MODEL_PATH = os.environ.get("EMBEDDING_MODEL_PATH", "deepset/sentence_bert")
EMBEDDING_MODEL_FORMAT = os.environ.get("EMBEDDING_MODEL_FORMAT", "farm")

# File uploads
FILE_UPLOAD_PATH = os.environ.get("FILE_UPLOAD_PATH", "file-uploads")
# True only when the variable spells "true" (case-insensitive).
REMOVE_NUMERIC_TABLES = os.environ.get("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
# Written as a Python literal and parsed with ast.literal_eval;
# stays None when unset and is left unparsed when empty.
VALID_LANGUAGES = os.environ.get("VALID_LANGUAGES")
if VALID_LANGUAGES:
    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)

# Preprocessing
# Boolean flags are True only when the variable spells "true" (case-insensitive).
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
SPLIT_BY = os.getenv("SPLIT_BY", "word")
# Environment variables always arrive as strings, so the split settings are
# converted explicitly; otherwise an override would yield "200" instead of 200
# and "False" would be a truthy string.
# A non-numeric SPLIT_LENGTH / SPLIT_OVERLAP raises ValueError at import time.
SPLIT_LENGTH = int(os.getenv("SPLIT_LENGTH", 1_000))
# None when the variable is unset or empty, otherwise an int.
SPLIT_OVERLAP = os.getenv("SPLIT_OVERLAP", None)
if SPLIT_OVERLAP:
    SPLIT_OVERLAP = int(SPLIT_OVERLAP)
else:
    SPLIT_OVERLAP = None
SPLIT_RESPECT_SENTENCE_BOUNDARY = os.getenv("SPLIT_RESPECT_SENTENCE_BOUNDARY", "True").lower() == "true"


# Monitoring
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# Stays None when the variable is not set.
APM_SERVER = os.environ.get("APM_SERVER")
APM_SERVICE_NAME = os.environ.get("APM_SERVICE_NAME", "haystack-backend")
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`import ast`
			`import os`

Add Elasticsearch PORT and SCHEME in API config (#134) 2020-06-09 04:56:56 -03:00			`# FastAPI`
			`PROJECT_NAME = os.getenv("PROJECT_NAME", "FastAPI")`

Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`# Resources / Computation`
			`USE_GPU = os.getenv("USE_GPU", "True").lower() == "true"`
Adjust Docker and REST API to allow TransformsReader Class (#180) 2020-07-07 16:25:36 +02:00			`GPU_NUMBER = int(os.getenv("GPU_NUMBER", 1))`
Update GPU docker & fix race condition with multiple workers (#436) * fix gpu CMD and set tag to latest * udpate dockerfiles. resolve race condition of index creation with multiple workers * update dockerfiles for preload. remove try catch for elastic index creation * add back try/catch. disable multiproc in default config to comply with --preload of gunicorn * change to pip3 for GPU dockerfile * remove --preload for gpu 2020-09-29 21:12:44 +02:00			`MAX_PROCESSES = int(os.getenv("MAX_PROCESSES", 0))`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`BATCHSIZE = int(os.getenv("BATCHSIZE", 50))`
Add limit on concurrent requests for doc-qa (#64) 2020-04-17 15:15:53 +02:00			`CONCURRENT_REQUEST_PER_WORKER = int(os.getenv("CONCURRENT_REQUEST_PER_WORKER", 4))`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00
			`# DB`
			`DB_HOST = os.getenv("DB_HOST", "localhost")`
Add custom port to ElasticsearchDocumentStore and remove outdated tag_fields arg (#129) 2020-06-02 17:59:40 +02:00			`DB_PORT = int(os.getenv("DB_PORT", 9200))`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`DB_USER = os.getenv("DB_USER", "")`
			`DB_PW = os.getenv("DB_PW", "")`
			`DB_INDEX = os.getenv("DB_INDEX", "document")`
Add eval for Dense Passage Retriever & Refactor handling of labels/feedback (#243) 2020-07-31 11:34:06 +02:00			`DB_INDEX_FEEDBACK = os.getenv("DB_INDEX_FEEDBACK", "label")`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`ES_CONN_SCHEME = os.getenv("ES_CONN_SCHEME", "http")`
			`TEXT_FIELD_NAME = os.getenv("TEXT_FIELD_NAME", "text")`
Custom fields for indexing in ElasticsearchDocumentStore (#297) 2020-08-10 05:34:39 -04:00			`NAME_FIELD_NAME = os.getenv("NAME_FIELD_NAME", "name")`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`SEARCH_FIELD_NAME = os.getenv("SEARCH_FIELD_NAME", "text")`
Make FAQ question field customizable (#146) 2020-06-11 12:36:19 +02:00			`FAQ_QUESTION_FIELD_NAME = os.getenv("FAQ_QUESTION_FIELD_NAME", "question")`
Fixing defaults in config for rest_api (#583) * Fixing defaults configs for rest_apis * Reverting change to VALID_LANGUAGES * Casting EMBEDDING_DIM as int 2020-11-16 06:51:27 +01:00			`EMBEDDING_FIELD_NAME = os.getenv("EMBEDDING_FIELD_NAME", "embedding")`
			`EMBEDDING_DIM = int(os.getenv("EMBEDDING_DIM", 768))`
Add create_index and similarity metric to api config (#493) * make creation of label index optional * add params for rest api * reset tutorial flag 2020-10-15 18:41:36 +02:00			`VECTOR_SIMILARITY_METRIC = os.getenv("VECTOR_SIMILARITY_METRIC", "dot_product")`
			`CREATE_INDEX = os.getenv("CREATE_INDEX", "True").lower() == "true"`
Rename label id field for elastic & add UPDATE_EXISTING_DOCUMENTS to API config (#728) * rename label id field for elastic * add UPDATE_EXISTING_DOCUMENTS param to API config 2021-01-12 13:00:56 +01:00			`UPDATE_EXISTING_DOCUMENTS = os.getenv("UPDATE_EXISTING_DOCUMENTS", "False").lower() == "true"`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00
			`# Reader`
Change default reader for REST API 2020-07-16 10:02:06 +02:00			`READER_MODEL_PATH = os.getenv("READER_MODEL_PATH", "deepset/roberta-base-squad2")`
Add dummy retriever for benchmarking / reader-only settings (#235) 2020-07-15 17:22:17 +02:00			`READER_TYPE = os.getenv("READER_TYPE", "FARMReader") # alternative: 'TransformersReader'`
Adjust Docker and REST API to allow TransformsReader Class (#180) 2020-07-07 16:25:36 +02:00			`READER_TOKENIZER = os.getenv("READER_TOKENIZER", None)`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`CONTEXT_WINDOW_SIZE = int(os.getenv("CONTEXT_WINDOW_SIZE", 500))`
Fix UI when API returns fewer answers than expected (#828) * fix ui for few answers from api. add top_k_per_sample env * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2021-02-15 14:27:17 +01:00			`DEFAULT_TOP_K_READER = int(os.getenv("DEFAULT_TOP_K_READER", 5)) # How many answers to return in total`
Revert TOP_K_PER_CANDIDATE value to 3 2021-02-15 14:30:04 +01:00			`TOP_K_PER_CANDIDATE = int(os.getenv("TOP_K_PER_CANDIDATE", 3)) # How many answers can come from one indexed doc`
Fix UI when API returns fewer answers than expected (#828) * fix ui for few answers from api. add top_k_per_sample env * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2021-02-15 14:27:17 +01:00			`TOP_K_PER_SAMPLE = int(os.getenv("TOP_K_PER_SAMPLE", 1)) # How many answers can come from one passage that the reader processes at once (i.e. text of max_seq_len from the doc)`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`NO_ANS_BOOST = int(os.getenv("NO_ANS_BOOST", -10))`
Allow setting return_no_answers for TransformersReader in REST API (SQuAD 1.0 format) (#609) * Update config.py * new option Allow a new option from the settings : tell is a reader model can return a "no answer" like SQuAD2.0 models, or if it's only a SQuAD1.0-like model, always giving an answer. 2020-11-20 14:09:39 +01:00			`READER_CAN_HAVE_NO_ANSWER = os.getenv("READER_CAN_HAVE_NO_ANSWER", "True").lower() == "true"`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`DOC_STRIDE = int(os.getenv("DOC_STRIDE", 128))`
			`MAX_SEQ_LEN = int(os.getenv("MAX_SEQ_LEN", 256))`

			`# Retriever`
Add dummy retriever for benchmarking / reader-only settings (#235) 2020-07-15 17:22:17 +02:00			`RETRIEVER_TYPE = os.getenv("RETRIEVER_TYPE", "ElasticsearchRetriever") # alternatives: 'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None`
Update GPU docker & fix race condition with multiple workers (#436) * fix gpu CMD and set tag to latest * udpate dockerfiles. resolve race condition of index creation with multiple workers * update dockerfiles for preload. remove try catch for elastic index creation * add back try/catch. disable multiproc in default config to comply with --preload of gunicorn * change to pip3 for GPU dockerfile * remove --preload for gpu 2020-09-29 21:12:44 +02:00			`DEFAULT_TOP_K_RETRIEVER = int(os.getenv("DEFAULT_TOP_K_RETRIEVER", 5))`
Exclude embedding fields from the REST API (#390) 2020-09-17 14:37:01 +02:00			`EXCLUDE_META_DATA_FIELDS = os.getenv("EXCLUDE_META_DATA_FIELDS", f"['question_emb','embedding']")`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`if EXCLUDE_META_DATA_FIELDS:`
			`EXCLUDE_META_DATA_FIELDS = ast.literal_eval(EXCLUDE_META_DATA_FIELDS)`
Fixing defaults in config for rest_api (#583) * Fixing defaults configs for rest_apis * Reverting change to VALID_LANGUAGES * Casting EMBEDDING_DIM as int 2020-11-16 06:51:27 +01:00			`EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "deepset/sentence_bert")`
Add EMBEDDING_MODEL_FORMAT in API config (#152) 2020-06-16 13:58:30 +02:00			`EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")`
Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00
Add API endpoint to upload files (#154) 2020-06-17 16:28:26 +02:00			`# File uploads`
			`FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")`
			`REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"`
			`VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)`
			`if VALID_LANGUAGES:`
			`VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)`

Fix file upload API (#808) 2021-02-05 12:17:38 +01:00			`# Preprocessing`
			`REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"`
			`REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"`
			`REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"`
			`SPLIT_BY = os.getenv("SPLIT_BY", "word")`
			`SPLIT_LENGTH = os.getenv("SPLIT_LENGTH", 1_000)`
			`SPLIT_OVERLAP = os.getenv("SPLIT_OVERLAP", None)`
			`SPLIT_RESPECT_SENTENCE_BOUNDARY = os.getenv("SPLIT_RESPECT_SENTENCE_BOUNDARY", True)`


Modularize API components (#55) This PR makes the REST API module more cohesive and maintainable by splitting into separate modules for controllers, routes, config, and the FastAPI app. 2020-04-15 14:04:30 +02:00			`# Monitoring`
Allow configuration of log level in REST API via ENV (#541) * configure log level via env. adjust debug messages * pin faiss version 2020-11-04 09:54:02 +01:00			`LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")`
Make APM optional (#65) 2020-04-17 15:29:19 +02:00			`APM_SERVER = os.getenv("APM_SERVER", None)`
			`APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")`