# datahub/docker/elasticsearch-setup/create-indices.sh
#
# 204 lines
# 8.7 KiB
# Bash
# Executable File

#!/bin/bash
set -e

# Optional settings: a value that is unset or empty falls back to the default shown here.
DATAHUB_ANALYTICS_ENABLED="${DATAHUB_ANALYTICS_ENABLED:-true}"
USE_AWS_ELASTICSEARCH="${USE_AWS_ELASTICSEARCH:-false}"
ELASTICSEARCH_INSECURE="${ELASTICSEARCH_INSECURE:-false}"
DUE_SHARDS="${DUE_SHARDS:-1}"
DUE_REPLICAS="${DUE_REPLICAS:-1}"

# Scheme selection: only the exact value `true` in ELASTICSEARCH_USE_SSL switches to https,
# anything else (including unset) means plain http.
case "$ELASTICSEARCH_USE_SSL" in
  true) ELASTICSEARCH_PROTOCOL=https ;;
  *)    ELASTICSEARCH_PROTOCOL=http ;;
esac
echo -e "going to use protocol: $ELASTICSEARCH_PROTOCOL"

# Base URL of the cluster; a resource address is appended to it for every request.
ELASTICSEARCH_URL="${ELASTICSEARCH_PROTOCOL}://${ELASTICSEARCH_HOST}:${ELASTICSEARCH_PORT}"
# set auth header if none is given

# Print the base64 token for HTTP Basic auth built from a username ($1) and a password ($2).
# The credentials are written with printf '%s' rather than `echo -ne`, so backslash sequences
# that happen to be part of them (e.g. a password containing a literal `\n` or `\t`) are
# encoded byte-for-byte instead of being expanded into control characters.
# `--wrap 0` keeps the token on a single line regardless of its length.
function basic_auth_token {
  printf '%s' "$1:$2" | base64 --wrap 0
}

if [[ -z $ELASTICSEARCH_AUTH_HEADER ]]; then
  if [[ -n $ELASTICSEARCH_USERNAME ]]; then
    # no auth header given, but username is defined -> use it to create the auth header
    AUTH_TOKEN=$(basic_auth_token "$ELASTICSEARCH_USERNAME" "$ELASTICSEARCH_PASSWORD")
    ELASTICSEARCH_AUTH_HEADER="Authorization:Basic $AUTH_TOKEN"
    echo -e "going to use elastic headers based on username and password"
  else
    # no auth header or username given -> use default auth header
    ELASTICSEARCH_AUTH_HEADER="Accept: */*"
    echo -e "going to use default elastic headers"
  fi
fi
# Options passed to every curl call made against Elasticsearch:
# quiet output plus the header held in ELASTICSEARCH_AUTH_HEADER.
CURL_ARGS=(--silent --header "$ELASTICSEARCH_AUTH_HEADER")
# --insecure is appended only when ELASTICSEARCH_INSECURE is exactly `true`.
case "$ELASTICSEARCH_INSECURE" in
  true) CURL_ARGS+=(--insecure) ;;
esac
# PREFIX is prepended to the resource names used throughout the script:
# "<INDEX_PREFIX>_" when INDEX_PREFIX is non-empty, otherwise the empty string.
if [[ -n "$INDEX_PREFIX" ]]; then
  PREFIX="${INDEX_PREFIX}_"
  echo -e "going to use prefix: '$PREFIX'"
else
  PREFIX=''
  echo -e "not using any prefix"
fi
# directory that holds the index definition files
INDEX_DEFINITIONS_ROOT=/index/usage-event
# check Elasticsearch for given index/resource (first argument)
# if it doesn't exist (http code 404), use the given file (second argument) to create it
#
# Arguments:
#   $1 - resource address, appended to ELASTICSEARCH_URL
#   $2 - name of the definition file inside INDEX_DEFINITIONS_ROOT
# Globals read:
#   CURL_ARGS, ELASTICSEARCH_URL, INDEX_DEFINITIONS_ROOT, PREFIX, DUE_SHARDS, DUE_REPLICAS,
#   USE_AWS_ELASTICSEARCH
# Globals written:
#   RESOURCE_ADDRESS, RESOURCE_DEFINITION_NAME, RESOURCE_STATUS
#   (RESOURCE_STATUS holds the http code of the GET probe and is deliberately not local:
#   the AWS setup function reads it right after calling this function)
# Outcome by GET response code:
#   200           -> nothing to do, returns 0
#   404           -> renders the definition, PUTs it, returns the exit status of that curl call
#   anything else -> prints a diagnostic and exits the script with status 1
function create_if_not_exists {
  RESOURCE_ADDRESS="$1"
  RESOURCE_DEFINITION_NAME="$2"

  # query ES to see if the resource already exists
  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}\n" "$ELASTICSEARCH_URL/$RESOURCE_ADDRESS")
  echo -e "\n>>> GET $RESOURCE_ADDRESS response code is $RESOURCE_STATUS"

  case "$RESOURCE_STATUS" in
    200)
      # resource already exists -> nothing to do
      echo -e ">>> $RESOURCE_ADDRESS already exists ✓"
      ;;
    404)
      # resource doesn't exist -> need to create it
      echo -e ">>> creating $RESOURCE_ADDRESS because it doesn't exist ..."
      local rendered_definition prefix_replacement put_status=0
      # Render into a fresh private file. A fixed, appended-to path under /tmp would make a
      # re-run (or a second resource using the same definition name) send several
      # concatenated documents as the request body.
      if ! rendered_definition=$(mktemp); then
        echo -e ">>> failed to create a temporary file for $RESOURCE_ADDRESS ! -> exiting"
        exit 1
      fi
      # `&`, `/` and `\` are special on the replacement side of sed's s command
      prefix_replacement=$(printf '%s' "$PREFIX" | sed -e 's|[\\/&]|\\&|g')
      # Replace the `PREFIX`, `DUE_SHARDS` and `DUE_REPLICAS` placeholders of the definition
      # with their actual values; tee also echoes the rendered definition to stdout.
      # NOTE: when the definition file cannot be read, sed emits nothing and the PUT below
      # is sent with an empty body.
      sed -e "s/PREFIX/$prefix_replacement/g" \
          -e "s/DUE_SHARDS/$DUE_SHARDS/g" \
          -e "s/DUE_REPLICAS/$DUE_REPLICAS/g" \
          "$INDEX_DEFINITIONS_ROOT/$RESOURCE_DEFINITION_NAME" \
        | tee "$rendered_definition"
      curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$RESOURCE_ADDRESS" \
        -H 'Content-Type: application/json' --data "@$rendered_definition" || put_status=$?
      rm -f -- "$rendered_definition"
      return "$put_status"
      ;;
    403)
      # probably authorization fail
      echo -e ">>> forbidden access to $RESOURCE_ADDRESS ! -> exiting"
      exit 1
      ;;
    *)
      # when `USE_AWS_ELASTICSEARCH` was forgotten to be set to `true` when running against AWS ES OSS,
      # this script will use wrong paths (e.g. `_ilm/policy/` instead of AWS-compatible `_opendistro/_ism/policies/`)
      # and the ES endpoint will return `401 Unauthorized` or `405 Method Not Allowed`
      # let's use this as chance to point that wrong config might be used!
      if [[ "$RESOURCE_STATUS" == 401 || "$RESOURCE_STATUS" == 405 ]]; then
        if [[ $USE_AWS_ELASTICSEARCH == false ]] && [[ $ELASTICSEARCH_URL == *"amazonaws"* ]]; then
          echo "... looks like AWS OpenSearch is used; please set USE_AWS_ELASTICSEARCH env value to true"
        fi
      fi
      echo -e ">>> failed to GET $RESOURCE_ADDRESS ! -> exiting"
      exit 1
      ;;
  esac
}
# Update ISM policy. Non-fatal if policy cannot be updated.
#
# Arguments:
#   $1 - resource address of the policy, appended to ELASTICSEARCH_URL
#   $2 - name of the policy definition file inside INDEX_DEFINITIONS_ROOT
# Globals read:
#   CURL_ARGS, ELASTICSEARCH_URL, INDEX_DEFINITIONS_ROOT, PREFIX, DUE_SHARDS, DUE_REPLICAS
# Globals written:
#   RESOURCE_ADDRESS, RESOURCE_DEFINITION_NAME, RESOURCE_STATUS (http code of the last request)
# Returns:
#   always 0; every failure is reported on stdout and then ignored
function update_ism_policy {
  RESOURCE_ADDRESS="$1"
  RESOURCE_DEFINITION_NAME="$2"

  # All intermediate files live in a private scratch directory that is removed before
  # returning, so nothing from an earlier run can leak into this one.
  local scratch_dir
  if ! scratch_dir=$(mktemp -d); then
    echo -e ">>> Could not create a scratch directory for $RESOURCE_ADDRESS. Ignoring."
    return 0
  fi
  local current_policy="$scratch_dir/current-policy"
  local new_policy="$scratch_dir/new-policy"
  local put_response="$scratch_dir/response"
  local seq_no primary_term prefix_replacement

  # Get existing policy
  # (`|| true` here and below: a failing command must not abort the script under `set -e`,
  # because this function is documented as non-fatal)
  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -o "$current_policy" -w "%{http_code}\n" "$ELASTICSEARCH_URL/$RESOURCE_ADDRESS") || true
  echo -e "\n>>> GET $RESOURCE_ADDRESS response code is $RESOURCE_STATUS"
  if [[ "$RESOURCE_STATUS" != 200 ]]; then
    echo -e ">>> Could not get ISM policy $RESOURCE_ADDRESS. Ignoring."
    rm -rf -- "$scratch_dir"
    return 0
  fi

  # The PUT below is conditional on these two values taken from the current policy
  seq_no=$(jq -r '._seq_no' "$current_policy") || true
  primary_term=$(jq -r '._primary_term' "$current_policy") || true
  if [[ ! "$seq_no" =~ ^[0-9]+$ || ! "$primary_term" =~ ^[0-9]+$ ]]; then
    echo -e ">>> Could not read _seq_no/_primary_term of ISM policy $RESOURCE_ADDRESS. Ignoring."
    rm -rf -- "$scratch_dir"
    return 0
  fi

  # `&`, `/` and `\` are special on the replacement side of sed's s command
  prefix_replacement=$(printf '%s' "$PREFIX" | sed -e 's|[\\/&]|\\&|g')
  # Render the policy definition (tee also echoes it to stdout). The file is written
  # from scratch; appending would make repeated runs upload concatenated documents.
  sed -e "s/PREFIX/$prefix_replacement/g" \
      -e "s/DUE_SHARDS/$DUE_SHARDS/g" \
      -e "s/DUE_REPLICAS/$DUE_REPLICAS/g" \
      "$INDEX_DEFINITIONS_ROOT/$RESOURCE_DEFINITION_NAME" \
    | tee "$new_policy"

  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$RESOURCE_ADDRESS?if_seq_no=$seq_no&if_primary_term=$primary_term" \
    -H 'Content-Type: application/json' -w "%{http_code}\n" -o "$put_response" --data "@$new_policy") || true
  echo -e "\n>>> PUT $RESOURCE_ADDRESS response code is $RESOURCE_STATUS"
  if [[ "$RESOURCE_STATUS" != 2?? ]]; then
    # still non-fatal, but show what the server answered before the response file is removed
    echo -e ">>> Could not update ISM policy $RESOURCE_ADDRESS. Ignoring. Response was:"
    if [[ -s "$put_response" ]]; then
      cat -- "$put_response"
      echo
    fi
  fi

  rm -rf -- "$scratch_dir"
  return 0
}
# create indices for ES (non-AWS)
# Makes sure the three DataHub usage-event resources exist, one create_if_not_exists call each.
function create_datahub_usage_event_datastream() {
  # common stem of all three resource names
  local stem="${PREFIX}datahub_usage_event"

  # ILM policy
  create_if_not_exists "_ilm/policy/${stem}_policy" policy.json
  # index template
  create_if_not_exists "_index_template/${stem}_index_template" index_template.json
  # data stream: an indexing request would create it as well, but it cannot be queried
  # before it exists, which makes GMS throw exceptions -> create it up front
  create_if_not_exists "_data_stream/${stem}" "datahub_usage_event"
}
# create indices for ES OSS (AWS)
#
# Globals read:
#   PREFIX, CURL_ARGS, ELASTICSEARCH_URL, RESOURCE_STATUS (as left behind by create_if_not_exists)
# Calls:
#   create_if_not_exists, update_ism_policy
# Status values are compared inside [[ ]] and quoted, so an empty status (curl printed
# nothing) is simply "not 200" instead of a `[: -eq: unary operator expected` error.
function create_datahub_usage_event_aws_elasticsearch() {
  local policy_address="_opendistro/_ism/policies/${PREFIX}datahub_usage_event_policy"
  local usage_event="${PREFIX}datahub_usage_event"
  local index_suffix="000001"
  local usage_event_status usage_event_definition

  # AWS env requires creation of three resources for Datahub usage events:
  #   1. ISM policy
  create_if_not_exists "$policy_address" aws_es_ism_policy.json
  #   1.1 ISM policy update if it already existed
  if [[ "$RESOURCE_STATUS" == 200 ]]; then
    update_ism_policy "$policy_address" aws_es_ism_policy.json
  fi

  #   2. index template
  create_if_not_exists "_template/${PREFIX}datahub_usage_event_index_template" aws_es_index_template.json

  #   3. event index datahub_usage_event-000001
  #     (note that AWS *rollover* indices need to use `^.*-\d+$` naming pattern)
  #     -> https://aws.amazon.com/premiumsupport/knowledge-center/opensearch-failed-rollover-index/
  #   ... but first check whether `datahub_usage_event` wasn't already autocreated by GMS before `datahub_usage_event-000001`
  #   (as is common case when this script was initially run without properly setting `USE_AWS_ELASTICSEARCH` to `true`)
  #   -> https://github.com/datahub-project/datahub/issues/5376
  usage_event_status=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}\n" "$ELASTICSEARCH_URL/$usage_event")
  if [[ "$usage_event_status" == 200 ]]; then
    usage_event_definition=$(curl "${CURL_ARGS[@]}" "$ELASTICSEARCH_URL/$usage_event")
    # the definition is expected to contain "datahub_usage_event-000001" string
    if [[ "$usage_event_definition" != *"datahub_usage_event-"* ]]; then
      # ... if it doesn't, we need to drop it
      echo -e "\n>>> deleting invalid datahub_usage_event ..."
      curl "${CURL_ARGS[@]}" -XDELETE "$ELASTICSEARCH_URL/$usage_event"
      # ... and then recreate it below
    fi
  fi

  #   ... now we are safe to create the index
  create_if_not_exists "${usage_event}-$index_suffix" aws_es_index.json
}
# Analytics disabled: only make sure the plain `${PREFIX}datahub_usage_event` index exists.
#
# Globals read:    CURL_ARGS, ELASTICSEARCH_URL, PREFIX
# Globals written: DATAHUB_USAGE_EVENT_INDEX_RESPONSE_CODE (http code of the existence check)
# Outcome by response code:
#   404   -> the index is created with a bare PUT
#   200   -> nothing to do
#   403   -> exits the script with status 1 (the message always announced an exit, but the
#            script used to carry on and finish successfully)
#   other -> reported and skipped; this stays non-fatal
function create_plain_datahub_usage_event_index {
  local index_name="${PREFIX}datahub_usage_event"
  DATAHUB_USAGE_EVENT_INDEX_RESPONSE_CODE=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}" "$ELASTICSEARCH_URL/_cat/indices/$index_name")
  case "$DATAHUB_USAGE_EVENT_INDEX_RESPONSE_CODE" in
    404)
      echo -e "\ncreating $index_name"
      curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$index_name"
      ;;
    200)
      echo -e "\n$index_name exists"
      ;;
    403)
      echo -e "Forbidden so exiting"
      exit 1
      ;;
    *)
      echo -e "\nunexpected response code '$DATAHUB_USAGE_EVENT_INDEX_RESPONSE_CODE' while checking $index_name -> skipping index creation"
      ;;
  esac
}

# Entry point: pick the setup flavour from DATAHUB_ANALYTICS_ENABLED / USE_AWS_ELASTICSEARCH.
if [[ $DATAHUB_ANALYTICS_ENABLED == true ]]; then
  echo -e "\n datahub_analytics_enabled: $DATAHUB_ANALYTICS_ENABLED"
  if [[ $USE_AWS_ELASTICSEARCH == false ]]; then
    create_datahub_usage_event_datastream || exit 1
  else
    create_datahub_usage_event_aws_elasticsearch || exit 1
  fi
else
  echo -e "\ndatahub_analytics_enabled: $DATAHUB_ANALYTICS_ENABLED"
  # called without `|| exit 1` on purpose: that keeps `set -e` in effect inside the function
  create_plain_datahub_usage_event_index
fi