mirror of
https://github.com/datahub-project/datahub.git
synced 2025-06-27 05:03:31 +00:00
204 lines
8.7 KiB
Bash
Executable File
204 lines
8.7 KiB
Bash
Executable File
#!/bin/bash
#
# Prepares the Elasticsearch resources used for DataHub usage events.
#
# This first section only derives configuration from environment variables:
#   ELASTICSEARCH_HOST, ELASTICSEARCH_PORT  - where Elasticsearch listens
#   ELASTICSEARCH_USE_SSL                   - "true" selects https, anything else http
#   ELASTICSEARCH_AUTH_HEADER               - complete header to send; if empty it is
#                                             built from the username/password below
#   ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD
#   ELASTICSEARCH_INSECURE                  - "true" adds --insecure to curl (default false)
#   INDEX_PREFIX                            - optional prefix; "<prefix>_" is prepended to names
#   DATAHUB_ANALYTICS_ENABLED               - default true
#   USE_AWS_ELASTICSEARCH                   - default false
#   DUE_SHARDS, DUE_REPLICAS                - default 1; substituted into index definitions

set -e

# Defaults for optional settings (applied only when unset or empty).
: "${DATAHUB_ANALYTICS_ENABLED:=true}"
: "${USE_AWS_ELASTICSEARCH:=false}"
: "${ELASTICSEARCH_INSECURE:=false}"
: "${DUE_SHARDS:=1}"
: "${DUE_REPLICAS:=1}"

# protocol: http or https?
if [[ "$ELASTICSEARCH_USE_SSL" == true ]]; then
  ELASTICSEARCH_PROTOCOL=https
else
  ELASTICSEARCH_PROTOCOL=http
fi
echo "going to use protocol: $ELASTICSEARCH_PROTOCOL"

# Elasticsearch URL to be suffixed with a resource address
ELASTICSEARCH_URL="$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT"

# Print the HTTP Basic "Authorization" header for the given credentials.
# Arguments: $1 - user name, $2 - password
# Outputs:   the header on stdout, without a trailing newline
# `printf '%s'` is used rather than `echo -e` so that backslash sequences in the
# credentials are encoded verbatim, and line wraps are stripped with `tr`
# instead of relying on the GNU-only `base64 --wrap` option.
function basic_auth_header {
  local token
  token=$(printf '%s' "$1:$2" | base64 | tr -d '\n')
  printf 'Authorization:Basic %s' "$token"
}

# set auth header if none is given
if [[ -z "$ELASTICSEARCH_AUTH_HEADER" ]]; then
  if [[ -n "$ELASTICSEARCH_USERNAME" ]]; then
    # no auth header given, but username is defined -> use it to create the auth header
    ELASTICSEARCH_AUTH_HEADER=$(basic_auth_header "$ELASTICSEARCH_USERNAME" "$ELASTICSEARCH_PASSWORD")
    echo "going to use elastic headers based on username and password"
  else
    # no auth header or username given -> send a neutral header so that
    # CURL_ARGS always carries exactly one --header option
    ELASTICSEARCH_AUTH_HEADER="Accept: */*"
    echo "going to use default elastic headers"
  fi
fi

# will be using this for all curl communication with Elasticsearch:
CURL_ARGS=(
  --silent
  --header "$ELASTICSEARCH_AUTH_HEADER"
)
# ... also optionally use --insecure
if [[ "$ELASTICSEARCH_INSECURE" == true ]]; then
  CURL_ARGS+=(--insecure)
fi

# index prefix used throughout the script
if [[ -z "$INDEX_PREFIX" ]]; then
  PREFIX=''
  echo "not using any prefix"
else
  PREFIX="${INDEX_PREFIX}_"
  echo "going to use prefix: '$PREFIX'"
fi

# path where index definitions are stored
INDEX_DEFINITIONS_ROOT=/index/usage-event

# Check Elasticsearch for the given index/resource and create it from a
# definition file when it does not exist yet (HTTP 404).
#
# Globals:
#   CURL_ARGS, ELASTICSEARCH_URL, INDEX_DEFINITIONS_ROOT  (read)
#   PREFIX, DUE_SHARDS, DUE_REPLICAS  (read; substituted for the placeholders of
#                                      the same name in the definition file)
#   USE_AWS_ELASTICSEARCH             (read; only for the misconfiguration hint)
#   RESOURCE_STATUS                   (written; HTTP code of the existence check,
#                                      left global so callers can inspect it)
# Arguments:
#   $1 - resource address, relative to ELASTICSEARCH_URL
#   $2 - name of the definition file below INDEX_DEFINITIONS_ROOT
# Outputs:
#   progress messages, the rendered definition and the PUT response on stdout
# Returns:
#   0 if the resource already exists; the exit status of the creating curl call
#   if it had to be created. Exits the whole script with status 1 on 403 and on
#   any response code other than 200/404.
function create_if_not_exists {
  local resource_address="$1"
  local definition_name="$2"
  local rendered_definition
  local put_status=0

  # query ES to see if the resource already exists
  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}\n" "$ELASTICSEARCH_URL/$resource_address")
  echo -e "\n>>> GET $resource_address response code is $RESOURCE_STATUS"

  case "$RESOURCE_STATUS" in
    200)
      # resource already exists -> nothing to do
      echo -e ">>> $resource_address already exists ✓"
      ;;

    404)
      # resource doesn't exist -> need to create it
      echo -e ">>> creating $resource_address because it doesn't exist ..."
      # Render the definition into a fresh private temp file. The file is
      # overwritten rather than appended to, so that a leftover from a previous
      # run can never be concatenated into the request body.
      rendered_definition=$(mktemp) || {
        echo -e ">>> failed to create a temporary file for $resource_address ! -> exiting"
        exit 1
      }
      # replace all occurrences of the PREFIX / DUE_SHARDS / DUE_REPLICAS
      # placeholders within the file with the actual values; `tee` also echoes
      # the rendered definition to stdout for the log
      sed -e "s/PREFIX/$PREFIX/g" \
          -e "s/DUE_SHARDS/$DUE_SHARDS/g" \
          -e "s/DUE_REPLICAS/$DUE_REPLICAS/g" \
          "$INDEX_DEFINITIONS_ROOT/$definition_name" \
        | tee "$rendered_definition"
      curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$resource_address" -H 'Content-Type: application/json' --data "@$rendered_definition" \
        || put_status=$?
      rm -f -- "$rendered_definition"
      # report the outcome of the PUT, not of the cleanup
      return "$put_status"
      ;;

    403)
      # probably authorization fail
      echo -e ">>> forbidden access to $resource_address ! -> exiting"
      exit 1
      ;;

    *)
      # when `USE_AWS_ELASTICSEARCH` was forgotten to be set to `true` when running against AWS ES OSS,
      # this script will use wrong paths (e.g. `_ilm/policy/` instead of AWS-compatible `_opendistro/_ism/policies/`)
      # and the ES endpoint will return `401 Unauthorized` or `405 Method Not Allowed`
      # let's use this as chance to point that wrong config might be used!
      if [[ "$RESOURCE_STATUS" == 401 || "$RESOURCE_STATUS" == 405 ]]; then
        if [[ "$USE_AWS_ELASTICSEARCH" == false && "$ELASTICSEARCH_URL" == *"amazonaws"* ]]; then
          echo "... looks like AWS OpenSearch is used; please set USE_AWS_ELASTICSEARCH env value to true"
        fi
      fi

      echo -e ">>> failed to GET $resource_address ! -> exiting"
      exit 1
      ;;
  esac
}

# Update an already existing ISM policy with the bundled definition.
# Non-fatal if the policy cannot be read or updated.
#
# Globals:
#   CURL_ARGS, ELASTICSEARCH_URL, INDEX_DEFINITIONS_ROOT  (read)
#   PREFIX, DUE_SHARDS, DUE_REPLICAS  (read; substituted for the placeholders of
#                                      the same name in the definition file)
#   RESOURCE_STATUS                   (written; HTTP code of the last request made)
# Arguments:
#   $1 - policy address, relative to ELASTICSEARCH_URL
#   $2 - name of the definition file below INDEX_DEFINITIONS_ROOT
# Outputs:
#   progress messages and the rendered definition on stdout
# Returns:
#   always 0
function update_ism_policy {
  local resource_address="$1"
  local definition_name="$2"
  local work_dir current_policy new_policy response
  local seq_no primary_term

  # Private scratch directory: nothing left behind by an earlier run can leak
  # into the request body, and the files are removed again before returning.
  work_dir=$(mktemp -d) || {
    echo -e ">>> Could not create a temporary directory for $resource_address. Ignoring."
    return 0
  }
  current_policy="$work_dir/current-$definition_name"
  new_policy="$work_dir/new-$definition_name"
  response="$work_dir/response-$definition_name"

  # Get existing policy
  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -o "$current_policy" -w "%{http_code}\n" "$ELASTICSEARCH_URL/$resource_address")
  echo -e "\n>>> GET $resource_address response code is $RESOURCE_STATUS"

  if [[ "$RESOURCE_STATUS" != 200 ]]; then
    echo -e ">>> Could not get ISM policy $resource_address. Ignoring."
    rm -rf -- "$work_dir"
    return 0
  fi

  # The PUT below is made conditional on the sequence number and primary term
  # that were just read from the stored policy.
  seq_no=$(jq -r '._seq_no' < "$current_policy")
  primary_term=$(jq -r '._primary_term' < "$current_policy")

  # Render the new definition (overwriting, never appending); `tee` also echoes
  # it to stdout for the log.
  sed -e "s/PREFIX/$PREFIX/g" \
      -e "s/DUE_SHARDS/$DUE_SHARDS/g" \
      -e "s/DUE_REPLICAS/$DUE_REPLICAS/g" \
      "$INDEX_DEFINITIONS_ROOT/$definition_name" \
    | tee "$new_policy"
  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$resource_address?if_seq_no=$seq_no&if_primary_term=$primary_term" \
    -H 'Content-Type: application/json' -w "%{http_code}\n" -o "$response" --data "@$new_policy")
  echo -e "\n>>> PUT $resource_address response code is $RESOURCE_STATUS"

  # The response file is about to be deleted, so surface it when the update
  # was not accepted.
  if [[ "$RESOURCE_STATUS" != 200 && -s "$response" ]]; then
    echo ">>> response to the rejected ISM policy update:"
    cat -- "$response"
    echo
  fi

  rm -rf -- "$work_dir"
  return 0
}

# Provision the DataHub usage-event resources on a regular (non-AWS)
# Elasticsearch by handing each one to create_if_not_exists, in order.
create_datahub_usage_event_datastream() {
  # Resource addresses; definitions[] below holds the matching definition name.
  local -a addresses=(
    # 1. ILM policy
    "_ilm/policy/${PREFIX}datahub_usage_event_policy"
    # 2. index template
    "_index_template/${PREFIX}datahub_usage_event_index_template"
    # 3. the data stream itself: an indexing request would create it as well,
    #    but it is not queryable before creation, causing GMS to throw exceptions
    "_data_stream/${PREFIX}datahub_usage_event"
  )
  local -a definitions=(
    policy.json
    index_template.json
    "datahub_usage_event"
  )
  local idx

  for idx in "${!addresses[@]}"; do
    create_if_not_exists "${addresses[idx]}" "${definitions[idx]}"
  done
}

# Create the DataHub usage-event resources for ES OSS (AWS).
#
# Globals:
#   CURL_ARGS, ELASTICSEARCH_URL, PREFIX  (read)
#   RESOURCE_STATUS                       (read; set by create_if_not_exists)
# Outputs:
#   progress messages on stdout (plus whatever the called helpers print)
# Returns:
#   the status of the final create_if_not_exists call
function create_datahub_usage_event_aws_elasticsearch() {
  local policy_address="_opendistro/_ism/policies/${PREFIX}datahub_usage_event_policy"
  local usage_event_url="$ELASTICSEARCH_URL/${PREFIX}datahub_usage_event"
  # (note that AWS *rollover* indices need to use `^.*-\d+$` naming pattern)
  # -> https://aws.amazon.com/premiumsupport/knowledge-center/opensearch-failed-rollover-index/
  local index_suffix="000001"
  local usage_event_status usage_event_definition

  # AWS env requires creation of three resources for Datahub usage events:
  # 1. ISM policy
  create_if_not_exists "$policy_address" aws_es_ism_policy.json

  # 1.1 ISM policy update if it already existed
  if [[ "$RESOURCE_STATUS" == 200 ]]; then
    update_ism_policy "$policy_address" aws_es_ism_policy.json
  fi

  # 2. index template
  create_if_not_exists "_template/${PREFIX}datahub_usage_event_index_template" aws_es_index_template.json

  # 3. event index datahub_usage_event-000001
  # ... but first check whether `datahub_usage_event` wasn't already autocreated by GMS before `datahub_usage_event-000001`
  # (as is common case when this script was initially run without properly setting `USE_AWS_ELASTICSEARCH` to `true`)
  # -> https://github.com/datahub-project/datahub/issues/5376
  usage_event_status=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}\n" "$usage_event_url")
  if [[ "$usage_event_status" == 200 ]]; then
    usage_event_definition=$(curl "${CURL_ARGS[@]}" "$usage_event_url")
    # the definition is expected to contain "datahub_usage_event-000001" string
    if [[ -z "$usage_event_definition" ]]; then
      # An empty body means the definition could not be fetched; that is no
      # proof that the index is invalid, so it must not be dropped.
      echo -e "\n>>> could not read the definition of datahub_usage_event; leaving it untouched"
    elif [[ "$usage_event_definition" != *"datahub_usage_event-"* ]]; then
      # ... if it doesn't, we need to drop it
      echo -e "\n>>> deleting invalid datahub_usage_event ..."
      curl "${CURL_ARGS[@]}" -XDELETE "$usage_event_url"
      # ... and then recreate it below
    fi
  fi

  # ... now we are safe to create the index
  create_if_not_exists "${PREFIX}datahub_usage_event-$index_suffix" aws_es_index.json
}

# Entry point of the script.
#
# With analytics enabled, the full set of usage-event resources is created via
# create_datahub_usage_event_datastream (regular Elasticsearch) or
# create_datahub_usage_event_aws_elasticsearch (USE_AWS_ELASTICSEARCH other than
# "false"). With analytics disabled, only a plain `${PREFIX}datahub_usage_event`
# index is created when it is missing.
#
# Globals:
#   DATAHUB_ANALYTICS_ENABLED, USE_AWS_ELASTICSEARCH  (read)
#   CURL_ARGS, ELASTICSEARCH_URL, PREFIX              (read)
# Returns:
#   0 on success; exits the script with status 1 when resource creation fails
#   or when Elasticsearch answers 403 Forbidden.
function setup_datahub_usage_event {
  local index_name="${PREFIX}datahub_usage_event"
  local response_code

  if [[ "$DATAHUB_ANALYTICS_ENABLED" == true ]]; then
    echo -e "\n datahub_analytics_enabled: $DATAHUB_ANALYTICS_ENABLED"
    if [[ "$USE_AWS_ELASTICSEARCH" == false ]]; then
      create_datahub_usage_event_datastream || exit 1
    else
      create_datahub_usage_event_aws_elasticsearch || exit 1
    fi
    return 0
  fi

  echo -e "\ndatahub_analytics_enabled: $DATAHUB_ANALYTICS_ENABLED"
  response_code=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}" "$ELASTICSEARCH_URL/_cat/indices/$index_name")
  case "$response_code" in
    404)
      echo -e "\ncreating $index_name"
      curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$index_name"
      ;;
    200)
      echo -e "\n$index_name exists"
      ;;
    403)
      # The message announces an exit, so actually stop with a failure status
      # instead of finishing "successfully".
      echo -e "Forbidden so exiting"
      exit 1
      ;;
    *)
      # Any other answer is tolerated, but no longer passes unnoticed.
      echo "unexpected response code '$response_code' while checking $index_name; nothing created" >&2
      ;;
  esac
}

setup_datahub_usage_event