nomad/enos/modules/test_cluster_health/scripts/allocs.sh

#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# Strict mode: exit on an unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (pipefail).
set -euo pipefail

# Print an error message followed by a JSON dump of every allocation, then
# abort the script.
# Arguments: $1 - human-readable description of the failure
# Outputs:   the message and the `nomad alloc status -json` listing on stdout
# Returns:   never returns; exits the shell with status 1
error_exit() {
    # Terminate the message with a newline so it does not run into the
    # "All allocs:" heading printed next.
    printf 'Error: %s\n' "${1}"
    echo "All allocs:"
    # The dump is best-effort diagnostics only: under `set -e` a failing nomad
    # call here would end the script with nomad's status before `exit 1` runs.
    nomad alloc status -json || true
    exit 1
}

# Upper bound that elapsed_time is compared against in each polling loop.
MAX_WAIT_TIME=120
# Argument passed to `sleep` between polls, and the increment added to
# elapsed_time after each sleep.
POLL_INTERVAL=2

# Accumulated wait of the current polling loop; set back to 0 before each of
# the later loops.
elapsed_time=0

# Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running

# State populated by checkAllocsCount and read by the polling loops. The
# variables are initialised empty so that reading them before a successful
# query does not trip `set -u`.
running_allocs=
allocs_length=
last_error=

# Query Nomad for all allocations and compare the number that are running
# against the expected count.
# Globals:
#   ALLOC_COUNT    (read)    expected number of running allocations
#   running_allocs (written) JSON array of the allocations whose ClientStatus
#                            is "running"
#   allocs_length  (written) number of elements in running_allocs
#   last_error     (written) reason for the most recent return of 1
# Returns:
#   0 when the running count equals ALLOC_COUNT; 1 when the nomad call fails
#   or the counts differ. Output that cannot be parsed aborts the script
#   through error_exit.
checkAllocsCount() {
    local allocs
    allocs=$(nomad alloc status -json) || {
        last_error="Failed to check alloc status"
        return 1
    }

    # This function is invoked as a conditional, where `set -e` is not in
    # effect, so every jq step has to be checked explicitly. Without the check
    # on the filter step a parse failure would only surface later as an empty
    # count and be reported as "Some allocs are not running".
    running_allocs=$(jq '[.[] | select(.ClientStatus == "running")]' <<<"$allocs") \
        || error_exit "Invalid alloc status -json output"
    allocs_length=$(jq 'length' <<<"$running_allocs") \
        || error_exit "Invalid alloc status -json output"

    # Guard against a non-numeric result (for example empty output), which
    # would make the integer comparison below error out.
    [[ "$allocs_length" =~ ^[0-9]+$ ]] \
        || error_exit "Invalid alloc status -json output"

    if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
        return 0
    fi

    last_error="Some allocs are not running"
    return 1
}

# Poll until the expected number of allocations is running, giving up through
# error_exit once the accumulated wait reaches MAX_WAIT_TIME.
until checkAllocsCount; do
    if ((elapsed_time >= MAX_WAIT_TIME)); then
        error_exit "$last_error within $elapsed_time seconds."
    fi

    echo "Running allocs: $allocs_length, expected ${ALLOC_COUNT}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "All $ALLOC_COUNT ALLOCS are running."

# With zero running allocations there is nothing to stop and reschedule, so
# the script ends successfully here.
if [ "$allocs_length" -eq 0 ]; then
    exit 0
fi

# Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled

# Candidates for the stop/reschedule check are the running allocations whose
# job type is not "system".
service_batch_allocs=$(echo "$running_allocs" | jq '[.[] | select(.JobType != "system")]')
service_batch_allocs_length=$(echo "$service_batch_allocs" | jq 'length')

# With no candidates the modulo below would divide by zero. As with the
# zero-allocation case, there is nothing to reschedule, so finish successfully.
if [ "$service_batch_allocs_length" -eq 0 ]; then
    echo "No non-system allocs are running, skipping the reschedule check."
    exit 0
fi

# Pick one candidate at random and ask Nomad to stop it.
random_index=$((RANDOM % service_batch_allocs_length))
random_alloc_id=$(echo "$service_batch_allocs" | jq -r ".[${random_index}].ID")

nomad alloc stop "$random_alloc_id" \
    || error_exit "Failed to stop allocation $random_alloc_id"

echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0

# Poll the stopped allocation until its ClientStatus is "complete", giving up
# through error_exit once the accumulated wait reaches MAX_WAIT_TIME.
while true; do
    # With errexit and pipefail active, a single failed lookup would end the
    # script here without any diagnostic. Treat it as "not complete yet" and
    # keep polling, so a persistent failure is reported by the timeout below.
    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus') \
        || alloc_status="unknown"
    if [ "$alloc_status" == "complete" ]; then
        break
    fi

    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
    fi

    echo "Current status: $alloc_status, not 'complete'. Have been waiting for ${elapsed_time} seconds. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "Waiting for all the allocations to be running again"
elapsed_time=0

# Poll until the running count is back at ALLOC_COUNT. On timeout the current
# allocation listing is written to allocs.json before aborting.
while :; do
    # Clear the previous snapshot so the messages below never show stale data
    # when the query itself fails.
    running_allocs=
    allocs_length=

    if checkAllocsCount; then
        break
    fi

    if ((elapsed_time >= MAX_WAIT_TIME)); then
        nomad alloc status -json > allocs.json
        error_exit "Expected $ALLOC_COUNT running allocations, found $allocs_length after $elapsed_time seconds"
    fi

    echo "Expected $ALLOC_COUNT running allocations, found $allocs_length Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "Alloc successfully rescheduled"