terraform: updates AWS example packer and terraform code (#19512)

The "Provision a Nomad cluster in the cloud" example works in AWS with these updates:

- uses an available Ubuntu version
- uses HashiCorp packages where possible
- updates the NVIDIA installation
- installs CNI plugins
This commit is contained in:
Nick Wales
2024-04-16 09:47:31 -05:00
committed by GitHub
parent 9d4f7bcb68
commit e014e8411c
14 changed files with 147 additions and 303 deletions

View File

@@ -145,6 +145,29 @@ resource "aws_security_group" "primary" {
cidr_blocks = [var.whitelist_ip]
}
# Consul Ingress
ingress {
from_port = 8080
to_port = 8080
protocol = "tcp"
cidr_blocks = [var.whitelist_ip]
}
# Prometheus
ingress {
from_port = 8081
to_port = 8081
protocol = "tcp"
cidr_blocks = [var.whitelist_ip]
}
# Grafana
ingress {
from_port = 3000
to_port = 3000
protocol = "tcp"
cidr_blocks = [var.whitelist_ip]
}
ingress {
from_port = 0
to_port = 0

View File

@@ -6,7 +6,7 @@
"filters": {
"virtualization-type": "hvm",
"architecture": "x86_64",
"name": "ubuntu/images/hvm-ssd/ubuntu-xenial-16.04-amd64-server-*",
"name": "ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*",
"block-device-mapping.volume-type": "gp2",
"root-device-type": "ebs"
},

View File

@@ -0,0 +1,17 @@
advertise_addr = "IP_ADDRESS"
bind_addr = "0.0.0.0"
client_addr = "0.0.0.0"
bootstrap_expect = SERVER_COUNT
data_dir = "/opt/consul/data"
log_level = "INFO"
retry_join = ["RETRY_JOIN"]
server = true
ports = {
grpc = 8502
}
ui_config {
enabled = true
}
connect {
enabled = true
}

View File

@@ -1,20 +0,0 @@
{
"log_level": "INFO",
"server": true,
"ui": true,
"data_dir": "/opt/consul/data",
"bind_addr": "0.0.0.0",
"client_addr": "0.0.0.0",
"advertise_addr": "IP_ADDRESS",
"bootstrap_expect": SERVER_COUNT,
"service": {
"name": "consul"
},
"retry_join": ["RETRY_JOIN"],
"ports": {
"grpc": 8502
},
"connect": {
"enabled": true
}
}

View File

@@ -1,16 +0,0 @@
[Unit]
Description=Consul Agent
Requires=network-online.target
After=network-online.target
[Service]
Restart=on-failure
Environment=CONSUL_ALLOW_PRIVILEGED_PORTS=true
ExecStart=/usr/local/bin/consul agent -config-dir="/etc/consul.d" -dns-port="53" -recursor="172.31.0.2"
ExecReload=/bin/kill -HUP $MAINPID
KillSignal=SIGTERM
User=root
Group=root
[Install]
WantedBy=multi-user.target

View File

@@ -1,16 +0,0 @@
[Unit]
Description=Consul Agent
Requires=network-online.target
After=network-online.target
[Service]
Restart=on-failure
Environment=CONSUL_ALLOW_PRIVILEGED_PORTS=true
ExecStart=/usr/local/bin/consul agent -config-dir="/etc/consul.d" -dns-port="53" -recursor="168.63.129.16"
ExecReload=/bin/kill -HUP $MAINPID
KillSignal=SIGTERM
User=root
Group=root
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,12 @@
ui_config {
enabled = true
}
log_level = "INFO"
data_dir = "/opt/consul/data"
bind_addr = "0.0.0.0"
client_addr = "0.0.0.0"
advertise_addr = "IP_ADDRESS"
retry_join = ["RETRY_JOIN"]
ports = {
grpc = 8502
}

View File

@@ -1,12 +0,0 @@
{
"ui": true,
"log_level": "INFO",
"data_dir": "/opt/consul/data",
"bind_addr": "0.0.0.0",
"client_addr": "0.0.0.0",
"advertise_addr": "IP_ADDRESS",
"retry_join": ["RETRY_JOIN"],
"ports": {
"grpc": 8502
}
}

View File

@@ -1,16 +0,0 @@
[Unit]
Description=Consul Agent
Requires=network-online.target
After=network-online.target
[Service]
Restart=on-failure
Environment=CONSUL_ALLOW_PRIVILEGED_PORTS=true
ExecStart=/usr/local/bin/consul agent -config-dir="/etc/consul.d" -dns-port="53" -recursor="169.254.169.254"
ExecReload=/bin/kill -HUP $MAINPID
KillSignal=SIGTERM
User=root
Group=root
[Install]
WantedBy=multi-user.target

View File

@@ -1,26 +0,0 @@
[Unit]
Description=Nomad
Documentation=https://developer.hashicorp.com/nomad/docs/
Wants=network-online.target
After=network-online.target
StartLimitIntervalSec=10
StartLimitBurst=3
# If you are running Consul, please uncomment following Wants/After configs.
# Assuming your Consul service unit name is "consul"
#Wants=consul.service
#After=consul.service
[Service]
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/bin/nomad agent -config /etc/nomad.d
KillMode=process
KillSignal=SIGINT
LimitNOFILE=infinity
LimitNPROC=infinity
Restart=on-failure
RestartSec=2
TasksMax=infinity
[Install]
WantedBy=multi-user.target

View File

@@ -1,16 +0,0 @@
[Unit]
Description=Vault Agent
Requires=network-online.target
After=network-online.target
[Service]
Restart=on-failure
Environment=GOMAXPROCS=nproc
ExecStart=/usr/local/bin/vault server -config="/etc/vault.d/vault.hcl"
ExecReload=/bin/kill -HUP $MAINPID
KillSignal=SIGTERM
User=root
Group=root
[Install]
WantedBy=multi-user.target

View File

@@ -2,11 +2,9 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -e
CONFIGDIR=/ops/shared/config
CONSULCONFIGDIR=/etc/consul.d
NOMADCONFIGDIR=/etc/nomad.d
CONSULTEMPLATECONFIGDIR=/etc/consul-template.d
@@ -26,38 +24,49 @@ if [ "$CLOUD" = "gce" ]; then
else
IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
fi
# IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
# Systemd-resolved config to enable .consul domain lookups using the local Consul agent
# https://developer.hashicorp.com/consul/tutorials/networking/dns-forwarding#systemd-resolved-setup
mkdir -p /etc/systemd/resolved.conf.d/
cat <<EOT > /etc/systemd/resolved.conf.d/consul.conf
[Resolve]
DNS=127.0.0.1:8600
DNSSEC=false
Domains=~consul
EOT
systemctl restart systemd-resolved.service
# Consul
sed -i "s/IP_ADDRESS/$IP_ADDRESS/g" $CONFIGDIR/consul_client.json
sed -i "s/RETRY_JOIN/$RETRY_JOIN/g" $CONFIGDIR/consul_client.json
sudo cp $CONFIGDIR/consul_client.json $CONSULCONFIGDIR/consul.json
sudo cp $CONFIGDIR/consul_$CLOUD.service /etc/systemd/system/consul.service
sed -i "s/IP_ADDRESS/$IP_ADDRESS/g" $CONFIGDIR/consul_client.hcl
sed -i "s/RETRY_JOIN/$RETRY_JOIN/g" $CONFIGDIR/consul_client.hcl
sudo cp $CONFIGDIR/consul_client.hcl $CONSULCONFIGDIR/consul.hcl
sudo systemctl enable consul.service
sudo systemctl start consul.service
sudo systemctl enable consul.service --now
sleep 10
# Nomad
## Install CNI binaries
curl -L -o cni-plugins.tgz "https://github.com/containernetworking/plugins/releases/download/v1.0.0/cni-plugins-linux-$( [ $(uname -m) = aarch64 ] && echo arm64 || echo amd64)"-v1.0.0.tgz && \
sudo mkdir -p /opt/cni/bin && \
sudo tar -C /opt/cni/bin -xzf cni-plugins.tgz
## Replace existing Nomad binary if remote file exists
if [[ `wget -S --spider $NOMAD_BINARY 2>&1 | grep 'HTTP/1.1 200 OK'` ]]; then
curl -L $NOMAD_BINARY > nomad.zip
sudo unzip -o nomad.zip -d /usr/local/bin
sudo chmod 0755 /usr/local/bin/nomad
sudo chown root:root /usr/local/bin/nomad
sudo unzip -o nomad.zip -d /usr/bin
sudo chmod 0755 /usr/bin/nomad
sudo chown root:root /usr/bin/nomad
fi
sudo cp $CONFIGDIR/nomad_client.hcl $NOMADCONFIGDIR/nomad.hcl
sudo cp $CONFIGDIR/nomad.service /etc/systemd/system/nomad.service
sudo systemctl enable nomad.service
sudo systemctl start nomad.service
sudo systemctl enable nomad.service --now
sleep 10
export NOMAD_ADDR=http://$IP_ADDRESS:4646
# Consul Template
sudo cp $CONFIGDIR/consul-template.hcl $CONSULTEMPLATECONFIGDIR/consul-template.hcl
sudo cp $CONFIGDIR/consul-template.service /etc/systemd/system/consul-template.service

View File

@@ -31,14 +31,12 @@ fi
# IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
# Consul
sed -i "s/IP_ADDRESS/$IP_ADDRESS/g" $CONFIGDIR/consul.json
sed -i "s/SERVER_COUNT/$SERVER_COUNT/g" $CONFIGDIR/consul.json
sed -i "s/RETRY_JOIN/$RETRY_JOIN/g" $CONFIGDIR/consul.json
sudo cp $CONFIGDIR/consul.json $CONSULCONFIGDIR
sudo cp $CONFIGDIR/consul_$CLOUD.service /etc/systemd/system/consul.service
sed -i "s/IP_ADDRESS/$IP_ADDRESS/g" $CONFIGDIR/consul.hcl
sed -i "s/SERVER_COUNT/$SERVER_COUNT/g" $CONFIGDIR/consul.hcl
sed -i "s/RETRY_JOIN/$RETRY_JOIN/g" $CONFIGDIR/consul.hcl
sudo cp $CONFIGDIR/consul.hcl $CONSULCONFIGDIR
sudo systemctl enable consul.service
sudo systemctl start consul.service
sudo systemctl enable consul.service --now
sleep 10
export CONSUL_HTTP_ADDR=$IP_ADDRESS:8500
export CONSUL_RPC_ADDR=$IP_ADDRESS:8400
@@ -46,27 +44,22 @@ export CONSUL_RPC_ADDR=$IP_ADDRESS:8400
# Vault
sed -i "s/IP_ADDRESS/$IP_ADDRESS/g" $CONFIGDIR/vault.hcl
sudo cp $CONFIGDIR/vault.hcl $VAULTCONFIGDIR
sudo cp $CONFIGDIR/vault.service /etc/systemd/system/vault.service
sudo systemctl enable vault.service
sudo systemctl start vault.service
sudo systemctl enable vault.service --now
# Nomad
## Replace existing Nomad binary if remote file exists
# ## Replace existing Nomad binary if remote file exists
if [[ `wget -S --spider $NOMAD_BINARY 2>&1 | grep 'HTTP/1.1 200 OK'` ]]; then
curl -L $NOMAD_BINARY > nomad.zip
sudo unzip -o nomad.zip -d /usr/local/bin
sudo chmod 0755 /usr/local/bin/nomad
sudo chown root:root /usr/local/bin/nomad
sudo unzip -o nomad.zip -d /usr/bin/
sudo chmod 0755 /usr/bin/nomad
sudo chown root:root /usr/bin/nomad
fi
sed -i "s/SERVER_COUNT/$SERVER_COUNT/g" $CONFIGDIR/nomad.hcl
sudo cp $CONFIGDIR/nomad.hcl $NOMADCONFIGDIR
sudo cp $CONFIGDIR/nomad.service /etc/systemd/system/nomad.service
sudo systemctl enable nomad.service
sudo systemctl start nomad.service
sudo systemctl enable nomad.service --now
sleep 10
export NOMAD_ADDR=http://$IP_ADDRESS:4646

View File

@@ -3,173 +3,85 @@
# SPDX-License-Identifier: BUSL-1.1
set -e
#set -e
# Disable interactive apt prompts
# Disable interactive apt-get prompts
export DEBIAN_FRONTEND=noninteractive
cd /ops
CONFIGDIR=/ops/shared/config
sudo apt-get install -yq apt-utils
CONSULVERSION=1.12.2
CONSULDOWNLOAD=https://releases.hashicorp.com/consul/${CONSULVERSION}/consul_${CONSULVERSION}_linux_amd64.zip
CONSULCONFIGDIR=/etc/consul.d
CONSULDIR=/opt/consul
# Install HashiCorp products
CONSULVERSION=1.18.1
VAULTVERSION=1.15.7
NOMADVERSION=1.7.6
CONSULTEMPLATEVERSION=0.35.0
VAULTVERSION=1.11.0
VAULTDOWNLOAD=https://releases.hashicorp.com/vault/${VAULTVERSION}/vault_${VAULTVERSION}_linux_amd64.zip
VAULTCONFIGDIR=/etc/vault.d
VAULTDIR=/opt/vault
NOMADVERSION=1.3.1
NOMADDOWNLOAD=https://releases.hashicorp.com/nomad/${NOMADVERSION}/nomad_${NOMADVERSION}_linux_amd64.zip
NOMADCONFIGDIR=/etc/nomad.d
NOMADDIR=/opt/nomad
CONSULTEMPLATEVERSION=0.29.1
CONSULTEMPLATEDOWNLOAD=https://releases.hashicorp.com/consul-template/${CONSULTEMPLATEVERSION}/consul-template_${CONSULTEMPLATEVERSION}_linux_amd64.zip
CONSULTEMPLATECONFIGDIR=/etc/consul-template.d
CONSULTEMPLATEDIR=/opt/consul-template
# -yq: this script runs unattended (DEBIAN_FRONTEND=noninteractive is set above),
# so apt-get must not stop at its confirmation prompt if gpg is not yet installed.
sudo apt-get update && sudo apt-get install -yq gpg
wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt-get update
sudo apt-get install -yq consul="${CONSULVERSION}*" \
vault="${VAULTVERSION}*" \
nomad="${NOMADVERSION}*" \
consul-template="${CONSULTEMPLATEVERSION}*"
# Dependencies
sudo apt-get install -y software-properties-common
sudo apt-get install -yq software-properties-common
sudo apt-get update
sudo apt-get install -y unzip tree redis-tools jq curl tmux gnupg-curl
sudo apt-get install -yq unzip tree redis jq curl tmux openjdk-8-jdk
# Disable the firewall
sudo ufw disable || echo "ufw not installed"
# Consul
curl -L $CONSULDOWNLOAD > consul.zip
## Install
sudo unzip consul.zip -d /usr/local/bin
sudo chmod 0755 /usr/local/bin/consul
sudo chown root:root /usr/local/bin/consul
## Configure
sudo mkdir -p $CONSULCONFIGDIR
sudo chmod 755 $CONSULCONFIGDIR
sudo mkdir -p $CONSULDIR
sudo chmod 755 $CONSULDIR
# Vault
curl -L $VAULTDOWNLOAD > vault.zip
## Install
sudo unzip vault.zip -d /usr/local/bin
sudo chmod 0755 /usr/local/bin/vault
sudo chown root:root /usr/local/bin/vault
## Configure
sudo mkdir -p $VAULTCONFIGDIR
sudo chmod 755 $VAULTCONFIGDIR
sudo mkdir -p $VAULTDIR
sudo chmod 755 $VAULTDIR
# Nomad
curl -L $NOMADDOWNLOAD > nomad.zip
## Install
sudo unzip nomad.zip -d /usr/local/bin
sudo chmod 0755 /usr/local/bin/nomad
sudo chown root:root /usr/local/bin/nomad
## Configure
sudo mkdir -p $NOMADCONFIGDIR
sudo chmod 755 $NOMADCONFIGDIR
sudo mkdir -p $NOMADDIR
sudo chmod 755 $NOMADDIR
# Consul Template
curl -L $CONSULTEMPLATEDOWNLOAD > consul-template.zip
## Install
sudo unzip consul-template.zip -d /usr/local/bin
sudo chmod 0755 /usr/local/bin/consul-template
sudo chown root:root /usr/local/bin/consul-template
## Configure
sudo mkdir -p $CONSULTEMPLATECONFIGDIR
sudo chmod 755 $CONSULTEMPLATECONFIGDIR
sudo mkdir -p $CONSULTEMPLATEDIR
sudo chmod 755 $CONSULTEMPLATEDIR
# Docker
distro=$(lsb_release -si | tr '[:upper:]' '[:lower:]')
sudo apt-get install -y apt-transport-https ca-certificates gnupg2
curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/${distro} $(lsb_release -cs) stable"
sudo apt-get install -yq apt-transport-https ca-certificates gnupg2
# Add Docker's official GPG key:
sudo apt-get update
sudo apt-get install -y docker-ce
# -yq keeps this step unattended like the other installs in this script; without
# it apt-get stops at its confirmation prompt whenever one of these is missing.
sudo apt-get install -yq ca-certificates curl gnupg
sudo install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
sudo chmod a+r /etc/apt/keyrings/docker.gpg
# Needs testing, updating and fixing
# Add the repository to apt-get sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -yq docker-ce docker-ce-cli containerd.io docker-buildx-plugin
# # Needs testing, updating and fixing
if [[ ! -z ${INSTALL_NVIDIA_DOCKER+x} ]]; then
# Install official NVIDIA driver package
# This is why we added gnupg-curl, otherwise, the following fails with "gpgkeys: protocol `https' not supported"
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub
sudo sh -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
sudo apt-get update && sudo apt-get install -y --no-install-recommends --allow-unauthenticated linux-headers-generic dkms cuda-drivers
## https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#network-repo-installation-for-ubuntu
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
# Install nvidia-docker and nvidia-docker-plugin
# from: https://github.com/NVIDIA/nvidia-docker#ubuntu-140416041804-debian-jessiestretch
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" | sudo tee /etc/apt/sources.list.d/cuda-ubuntu2204-x86_64.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-get update
sudo apt-get install -y --allow-unauthenticated nvidia-docker2
# -yq is required: both packages pull in new dependencies, so without it apt-get
# waits on a confirmation prompt that an unattended image build cannot answer.
sudo apt-get install -yq cuda-toolkit
sudo apt-get install -yq nvidia-gds
# Install nvidia container support
## https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
fi
# rkt
# Note: the rkt project has ended and been archived. This should likely be removed.
# See https://github.com/rkt/rkt/issues/4024
VERSION=1.30.0
DOWNLOAD=https://github.com/rkt/rkt/releases/download/v${VERSION}/rkt-v${VERSION}.tar.gz
function install_rkt() {
wget -q -O /tmp/rkt.tar.gz "${DOWNLOAD}"
tar -C /tmp -xvf /tmp/rkt.tar.gz
sudo mv /tmp/rkt-v${VERSION}/rkt /usr/local/bin
sudo mv /tmp/rkt-v${VERSION}/*.aci /usr/local/bin
}
function configure_rkt_networking() {
sudo mkdir -p /etc/rkt/net.d
sudo bash -c 'cat << EOT > /etc/rkt/net.d/99-network.conf
{
"name": "default",
"type": "ptp",
"ipMasq": false,
"ipam": {
"type": "host-local",
"subnet": "172.16.28.0/24",
"routes": [
{
"dst": "0.0.0.0/0"
}
]
}
}
EOT'
}
install_rkt
configure_rkt_networking
# Java
sudo add-apt-repository -y ppa:openjdk-r/ppa
sudo apt-get update
sudo apt-get install -y openjdk-8-jdk
JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")