upgrade tests: add transparent proxy workload (#25176)

Add an upgrade test workload for Consul service mesh with transparent
proxy. Note that this breaks from the "countdash" demo: the dashboard
application can only verify that the backend is up by making a websocket
connection, which we can't do as a health check, and the health check it
exposes for that purpose only passes once a websocket connection has been
made. So replace the dashboard with a minimal nginx reverse proxy in front
of the count-api instead.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Tim Gross
2025-03-07 15:25:26 -05:00
committed by GitHub
parent c3e2d4a652
commit 5cc1b4e606
9 changed files with 218 additions and 2 deletions
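
Because the dashboard is now just nginx proxying to the count-api over the mesh, the HTTP health check exercises transparent-proxy connectivity end to end: the check hits nginx on port 9002, and nginx forwards the request to count-api.virtual.consul through the sidecar. A rough sketch of that path, which you may or may not be able to reproduce by hand depending on which inbound ports the transparent proxy leaves open (the address and curl invocation here are illustrative, not part of the test harness):

#!/usr/bin/env bash
# Illustrative only: approximate the request path the count-dashboard check takes.
# 9002 and /health come from the job spec below; nginx proxies the request to
# count-api.virtual.consul, so a failure here indicates broken mesh connectivity.
set -euo pipefail

DASHBOARD_ADDR="${1:-localhost}" # hypothetical: address where the dashboard alloc is reachable

curl -sf "http://${DASHBOARD_ADDR}:9002/health"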


@@ -38,6 +38,15 @@ output "nomad_token" {
  sensitive = true
}

output "consul_token" {
  value     = module.provision-infra.consul_token
  sensitive = true
}

output "consul_addr" {
  value = module.provision-infra.consul_addr
}

output "cluster_unique_identifier" {
  value = module.provision-infra.cluster_unique_identifier
}
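
These outputs surface the Consul address and bootstrap token next to the existing Nomad outputs so the upgrade scenario can pass them to the workload runner. If the module is applied directly with Terraform rather than through the Enos scenario, they can be read back with the standard output command (shown only as an illustration):

# Read the new outputs from a direct `terraform apply` of this module.
terraform output consul_addr
terraform output -raw consul_token   # -raw prints the sensitive value unquoted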


@@ -23,7 +23,7 @@ locals {
module "keys" {
depends_on = [random_pet.e2e]
name = local.random_name
path = "${local.keys_dir}"
path = local.keys_dir
source = "mitchellh/dynamic-keys/aws"
version = "v2.0.0"
}


@@ -66,7 +66,7 @@ EOM
}

output "cluster_unique_identifier" {
  value = "${local.random_name}"
  value = local.random_name
}

output "nomad_addr" {
@@ -93,3 +93,12 @@ output "nomad_token" {
  value     = chomp(data.local_sensitive_file.nomad_token.content)
  sensitive = true
}

output "consul_addr" {
  value = "https://${aws_instance.consul_server.public_ip}:8501"
}

output "consul_token" {
  value     = chomp(local_sensitive_file.consul_initial_management_token.content)
  sensitive = true
}


@@ -133,6 +133,8 @@ scenario "upgrade" {
      key_file          = step.provision_cluster.key_file
      nomad_token       = step.provision_cluster.nomad_token
      availability_zone = var.availability_zone
      consul_addr       = step.provision_cluster.consul_addr
      consul_token      = step.provision_cluster.consul_token

      workloads = {
        service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
@@ -167,6 +169,13 @@ scenario "upgrade" {
pre_script = "scripts/wait_for_nfs_volume.sh"
}
tproxy = {
job_spec = "jobs/tproxy.nomad.hcl"
alloc_count = 2
type = "service"
pre_script = "scripts/create-consul-intention.sh"
}
}
}


@@ -0,0 +1,155 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# this variable is not used but required by runner
variable "alloc_count" {
  type    = number
  default = 2
}

job "countdash" {

  constraint {
    attribute = "${attr.kernel.name}"
    value     = "linux"
  }

  group "api" {

    network {
      mode = "bridge"
    }

    service {
      name = "count-api"
      port = "9001"

      check {
        type     = "http"
        path     = "/health"
        expose   = true
        interval = "3s"
        timeout  = "1s"

        check_restart {
          limit = 0 # don't restart on failure
        }
      }

      connect {
        sidecar_service {
          proxy {
            transparent_proxy {}
          }
        }
      }
    }

    task "web" {
      driver = "docker"

      config {
        image          = "hashicorpdev/counter-api:v3"
        auth_soft_fail = true
      }
    }
  }

  group "dashboard" {

    network {
      mode = "bridge"
      port "http" {
        # TODO: for some reason without a static port the health check never
        # succeeds, even though we have expose=true on the check
        static = 9002
        to     = 9002
      }
    }

    service {
      name = "count-dashboard"
      port = "9002"

      # this check will fail if connectivity between the dashboard and the API
      # fails, and restart the task. we poll frequently but also allow it to
      # fail temporarily so we can account for allocations being rescheduled
      # during tests
      check {
        type     = "http"
        path     = "/health"
        expose   = true
        task     = "dashboard"
        interval = "3s"
        timeout  = "1s"

        # note it seems to take an extremely long time for this API to return ok
        check_restart {
          limit = 30
        }
      }

      connect {
        sidecar_service {
          proxy {
            transparent_proxy {}
          }
        }
      }
    }

    # note: this is not the usual countdash frontend because that only sets the
    # health check that tests the backend as healthy once a browser connection
    # has been made. So serve a reverse proxy to the count API instead.
    task "dashboard" {
      driver = "docker"

      env {
        COUNTING_SERVICE_URL = "http://count-api.virtual.consul"
      }

      config {
        image          = "nginx:latest"
        command        = "nginx"
        args           = ["-c", "/local/default.conf"]
        auth_soft_fail = true
      }

      template {
        destination = "local/default.conf"
        data        = <<EOT
daemon off;
worker_processes 1;
user www-data;

error_log /var/log/error.log info;

events {
  use epoll;
  worker_connections 128;
}

http {
  include /etc/nginx/mime.types;
  charset utf-8;
  access_log /var/log/access.log combined;

  server {
    listen 9002;
    location / {
      proxy_pass http://count-api.virtual.consul;
    }
  }
}
EOT
      }

      # restart only once because we're using the service for this task to
      # detect tproxy connectivity failures in this test
      restart {
        delay    = "5s"
        attempts = 1
        mode     = "fail"
      }
    }
  }
}
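
With transparent proxy there is no explicit upstreams block: the dashboard addresses the API by its virtual Consul name, count-api.virtual.consul, which Consul DNS resolves to a virtual IP, and the iptables rules Nomad installs redirect that traffic through the Envoy sidecar. A hedged way to poke at this by hand, assuming ALLOC_ID names a running dashboard allocation and the image provides a shell (the stock nginx image is Debian-based, so getent should be available even though curl may not be):

# Illustrative only: inspect tproxy behavior from inside the dashboard alloc.
# Resolves the virtual service name via Consul DNS inside the network namespace.
nomad alloc exec -task dashboard "${ALLOC_ID}" getent hosts count-api.virtual.consul

# The returned address is a Consul virtual IP; connections to it are
# transparently redirected to the Envoy sidecar by the iptables rules.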


@@ -16,6 +16,9 @@ locals {
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token
    CONSUL_HTTP_TOKEN = var.consul_token
    CONSUL_CACERT     = var.ca_file
    CONSUL_HTTP_ADDR  = var.consul_addr
  }

  system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
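
The three CONSUL_* entries mirror the existing NOMAD_* ones. Assuming the runner exports this map into the environment of each workload's pre_script, they are exactly the variables the consul CLI reads, which is why scripts/create-consul-intention.sh below can call consul config write with no flags. Roughly:

# Sketch of the environment a pre_script is expected to see (values are hypothetical).
export CONSUL_HTTP_ADDR="https://10.0.0.10:8501"  # from consul_addr
export CONSUL_HTTP_TOKEN="<management token>"     # from consul_token
export CONSUL_CACERT="/etc/certs/ca.pem"          # from ca_file

# The consul CLI picks these up automatically, so no -http-addr/-token flags are needed.
consul config write intention.hcl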


@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
dir=$(dirname "${BASH_SOURCE[0]}")
consul config write "${dir}/intention.hcl"


@@ -0,0 +1,11 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
Kind = "service-intentions"
Name = "count-api"
Sources = [
  {
    Name   = "count-dashboard"
    Action = "allow"
  }
]
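
After the pre_script writes this config entry, the intention can be checked from the same environment; both commands below are standard Consul CLI and shown for illustration only:

# Read back the config entry written by create-consul-intention.sh.
consul config read -kind service-intentions -name count-api

# Confirm traffic from the dashboard to the API would be allowed.
consul intention check count-dashboard count-api   # prints "Allowed"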


@@ -28,6 +28,18 @@ variable "nomad_token" {
  sensitive = true
}

variable "consul_addr" {
  description = "The Consul API HTTP address."
  type        = string
  default     = "http://localhost:8500"
}

variable "consul_token" {
  description = "The Secret ID of an ACL token to make requests to Consul with"
  type        = string
  sensitive   = true
}

variable "availability_zone" {
  description = "The AZ where the cluster is being run"
  type        = string