upgrade tests: add transparent proxy workload (#25176)

Add an upgrade test workload for Consul service mesh with transparent
proxy. Note that this breaks from the "countdash" demo: the dashboard
application can only verify that the backend is up by making a websocket
connection, which we can't do as a health check, and the health check it
exposes for that purpose only passes once a websocket connection has been
made. So replace the dashboard with a minimal nginx reverse proxy in front
of the count-api instead.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Tim Gross
2025-03-07 15:25:26 -05:00
committed by GitHub
parent c3e2d4a652
commit 5cc1b4e606
9 changed files with 218 additions and 2 deletions
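
Because the dashboard is now just nginx proxying to the count-api over the mesh, the HTTP health check exercises transparent-proxy connectivity end to end: the check hits nginx on port 9002, and nginx forwards the request to count-api.virtual.consul through the sidecar. A rough sketch of that path, which you may or may not be able to reproduce by hand depending on which inbound ports the transparent proxy leaves open (the address and curl invocation here are illustrative, not part of the test harness):

#!/usr/bin/env bash
# Illustrative only: approximate the request path the count-dashboard check takes.
# 9002 and /health come from the job spec below; nginx proxies the request to
# count-api.virtual.consul, so a failure here indicates broken mesh connectivity.
set -euo pipefail

DASHBOARD_ADDR="${1:-localhost}" # hypothetical: address where the dashboard alloc is reachable

curl -sf "http://${DASHBOARD_ADDR}:9002/health"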


@@ -38,6 +38,15 @@ output "nomad_token" {
  sensitive = true
}

output "consul_token" {
  value     = module.provision-infra.consul_token
  sensitive = true
}

output "consul_addr" {
  value = module.provision-infra.consul_addr
}

output "cluster_unique_identifier" {
  value = module.provision-infra.cluster_unique_identifier
}
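
These outputs surface the Consul address and bootstrap token next to the existing Nomad outputs so the upgrade scenario can pass them to the workload runner. If the module is applied directly with Terraform rather than through the Enos scenario, they can be read back with the standard output command (shown only as an illustration):

# Read the new outputs from a direct `terraform apply` of this module.
terraform output consul_addr
terraform output -raw consul_token   # -raw prints the sensitive value unquoted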


@@ -23,7 +23,7 @@ locals {
module "keys" {
depends_on = [random_pet.e2e]
name = local.random_name
path = "${local.keys_dir}"
path = local.keys_dir
source = "mitchellh/dynamic-keys/aws"
version = "v2.0.0"
}


@@ -66,7 +66,7 @@ EOM
}

output "cluster_unique_identifier" {
  value = "${local.random_name}"
  value = local.random_name
}

output "nomad_addr" {
@@ -93,3 +93,12 @@ output "nomad_token" {
  value     = chomp(data.local_sensitive_file.nomad_token.content)
  sensitive = true
}

output "consul_addr" {
  value = "https://${aws_instance.consul_server.public_ip}:8501"
}

output "consul_token" {
  value     = chomp(local_sensitive_file.consul_initial_management_token.content)
  sensitive = true
}


@@ -133,6 +133,8 @@ scenario "upgrade" {
      key_file          = step.provision_cluster.key_file
      nomad_token       = step.provision_cluster.nomad_token
      availability_zone = var.availability_zone
      consul_addr       = step.provision_cluster.consul_addr
      consul_token      = step.provision_cluster.consul_token

      workloads = {
        service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
@@ -167,6 +169,13 @@ scenario "upgrade" {
pre_script = "scripts/wait_for_nfs_volume.sh"
}
tproxy = {
job_spec = "jobs/tproxy.nomad.hcl"
alloc_count = 2
type = "service"
pre_script = "scripts/create-consul-intention.sh"
}
}
}


@@ -0,0 +1,155 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# this variable is not used but required by runner
variable "alloc_count" {
  type    = number
  default = 2
}

job "countdash" {

  constraint {
    attribute = "${attr.kernel.name}"
    value     = "linux"
  }

  group "api" {

    network {
      mode = "bridge"
    }

    service {
      name = "count-api"
      port = "9001"

      check {
        type     = "http"
        path     = "/health"
        expose   = true
        interval = "3s"
        timeout  = "1s"

        check_restart {
          limit = 0 # don't restart on failure
        }
      }

      connect {
        sidecar_service {
          proxy {
            transparent_proxy {}
          }
        }
      }
    }

    task "web" {
      driver = "docker"

      config {
        image          = "hashicorpdev/counter-api:v3"
        auth_soft_fail = true
      }
    }
  }

  group "dashboard" {

    network {
      mode = "bridge"
      port "http" {
        # TODO: for some reason without a static port the health check never
        # succeeds, even though we have expose=true on the check
        static = 9002
        to     = 9002
      }
    }

    service {
      name = "count-dashboard"
      port = "9002"

      # this check will fail if connectivity between the dashboard and the API
      # fails, and restart the task. we poll frequently but also allow it to
      # fail temporarily so we can account for allocations being rescheduled
      # during tests
      check {
        type     = "http"
        path     = "/health"
        expose   = true
        task     = "dashboard"
        interval = "3s"
        timeout  = "1s"

        # note it seems to take an extremely long time for this API to return ok
        check_restart {
          limit = 30
        }
      }

      connect {
        sidecar_service {
          proxy {
            transparent_proxy {}
          }
        }
      }
    }

    # note: this is not the usual countdash frontend because that only sets the
    # health check that tests the backend as healthy once a browser connection
    # has been made. So serve a reverse proxy to the count API instead.
    task "dashboard" {
      driver = "docker"

      env {
        COUNTING_SERVICE_URL = "http://count-api.virtual.consul"
      }

      config {
        image          = "nginx:latest"
        command        = "nginx"
        args           = ["-c", "/local/default.conf"]
        auth_soft_fail = true
      }

      template {
        destination = "local/default.conf"
        data        = <<EOT
daemon off;
worker_processes 1;
user www-data;

error_log /var/log/error.log info;

events {
  use epoll;
  worker_connections 128;
}

http {
  include /etc/nginx/mime.types;
  charset utf-8;
  access_log /var/log/access.log combined;

  server {
    listen 9002;
    location / {
      proxy_pass http://count-api.virtual.consul;
    }
  }
}
EOT
      }

      # restart only once because we're using the service for this task to
      # detect tproxy connectivity failures in this test
      restart {
        delay    = "5s"
        attempts = 1
        mode     = "fail"
      }
    }
  }
}
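
With transparent proxy there is no explicit upstreams block: the dashboard addresses the API by its virtual Consul name, count-api.virtual.consul, which Consul DNS resolves to a virtual IP, and the iptables rules Nomad installs redirect that traffic through the Envoy sidecar. A hedged way to poke at this by hand, assuming ALLOC_ID names a running dashboard allocation and the image provides a shell (the stock nginx image is Debian-based, so getent should be available even though curl may not be):

# Illustrative only: inspect tproxy behavior from inside the dashboard alloc.
# Resolves the virtual service name via Consul DNS inside the network namespace.
nomad alloc exec -task dashboard "${ALLOC_ID}" getent hosts count-api.virtual.consul

# The returned address is a Consul virtual IP; connections to it are
# transparently redirected to the Envoy sidecar by the iptables rules.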


@@ -16,6 +16,9 @@ locals {
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token
    CONSUL_HTTP_TOKEN = var.consul_token
    CONSUL_CACERT     = var.ca_file
    CONSUL_HTTP_ADDR  = var.consul_addr
  }

  system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
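
The three CONSUL_* entries mirror the existing NOMAD_* ones. Assuming the runner exports this map into the environment of each workload's pre_script, they are exactly the variables the consul CLI reads, which is why scripts/create-consul-intention.sh below can call consul config write with no flags. Roughly:

# Sketch of the environment a pre_script is expected to see (values are hypothetical).
export CONSUL_HTTP_ADDR="https://10.0.0.10:8501"  # from consul_addr
export CONSUL_HTTP_TOKEN="<management token>"     # from consul_token
export CONSUL_CACERT="/etc/certs/ca.pem"          # from ca_file

# The consul CLI picks these up automatically, so no -http-addr/-token flags are needed.
consul config write intention.hcl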


@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
dir=$(dirname "${BASH_SOURCE[0]}")
consul config write "${dir}/intention.hcl"


@@ -0,0 +1,11 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
Kind = "service-intentions"
Name = "count-api"
Sources = [
  {
    Name   = "count-dashboard"
    Action = "allow"
  }
]
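
After the pre_script writes this config entry, the intention can be checked from the same environment; both commands below are standard Consul CLI and shown for illustration only:

# Read back the config entry written by create-consul-intention.sh.
consul config read -kind service-intentions -name count-api

# Confirm traffic from the dashboard to the API would be allowed.
consul intention check count-dashboard count-api   # prints "Allowed"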


@@ -28,6 +28,18 @@ variable "nomad_token" {
  sensitive = true
}

variable "consul_addr" {
  description = "The Consul API HTTP address."
  type        = string
  default     = "http://localhost:8500"
}

variable "consul_token" {
  description = "The Secret ID of an ACL token to make requests to Consul with"
  type        = string
  sensitive   = true
}

variable "availability_zone" {
  description = "The AZ where the cluster is being run"
  type        = string