From 1788bfb42eff36f2ee88ae20404ea30f8494cee0 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Mon, 3 Mar 2025 09:28:32 -0500
Subject: [PATCH] remove addresses from node class hash (#24942)

When a node is fingerprinted, we calculate a "computed class" from a hash over
a subset of its fields and attributes. In the scheduler, when a given node
fails feasibility checking (before fit checking), we know that no other node
of that same class will be feasible, and we add the hash to a map so we can
reject them early. This hash cannot include any values that are unique to a
given node; otherwise, no other node will have the same hash and we never save
ourselves the work of feasibility checking those nodes.

In #4390 we introduced the `nomad.advertise.address` attribute, and in #19969
we introduced the `consul.dns.addr` attribute. Both of these are unique per
node and break the hash. Additionally, when checking whether a node escaped
the class, we were not filtering for constraint targets that start with
`unique.`. The test for this behavior, introduced in #708, had an inverted
assertion, which allowed the bug to go unnoticed since the early days of
Nomad.

Ref: https://github.com/hashicorp/nomad/pull/708
Ref: https://github.com/hashicorp/nomad/pull/4390
Ref: https://github.com/hashicorp/nomad/pull/19969
---
 .changelog/24942.txt                      |  7 +++
 client/allocrunner/networking_cni.go      |  4 +-
 client/allocrunner/networking_cni_test.go | 20 ++++----
 client/fingerprint/consul.go              | 50 ++++++++++----------
 client/fingerprint/consul_test.go         | 52 ++++++++++-----------
 client/fingerprint/nomad.go               |  2 +-
 client/fingerprint/nomad_test.go          |  2 +-
 nomad/structs/node_class.go               |  2 +
 nomad/structs/node_class_test.go          |  8 ++--
 scheduler/benchmarks/benchmarks_test.go   | 57 +++++++++++++++--------
 scheduler/testing.go                      | 11 +++++
 11 files changed, 126 insertions(+), 89 deletions(-)
 create mode 100644 .changelog/24942.txt

diff --git a/.changelog/24942.txt b/.changelog/24942.txt
new file mode 100644
index 000000000..c02ad6b12
--- /dev/null
+++ b/.changelog/24942.txt
@@ -0,0 +1,7 @@
+```release-note:bug
+scheduler: Fixed a bug where node class hashes included unique attributes, making scheduling more costly
+```
+
+```release-note:breaking-change
+node: The node attribute `consul.dns.addr` has been changed to `unique.consul.dns.addr`. The node attribute `nomad.advertise.address` has been changed to `unique.advertise.address`.
+```
diff --git a/client/allocrunner/networking_cni.go b/client/allocrunner/networking_cni.go
index 4f39eb218..7a8663501 100644
--- a/client/allocrunner/networking_cni.go
+++ b/client/allocrunner/networking_cni.go
@@ -413,10 +413,10 @@ func (c *cniNetworkConfigurator) setupTransparentProxyArgs(alloc *structs.Alloca
 func (c *cniNetworkConfigurator) dnsFromAttrs(cluster string) (string, int) {
 	var dnsAddrAttr, dnsPortAttr string
 	if cluster == structs.ConsulDefaultCluster || cluster == "" {
-		dnsAddrAttr = "consul.dns.addr"
+		dnsAddrAttr = "unique.consul.dns.addr"
 		dnsPortAttr = "consul.dns.port"
 	} else {
-		dnsAddrAttr = "consul." + cluster + ".dns.addr"
+		dnsAddrAttr = "unique.consul." + cluster + ".dns.addr"
 		dnsPortAttr = "consul." 
+ cluster + ".dns.port" } diff --git a/client/allocrunner/networking_cni_test.go b/client/allocrunner/networking_cni_test.go index f76f2b246..43745d3d7 100644 --- a/client/allocrunner/networking_cni_test.go +++ b/client/allocrunner/networking_cni_test.go @@ -257,8 +257,8 @@ func TestSetup(t *testing.T) { } nodeAddrs := map[string]string{ - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", } nodeMeta := map[string]string{ "connect.transparent_proxy.default_outbound_port": "15001", @@ -554,8 +554,8 @@ func TestCNI_setupTproxyArgs(t *testing.T) { } nodeAttrs := map[string]string{ - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", } alloc := mock.ConnectAlloc() @@ -716,8 +716,8 @@ func TestCNI_setupTproxyArgs(t *testing.T) { { name: "tproxy with consul dns disabled", nodeAttrs: map[string]string{ - "consul.dns.port": "-1", - "consul.dns.addr": "192.168.1.117", + "consul.dns.port": "-1", + "unique.consul.dns.addr": "192.168.1.117", }, tproxySpec: &structs.ConsulTransparentProxy{}, expectIPConfig: &iptables.Config{ @@ -732,10 +732,10 @@ func TestCNI_setupTproxyArgs(t *testing.T) { name: "tproxy for other cluster with default consul dns disabled", cluster: "infra", nodeAttrs: map[string]string{ - "consul.dns.port": "-1", - "consul.dns.addr": "192.168.1.110", - "consul.infra.dns.port": "8600", - "consul.infra.dns.addr": "192.168.1.117", + "consul.dns.port": "-1", + "unique.consul.dns.addr": "192.168.1.110", + "consul.infra.dns.port": "8600", + "unique.consul.infra.dns.addr": "192.168.1.117", }, tproxySpec: &structs.ConsulTransparentProxy{}, expectIPConfig: &iptables.Config{ diff --git a/client/fingerprint/consul.go b/client/fingerprint/consul.go index d76136604..2ff906857 100644 --- a/client/fingerprint/consul.go +++ b/client/fingerprint/consul.go @@ -190,34 +190,34 @@ func (cfs *consulState) initialize(cfg *config.ConsulConfig, logger hclog.Logger if cfg.Name == structs.ConsulDefaultCluster { cfs.readers = map[string]valueReader{ - "consul.server": cfs.server, - "consul.version": cfs.version, - "consul.sku": cfs.sku, - "consul.revision": cfs.revision, - "unique.consul.name": cfs.name, // note: won't have this for non-default clusters - "consul.datacenter": cfs.dc, - "consul.segment": cfs.segment, - "consul.connect": cfs.connect, - "consul.grpc": cfs.grpc(consulConfig.Scheme, logger), - "consul.ft.namespaces": cfs.namespaces, - "consul.partition": cfs.partition, - "consul.dns.port": cfs.dnsPort, - "consul.dns.addr": cfs.dnsAddr(logger), + "consul.server": cfs.server, + "consul.version": cfs.version, + "consul.sku": cfs.sku, + "consul.revision": cfs.revision, + "unique.consul.name": cfs.name, // note: won't have this for non-default clusters + "consul.datacenter": cfs.dc, + "consul.segment": cfs.segment, + "consul.connect": cfs.connect, + "consul.grpc": cfs.grpc(consulConfig.Scheme, logger), + "consul.ft.namespaces": cfs.namespaces, + "consul.partition": cfs.partition, + "consul.dns.port": cfs.dnsPort, + "unique.consul.dns.addr": cfs.dnsAddr(logger), } } else { cfs.readers = map[string]valueReader{ - fmt.Sprintf("consul.%s.server", cfg.Name): cfs.server, - fmt.Sprintf("consul.%s.version", cfg.Name): cfs.version, - fmt.Sprintf("consul.%s.sku", cfg.Name): cfs.sku, - fmt.Sprintf("consul.%s.revision", cfg.Name): cfs.revision, - fmt.Sprintf("consul.%s.datacenter", cfg.Name): cfs.dc, - fmt.Sprintf("consul.%s.segment", cfg.Name): 
cfs.segment, - fmt.Sprintf("consul.%s.connect", cfg.Name): cfs.connect, - fmt.Sprintf("consul.%s.grpc", cfg.Name): cfs.grpc(consulConfig.Scheme, logger), - fmt.Sprintf("consul.%s.ft.namespaces", cfg.Name): cfs.namespaces, - fmt.Sprintf("consul.%s.partition", cfg.Name): cfs.partition, - fmt.Sprintf("consul.%s.dns.port", cfg.Name): cfs.dnsPort, - fmt.Sprintf("consul.%s.dns.addr", cfg.Name): cfs.dnsAddr(logger), + fmt.Sprintf("consul.%s.server", cfg.Name): cfs.server, + fmt.Sprintf("consul.%s.version", cfg.Name): cfs.version, + fmt.Sprintf("consul.%s.sku", cfg.Name): cfs.sku, + fmt.Sprintf("consul.%s.revision", cfg.Name): cfs.revision, + fmt.Sprintf("consul.%s.datacenter", cfg.Name): cfs.dc, + fmt.Sprintf("consul.%s.segment", cfg.Name): cfs.segment, + fmt.Sprintf("consul.%s.connect", cfg.Name): cfs.connect, + fmt.Sprintf("consul.%s.grpc", cfg.Name): cfs.grpc(consulConfig.Scheme, logger), + fmt.Sprintf("consul.%s.ft.namespaces", cfg.Name): cfs.namespaces, + fmt.Sprintf("consul.%s.partition", cfg.Name): cfs.partition, + fmt.Sprintf("consul.%s.dns.port", cfg.Name): cfs.dnsPort, + fmt.Sprintf("unique.consul.%s.dns.addr", cfg.Name): cfs.dnsAddr(logger), } } diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 848a78553..078c93bf3 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -696,19 +696,19 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) { err := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp) must.NoError(t, err) must.Eq(t, map[string]string{ - "consul.datacenter": "dc1", - "consul.revision": "22ce6c6ad", - "consul.segment": "seg1", - "consul.server": "true", - "consul.sku": "ent", - "consul.version": "1.9.5+ent", - "consul.ft.namespaces": "true", - "consul.connect": "true", - "consul.grpc": "8502", - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", - "consul.partition": "default", - "unique.consul.name": "HAL9000", + "consul.datacenter": "dc1", + "consul.revision": "22ce6c6ad", + "consul.segment": "seg1", + "consul.server": "true", + "consul.sku": "ent", + "consul.version": "1.9.5+ent", + "consul.ft.namespaces": "true", + "consul.connect": "true", + "consul.grpc": "8502", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", + "consul.partition": "default", + "unique.consul.name": "HAL9000", }, resp.Attributes) must.True(t, resp.Detected) @@ -752,19 +752,19 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) { err4 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp4) must.NoError(t, err4) must.Eq(t, map[string]string{ - "consul.datacenter": "dc1", - "consul.revision": "22ce6c6ad", - "consul.segment": "seg1", - "consul.server": "true", - "consul.sku": "ent", - "consul.version": "1.9.5+ent", - "consul.ft.namespaces": "true", - "consul.connect": "true", - "consul.grpc": "8502", - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", - "consul.partition": "default", - "unique.consul.name": "HAL9000", + "consul.datacenter": "dc1", + "consul.revision": "22ce6c6ad", + "consul.segment": "seg1", + "consul.server": "true", + "consul.sku": "ent", + "consul.version": "1.9.5+ent", + "consul.ft.namespaces": "true", + "consul.connect": "true", + "consul.grpc": "8502", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", + "consul.partition": "default", + "unique.consul.name": "HAL9000", }, resp4.Attributes) // consul now available again diff --git a/client/fingerprint/nomad.go b/client/fingerprint/nomad.go 
index cbdc1c3b4..2caadf29a 100644 --- a/client/fingerprint/nomad.go +++ b/client/fingerprint/nomad.go @@ -22,7 +22,7 @@ func NewNomadFingerprint(logger log.Logger) Fingerprint { } func (f *NomadFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { - resp.AddAttribute("nomad.advertise.address", req.Node.HTTPAddr) + resp.AddAttribute("unique.advertise.address", req.Node.HTTPAddr) resp.AddAttribute("nomad.version", req.Config.Version.VersionNumber()) resp.AddAttribute("nomad.revision", req.Config.Version.Revision) resp.AddAttribute("nomad.service_discovery", strconv.FormatBool(req.Config.NomadServiceDiscovery)) diff --git a/client/fingerprint/nomad_test.go b/client/fingerprint/nomad_test.go index dbb645aa1..d72130c89 100644 --- a/client/fingerprint/nomad_test.go +++ b/client/fingerprint/nomad_test.go @@ -57,7 +57,7 @@ func TestNomadFingerprint(t *testing.T) { t.Fatalf("incorrect revision") } - if response.Attributes["nomad.advertise.address"] != h { + if response.Attributes["unique.advertise.address"] != h { t.Fatalf("incorrect advertise address") } diff --git a/nomad/structs/node_class.go b/nomad/structs/node_class.go index c6b101620..08d4ab0c2 100644 --- a/nomad/structs/node_class.go +++ b/nomad/structs/node_class.go @@ -123,6 +123,8 @@ func EscapedConstraints(constraints []*Constraint) []*Constraint { // computed node class optimization. func constraintTargetEscapes(target string) bool { switch { + case strings.HasPrefix(target, "${unique."): + return true case strings.HasPrefix(target, "${node.unique."): return true case strings.HasPrefix(target, "${attr.unique."): diff --git a/nomad/structs/node_class_test.go b/nomad/structs/node_class_test.go index fe8265dc8..e6c160747 100644 --- a/nomad/structs/node_class_test.go +++ b/nomad/structs/node_class_test.go @@ -4,7 +4,6 @@ package structs import ( - "reflect" "testing" "github.com/hashicorp/nomad/ci" @@ -276,8 +275,7 @@ func TestNode_EscapedConstraints(t *testing.T) { Operand: "!=", } constraints := []*Constraint{ne1, ne2, ne3, e1, e2, e3} - expected := []*Constraint{ne1, ne2, ne3} - if act := EscapedConstraints(constraints); reflect.DeepEqual(act, expected) { - t.Fatalf("EscapedConstraints(%v) returned %v; want %v", constraints, act, expected) - } + expected := []*Constraint{e1, e2, e3} + must.Eq(t, expected, EscapedConstraints(constraints), + must.Sprintf("expected unique fields to escape constraints")) } diff --git a/scheduler/benchmarks/benchmarks_test.go b/scheduler/benchmarks/benchmarks_test.go index 7ab141037..cc2c7668d 100644 --- a/scheduler/benchmarks/benchmarks_test.go +++ b/scheduler/benchmarks/benchmarks_test.go @@ -5,9 +5,10 @@ package benchmarks import ( "fmt" + "strings" "testing" - "github.com/stretchr/testify/require" + "github.com/shoenig/test/must" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" @@ -38,7 +39,7 @@ func BenchmarkSchedulerExample(b *testing.B) { upsertNodes(h, 5000, 100) iter, err := h.State.Nodes(nil) - require.NoError(b, err) + must.NoError(b, err) nodes := 0 for { raw := iter.Next() @@ -47,8 +48,8 @@ func BenchmarkSchedulerExample(b *testing.B) { } nodes++ } - require.Equal(b, 5000, nodes) - job := generateJob(true, 600) + must.Eq(b, 5000, nodes) + job := generateJob(true, 600, 100) eval = upsertJob(h, job) } @@ -58,14 +59,14 @@ func BenchmarkSchedulerExample(b *testing.B) { // benchmarking a successful run and not a failed plan. 
 	{
 		err := h.Process(scheduler.NewServiceScheduler, eval)
-		require.NoError(b, err)
-		require.Len(b, h.Plans, 1)
-		require.False(b, h.Plans[0].IsNoOp())
+		must.NoError(b, err)
+		must.Len(b, 1, h.Plans)
+		must.False(b, h.Plans[0].IsNoOp())
 	}
 
 	for i := 0; i < b.N; i++ {
 		err := h.Process(scheduler.NewServiceScheduler, eval)
-		require.NoError(b, err)
+		must.NoError(b, err)
 	}
 }
@@ -73,9 +74,9 @@
 // variety of cluster sizes, with both spread and non-spread jobs
 func BenchmarkServiceScheduler(b *testing.B) {
 
-	clusterSizes := []int{1000, 5000, 10000}
-	rackSets := []int{10, 25, 50, 75}
-	jobSizes := []int{300, 600, 900, 1200}
+	clusterSizes := []int{500, 1000, 5000, 10000}
+	rackSets := []int{25, 50, 75}
+	jobSizes := []int{50, 300, 600, 900, 1200}
 
 	type benchmark struct {
 		name string
@@ -112,18 +113,21 @@
 	}
 
 	for _, bm := range benchmarks {
+		job := generateJob(bm.withSpread, bm.jobSize, bm.racks)
+		h := scheduler.NewHarness(b)
+		h.SetNoSubmit()
+		upsertNodes(h, bm.clusterSize, bm.racks)
+		eval := upsertJob(h, job)
+		b.ResetTimer()
+
 		b.Run(bm.name, func(b *testing.B) {
-			h := scheduler.NewHarness(b)
-			upsertNodes(h, bm.clusterSize, bm.racks)
-			job := generateJob(bm.withSpread, bm.jobSize)
-			eval := upsertJob(h, job)
-			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
 				err := h.Process(scheduler.NewServiceScheduler, eval)
-				require.NoError(b, err)
+				must.NoError(b, err)
 			}
 		})
 	}
+
 }
 
 func upsertJob(h *scheduler.Harness, job *structs.Job) *structs.Evaluation {
@@ -147,13 +151,26 @@
 	return eval
 }
 
-func generateJob(withSpread bool, jobSize int) *structs.Job {
+func generateJob(withSpread bool, jobSize int, racks int) *structs.Job {
 	job := mock.Job()
 	job.Datacenters = []string{"dc-1", "dc-2"}
 	if withSpread {
 		job.Spreads = []*structs.Spread{{Attribute: "${meta.rack}"}}
 	}
-	job.Constraints = []*structs.Constraint{}
+
+	// only half the racks will be considered eligible
+	rackTargets := []string{}
+	for i := range racks / 2 {
+		rackTargets = append(rackTargets, fmt.Sprintf("r%d", i))
+	}
+	rackTarget := strings.Join(rackTargets, ",")
+	job.Constraints = []*structs.Constraint{
+		{
+			LTarget: "${meta.rack}",
+			RTarget: rackTarget,
+			Operand: "set_contains_any",
+		},
+	}
 	job.TaskGroups[0].Count = jobSize
 	job.TaskGroups[0].Networks = nil
 	job.TaskGroups[0].Services = []*structs.Service{}
@@ -173,6 +190,7 @@ func upsertNodes(h *scheduler.Harness, count, racks int) {
 		node.Datacenter = datacenters[i%2]
 		node.Meta = map[string]string{}
 		node.Meta["rack"] = fmt.Sprintf("r%d", i%racks)
+		node.Attributes["unique.advertise.address"] = fmt.Sprintf("192.168.%d.%d", i%10, i%120)
 
 		memoryMB := 32000
 		diskMB := 100 * 1024
@@ -196,6 +214,7 @@
 			},
 		}
 		node.NodeResources.Compatibility()
+		node.ComputeClass()
 
 		err := h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)
 		if err != nil {
diff --git a/scheduler/testing.go b/scheduler/testing.go
index 347a27906..472c3d12f 100644
--- a/scheduler/testing.go
+++ b/scheduler/testing.go
@@ -65,6 +65,9 @@ type Harness struct {
 	optimizePlan              bool
 	serversMeetMinimumVersion bool
+
+	// don't actually write plans back to state
+	noSubmit bool
 }
 
 // NewHarness is used to make a new testing harness
@@ -179,6 +182,10 @@ func (h *Harness) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, State, er
 		req.NodePreemptions = preemptedAllocs
 	}
 
+	if h.noSubmit {
+		return result, nil, nil 
+ } + // Apply the full plan err := h.State.UpsertPlanResults(structs.MsgTypeTestSetup, index, &req) return result, nil, err @@ -303,3 +310,7 @@ func (h *Harness) AssertEvalStatus(t testing.TB, state string) { update := h.Evals[0] require.Equal(t, state, update.Status) } + +func (h *Harness) SetNoSubmit() { + h.noSubmit = true +}
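
For readers without the surrounding code, the following is a minimal, self-contained Go sketch of the idea the commit message describes. It is not Nomad's implementation: the real logic lives in `nomad/structs/node_class.go` and uses a structure hash over many node fields, while the FNV hash, helper names, and example attribute values below are illustrative assumptions. Only the attribute names and the `${unique.` / `${node.unique.` / `${attr.unique.` prefixes are taken from the patch above.

```go
package main

import (
	"fmt"
	"hash/fnv"
	"sort"
	"strings"
)

// computedClass hashes only the attributes shared across nodes, skipping the
// "unique." prefix so per-node values (addresses, names) cannot force every
// node into its own class. The FNV hash is a stand-in for Nomad's structure hash.
func computedClass(attrs map[string]string) uint64 {
	keys := make([]string, 0, len(attrs))
	for k := range attrs {
		if strings.HasPrefix(k, "unique.") {
			continue // unique-per-node values must not influence the class
		}
		keys = append(keys, k)
	}
	sort.Strings(keys) // deterministic ordering before hashing

	h := fnv.New64a()
	for _, k := range keys {
		h.Write([]byte(k))
		h.Write([]byte{0})
		h.Write([]byte(attrs[k]))
		h.Write([]byte{0})
	}
	return h.Sum64()
}

// constraintTargetEscapes mirrors the check this patch extends: a constraint
// that targets a unique attribute means the node "escapes" its computed
// class, so a feasibility result cannot be reused for other nodes of the
// same class.
func constraintTargetEscapes(target string) bool {
	switch {
	case strings.HasPrefix(target, "${unique."):
		return true
	case strings.HasPrefix(target, "${node.unique."):
		return true
	case strings.HasPrefix(target, "${attr.unique."):
		return true
	default:
		return false
	}
}

func main() {
	nodeA := map[string]string{
		"consul.dns.port":          "8600",
		"unique.consul.dns.addr":   "192.168.1.117",
		"unique.advertise.address": "192.168.1.117:4646",
	}
	nodeB := map[string]string{
		"consul.dns.port":          "8600",
		"unique.consul.dns.addr":   "192.168.1.118",
		"unique.advertise.address": "192.168.1.118:4646",
	}

	// With the addresses under "unique.", both nodes hash to the same class
	// even though their addresses differ.
	fmt.Println(computedClass(nodeA) == computedClass(nodeB)) // true

	// A constraint on a unique attribute escapes the class optimization.
	fmt.Println(constraintTargetEscapes("${attr.unique.consul.dns.addr}")) // true
	fmt.Println(constraintTargetEscapes("${attr.consul.dns.port}"))        // false
}
```

Before this change, `consul.dns.addr` and `nomad.advertise.address` sat outside the `unique.` namespace, so every node hashed to a distinct class and the scheduler's map of already-failed classes never matched another node.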