From 1788bfb42eff36f2ee88ae20404ea30f8494cee0 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Mon, 3 Mar 2025 09:28:32 -0500
Subject: [PATCH] remove addresses from node class hash (#24942)

When a node is fingerprinted, we calculate a "computed class" from a hash over
a subset of its fields and attributes. In the scheduler, when a given node
fails feasibility checking (before fit checking), we know that no other node
of that same class will be feasible, and we add the hash to a map so we can
reject them early. This hash cannot include any values that are unique to a
given node; otherwise, no other node will have the same hash and we never save
ourselves the work of feasibility checking those nodes.

In #4390 we introduced the `nomad.advertise.address` attribute, and in #19969
we introduced the `consul.dns.addr` attribute. Both of these are unique per
node and break the hash. Additionally, when checking whether a node escaped
the class, we were not filtering for constraint targets that start with
`unique.`. The test for this behavior, introduced in #708, had an inverted
assertion, which allowed the bug to go unnoticed since the early days of
Nomad.

Ref: https://github.com/hashicorp/nomad/pull/708
Ref: https://github.com/hashicorp/nomad/pull/4390
Ref: https://github.com/hashicorp/nomad/pull/19969
---
 .changelog/24942.txt                      |  7 +++
 client/allocrunner/networking_cni.go      |  4 +-
 client/allocrunner/networking_cni_test.go | 20 ++++----
 client/fingerprint/consul.go              | 50 ++++++++++----------
 client/fingerprint/consul_test.go         | 52 ++++++++++-----------
 client/fingerprint/nomad.go               |  2 +-
 client/fingerprint/nomad_test.go          |  2 +-
 nomad/structs/node_class.go               |  2 +
 nomad/structs/node_class_test.go          |  8 ++--
 scheduler/benchmarks/benchmarks_test.go   | 57 +++++++++++++++--------
 scheduler/testing.go                      | 11 +++++
 11 files changed, 126 insertions(+), 89 deletions(-)
 create mode 100644 .changelog/24942.txt

diff --git a/.changelog/24942.txt b/.changelog/24942.txt
new file mode 100644
index 000000000..c02ad6b12
--- /dev/null
+++ b/.changelog/24942.txt
@@ -0,0 +1,7 @@
+```release-note:bug
+scheduler: Fixed a bug where node class hashes included unique attributes, making scheduling more costly
+```
+
+```release-note:breaking-change
+node: The node attribute `consul.dns.addr` has been changed to `unique.consul.dns.addr`. The node attribute `nomad.advertise.address` has been changed to `unique.advertise.address`.
+```
diff --git a/client/allocrunner/networking_cni.go b/client/allocrunner/networking_cni.go
index 4f39eb218..7a8663501 100644
--- a/client/allocrunner/networking_cni.go
+++ b/client/allocrunner/networking_cni.go
@@ -413,10 +413,10 @@ func (c *cniNetworkConfigurator) setupTransparentProxyArgs(alloc *structs.Alloca
 func (c *cniNetworkConfigurator) dnsFromAttrs(cluster string) (string, int) {
 	var dnsAddrAttr, dnsPortAttr string
 	if cluster == structs.ConsulDefaultCluster || cluster == "" {
-		dnsAddrAttr = "consul.dns.addr"
+		dnsAddrAttr = "unique.consul.dns.addr"
 		dnsPortAttr = "consul.dns.port"
 	} else {
-		dnsAddrAttr = "consul." + cluster + ".dns.addr"
+		dnsAddrAttr = "unique.consul." + cluster + ".dns.addr"
 		dnsPortAttr = "consul." 
+ cluster + ".dns.port" } diff --git a/client/allocrunner/networking_cni_test.go b/client/allocrunner/networking_cni_test.go index f76f2b246..43745d3d7 100644 --- a/client/allocrunner/networking_cni_test.go +++ b/client/allocrunner/networking_cni_test.go @@ -257,8 +257,8 @@ func TestSetup(t *testing.T) { } nodeAddrs := map[string]string{ - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", } nodeMeta := map[string]string{ "connect.transparent_proxy.default_outbound_port": "15001", @@ -554,8 +554,8 @@ func TestCNI_setupTproxyArgs(t *testing.T) { } nodeAttrs := map[string]string{ - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", } alloc := mock.ConnectAlloc() @@ -716,8 +716,8 @@ func TestCNI_setupTproxyArgs(t *testing.T) { { name: "tproxy with consul dns disabled", nodeAttrs: map[string]string{ - "consul.dns.port": "-1", - "consul.dns.addr": "192.168.1.117", + "consul.dns.port": "-1", + "unique.consul.dns.addr": "192.168.1.117", }, tproxySpec: &structs.ConsulTransparentProxy{}, expectIPConfig: &iptables.Config{ @@ -732,10 +732,10 @@ func TestCNI_setupTproxyArgs(t *testing.T) { name: "tproxy for other cluster with default consul dns disabled", cluster: "infra", nodeAttrs: map[string]string{ - "consul.dns.port": "-1", - "consul.dns.addr": "192.168.1.110", - "consul.infra.dns.port": "8600", - "consul.infra.dns.addr": "192.168.1.117", + "consul.dns.port": "-1", + "unique.consul.dns.addr": "192.168.1.110", + "consul.infra.dns.port": "8600", + "unique.consul.infra.dns.addr": "192.168.1.117", }, tproxySpec: &structs.ConsulTransparentProxy{}, expectIPConfig: &iptables.Config{ diff --git a/client/fingerprint/consul.go b/client/fingerprint/consul.go index d76136604..2ff906857 100644 --- a/client/fingerprint/consul.go +++ b/client/fingerprint/consul.go @@ -190,34 +190,34 @@ func (cfs *consulState) initialize(cfg *config.ConsulConfig, logger hclog.Logger if cfg.Name == structs.ConsulDefaultCluster { cfs.readers = map[string]valueReader{ - "consul.server": cfs.server, - "consul.version": cfs.version, - "consul.sku": cfs.sku, - "consul.revision": cfs.revision, - "unique.consul.name": cfs.name, // note: won't have this for non-default clusters - "consul.datacenter": cfs.dc, - "consul.segment": cfs.segment, - "consul.connect": cfs.connect, - "consul.grpc": cfs.grpc(consulConfig.Scheme, logger), - "consul.ft.namespaces": cfs.namespaces, - "consul.partition": cfs.partition, - "consul.dns.port": cfs.dnsPort, - "consul.dns.addr": cfs.dnsAddr(logger), + "consul.server": cfs.server, + "consul.version": cfs.version, + "consul.sku": cfs.sku, + "consul.revision": cfs.revision, + "unique.consul.name": cfs.name, // note: won't have this for non-default clusters + "consul.datacenter": cfs.dc, + "consul.segment": cfs.segment, + "consul.connect": cfs.connect, + "consul.grpc": cfs.grpc(consulConfig.Scheme, logger), + "consul.ft.namespaces": cfs.namespaces, + "consul.partition": cfs.partition, + "consul.dns.port": cfs.dnsPort, + "unique.consul.dns.addr": cfs.dnsAddr(logger), } } else { cfs.readers = map[string]valueReader{ - fmt.Sprintf("consul.%s.server", cfg.Name): cfs.server, - fmt.Sprintf("consul.%s.version", cfg.Name): cfs.version, - fmt.Sprintf("consul.%s.sku", cfg.Name): cfs.sku, - fmt.Sprintf("consul.%s.revision", cfg.Name): cfs.revision, - fmt.Sprintf("consul.%s.datacenter", cfg.Name): cfs.dc, - fmt.Sprintf("consul.%s.segment", cfg.Name): 
cfs.segment, - fmt.Sprintf("consul.%s.connect", cfg.Name): cfs.connect, - fmt.Sprintf("consul.%s.grpc", cfg.Name): cfs.grpc(consulConfig.Scheme, logger), - fmt.Sprintf("consul.%s.ft.namespaces", cfg.Name): cfs.namespaces, - fmt.Sprintf("consul.%s.partition", cfg.Name): cfs.partition, - fmt.Sprintf("consul.%s.dns.port", cfg.Name): cfs.dnsPort, - fmt.Sprintf("consul.%s.dns.addr", cfg.Name): cfs.dnsAddr(logger), + fmt.Sprintf("consul.%s.server", cfg.Name): cfs.server, + fmt.Sprintf("consul.%s.version", cfg.Name): cfs.version, + fmt.Sprintf("consul.%s.sku", cfg.Name): cfs.sku, + fmt.Sprintf("consul.%s.revision", cfg.Name): cfs.revision, + fmt.Sprintf("consul.%s.datacenter", cfg.Name): cfs.dc, + fmt.Sprintf("consul.%s.segment", cfg.Name): cfs.segment, + fmt.Sprintf("consul.%s.connect", cfg.Name): cfs.connect, + fmt.Sprintf("consul.%s.grpc", cfg.Name): cfs.grpc(consulConfig.Scheme, logger), + fmt.Sprintf("consul.%s.ft.namespaces", cfg.Name): cfs.namespaces, + fmt.Sprintf("consul.%s.partition", cfg.Name): cfs.partition, + fmt.Sprintf("consul.%s.dns.port", cfg.Name): cfs.dnsPort, + fmt.Sprintf("unique.consul.%s.dns.addr", cfg.Name): cfs.dnsAddr(logger), } } diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 848a78553..078c93bf3 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -696,19 +696,19 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) { err := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp) must.NoError(t, err) must.Eq(t, map[string]string{ - "consul.datacenter": "dc1", - "consul.revision": "22ce6c6ad", - "consul.segment": "seg1", - "consul.server": "true", - "consul.sku": "ent", - "consul.version": "1.9.5+ent", - "consul.ft.namespaces": "true", - "consul.connect": "true", - "consul.grpc": "8502", - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", - "consul.partition": "default", - "unique.consul.name": "HAL9000", + "consul.datacenter": "dc1", + "consul.revision": "22ce6c6ad", + "consul.segment": "seg1", + "consul.server": "true", + "consul.sku": "ent", + "consul.version": "1.9.5+ent", + "consul.ft.namespaces": "true", + "consul.connect": "true", + "consul.grpc": "8502", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", + "consul.partition": "default", + "unique.consul.name": "HAL9000", }, resp.Attributes) must.True(t, resp.Detected) @@ -752,19 +752,19 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) { err4 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp4) must.NoError(t, err4) must.Eq(t, map[string]string{ - "consul.datacenter": "dc1", - "consul.revision": "22ce6c6ad", - "consul.segment": "seg1", - "consul.server": "true", - "consul.sku": "ent", - "consul.version": "1.9.5+ent", - "consul.ft.namespaces": "true", - "consul.connect": "true", - "consul.grpc": "8502", - "consul.dns.addr": "192.168.1.117", - "consul.dns.port": "8600", - "consul.partition": "default", - "unique.consul.name": "HAL9000", + "consul.datacenter": "dc1", + "consul.revision": "22ce6c6ad", + "consul.segment": "seg1", + "consul.server": "true", + "consul.sku": "ent", + "consul.version": "1.9.5+ent", + "consul.ft.namespaces": "true", + "consul.connect": "true", + "consul.grpc": "8502", + "unique.consul.dns.addr": "192.168.1.117", + "consul.dns.port": "8600", + "consul.partition": "default", + "unique.consul.name": "HAL9000", }, resp4.Attributes) // consul now available again diff --git a/client/fingerprint/nomad.go b/client/fingerprint/nomad.go 
index cbdc1c3b4..2caadf29a 100644 --- a/client/fingerprint/nomad.go +++ b/client/fingerprint/nomad.go @@ -22,7 +22,7 @@ func NewNomadFingerprint(logger log.Logger) Fingerprint { } func (f *NomadFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { - resp.AddAttribute("nomad.advertise.address", req.Node.HTTPAddr) + resp.AddAttribute("unique.advertise.address", req.Node.HTTPAddr) resp.AddAttribute("nomad.version", req.Config.Version.VersionNumber()) resp.AddAttribute("nomad.revision", req.Config.Version.Revision) resp.AddAttribute("nomad.service_discovery", strconv.FormatBool(req.Config.NomadServiceDiscovery)) diff --git a/client/fingerprint/nomad_test.go b/client/fingerprint/nomad_test.go index dbb645aa1..d72130c89 100644 --- a/client/fingerprint/nomad_test.go +++ b/client/fingerprint/nomad_test.go @@ -57,7 +57,7 @@ func TestNomadFingerprint(t *testing.T) { t.Fatalf("incorrect revision") } - if response.Attributes["nomad.advertise.address"] != h { + if response.Attributes["unique.advertise.address"] != h { t.Fatalf("incorrect advertise address") } diff --git a/nomad/structs/node_class.go b/nomad/structs/node_class.go index c6b101620..08d4ab0c2 100644 --- a/nomad/structs/node_class.go +++ b/nomad/structs/node_class.go @@ -123,6 +123,8 @@ func EscapedConstraints(constraints []*Constraint) []*Constraint { // computed node class optimization. func constraintTargetEscapes(target string) bool { switch { + case strings.HasPrefix(target, "${unique."): + return true case strings.HasPrefix(target, "${node.unique."): return true case strings.HasPrefix(target, "${attr.unique."): diff --git a/nomad/structs/node_class_test.go b/nomad/structs/node_class_test.go index fe8265dc8..e6c160747 100644 --- a/nomad/structs/node_class_test.go +++ b/nomad/structs/node_class_test.go @@ -4,7 +4,6 @@ package structs import ( - "reflect" "testing" "github.com/hashicorp/nomad/ci" @@ -276,8 +275,7 @@ func TestNode_EscapedConstraints(t *testing.T) { Operand: "!=", } constraints := []*Constraint{ne1, ne2, ne3, e1, e2, e3} - expected := []*Constraint{ne1, ne2, ne3} - if act := EscapedConstraints(constraints); reflect.DeepEqual(act, expected) { - t.Fatalf("EscapedConstraints(%v) returned %v; want %v", constraints, act, expected) - } + expected := []*Constraint{e1, e2, e3} + must.Eq(t, expected, EscapedConstraints(constraints), + must.Sprintf("expected unique fields to escape constraints")) } diff --git a/scheduler/benchmarks/benchmarks_test.go b/scheduler/benchmarks/benchmarks_test.go index 7ab141037..cc2c7668d 100644 --- a/scheduler/benchmarks/benchmarks_test.go +++ b/scheduler/benchmarks/benchmarks_test.go @@ -5,9 +5,10 @@ package benchmarks import ( "fmt" + "strings" "testing" - "github.com/stretchr/testify/require" + "github.com/shoenig/test/must" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" @@ -38,7 +39,7 @@ func BenchmarkSchedulerExample(b *testing.B) { upsertNodes(h, 5000, 100) iter, err := h.State.Nodes(nil) - require.NoError(b, err) + must.NoError(b, err) nodes := 0 for { raw := iter.Next() @@ -47,8 +48,8 @@ func BenchmarkSchedulerExample(b *testing.B) { } nodes++ } - require.Equal(b, 5000, nodes) - job := generateJob(true, 600) + must.Eq(b, 5000, nodes) + job := generateJob(true, 600, 100) eval = upsertJob(h, job) } @@ -58,14 +59,14 @@ func BenchmarkSchedulerExample(b *testing.B) { // benchmarking a successful run and not a failed plan. 
 	{
 		err := h.Process(scheduler.NewServiceScheduler, eval)
-		require.NoError(b, err)
-		require.Len(b, h.Plans, 1)
-		require.False(b, h.Plans[0].IsNoOp())
+		must.NoError(b, err)
+		must.Len(b, 1, h.Plans)
+		must.False(b, h.Plans[0].IsNoOp())
 	}
 
 	for i := 0; i < b.N; i++ {
 		err := h.Process(scheduler.NewServiceScheduler, eval)
-		require.NoError(b, err)
+		must.NoError(b, err)
 	}
 }
@@ -73,9 +74,9 @@
 // variety of cluster sizes, with both spread and non-spread jobs
 func BenchmarkServiceScheduler(b *testing.B) {
 
-	clusterSizes := []int{1000, 5000, 10000}
-	rackSets := []int{10, 25, 50, 75}
-	jobSizes := []int{300, 600, 900, 1200}
+	clusterSizes := []int{500, 1000, 5000, 10000}
+	rackSets := []int{25, 50, 75}
+	jobSizes := []int{50, 300, 600, 900, 1200}
 
 	type benchmark struct {
 		name string
@@ -112,18 +113,21 @@
 	}
 
 	for _, bm := range benchmarks {
+		job := generateJob(bm.withSpread, bm.jobSize, bm.racks)
+		h := scheduler.NewHarness(b)
+		h.SetNoSubmit()
+		upsertNodes(h, bm.clusterSize, bm.racks)
+		eval := upsertJob(h, job)
+		b.ResetTimer()
+
 		b.Run(bm.name, func(b *testing.B) {
-			h := scheduler.NewHarness(b)
-			upsertNodes(h, bm.clusterSize, bm.racks)
-			job := generateJob(bm.withSpread, bm.jobSize)
-			eval := upsertJob(h, job)
-			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
 				err := h.Process(scheduler.NewServiceScheduler, eval)
-				require.NoError(b, err)
+				must.NoError(b, err)
 			}
 		})
 	}
+
 }
 
 func upsertJob(h *scheduler.Harness, job *structs.Job) *structs.Evaluation {
@@ -147,13 +151,26 @@
 	return eval
 }
 
-func generateJob(withSpread bool, jobSize int) *structs.Job {
+func generateJob(withSpread bool, jobSize int, racks int) *structs.Job {
 	job := mock.Job()
 	job.Datacenters = []string{"dc-1", "dc-2"}
 	if withSpread {
 		job.Spreads = []*structs.Spread{{Attribute: "${meta.rack}"}}
 	}
-	job.Constraints = []*structs.Constraint{}
+
+	// only half the racks will be considered eligible
+	rackTargets := []string{}
+	for i := range racks / 2 {
+		rackTargets = append(rackTargets, fmt.Sprintf("r%d", i))
+	}
+	rackTarget := strings.Join(rackTargets, ",")
+	job.Constraints = []*structs.Constraint{
+		{
+			LTarget: "${meta.rack}",
+			RTarget: rackTarget,
+			Operand: "set_contains_any",
+		},
+	}
 	job.TaskGroups[0].Count = jobSize
 	job.TaskGroups[0].Networks = nil
 	job.TaskGroups[0].Services = []*structs.Service{}
@@ -173,6 +190,7 @@ func upsertNodes(h *scheduler.Harness, count, racks int) {
 		node.Datacenter = datacenters[i%2]
 		node.Meta = map[string]string{}
 		node.Meta["rack"] = fmt.Sprintf("r%d", i%racks)
+		node.Attributes["unique.advertise.address"] = fmt.Sprintf("192.168.%d.%d", i%10, i%120)
 
 		memoryMB := 32000
 		diskMB := 100 * 1024
@@ -196,6 +214,7 @@
 			},
 		}
 		node.NodeResources.Compatibility()
+		node.ComputeClass()
 
 		err := h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)
 		if err != nil {
diff --git a/scheduler/testing.go b/scheduler/testing.go
index 347a27906..472c3d12f 100644
--- a/scheduler/testing.go
+++ b/scheduler/testing.go
@@ -65,6 +65,9 @@ type Harness struct {
 	optimizePlan              bool
 	serversMeetMinimumVersion bool
+
+	// don't actually write plans back to state
+	noSubmit bool
 }
 
 // NewHarness is used to make a new testing harness
@@ -179,6 +182,10 @@ func (h *Harness) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, State, er
 		req.NodePreemptions = preemptedAllocs
 	}
 
+	if h.noSubmit {
+		return result, nil, nil 
+ } + // Apply the full plan err := h.State.UpsertPlanResults(structs.MsgTypeTestSetup, index, &req) return result, nil, err @@ -303,3 +310,7 @@ func (h *Harness) AssertEvalStatus(t testing.TB, state string) { update := h.Evals[0] require.Equal(t, state, update.Status) } + +func (h *Harness) SetNoSubmit() { + h.noSubmit = true +}
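
For readers without the surrounding code, the following is a minimal, self-contained Go sketch of the idea the commit message describes. It is not Nomad's implementation: the real logic lives in `nomad/structs/node_class.go` and uses a structure hash over many node fields, while the FNV hash, helper names, and example attribute values below are illustrative assumptions. Only the attribute names and the `${unique.` / `${node.unique.` / `${attr.unique.` prefixes are taken from the patch above.

```go
package main

import (
	"fmt"
	"hash/fnv"
	"sort"
	"strings"
)

// computedClass hashes only the attributes shared across nodes, skipping the
// "unique." prefix so per-node values (addresses, names) cannot force every
// node into its own class. The FNV hash is a stand-in for Nomad's structure hash.
func computedClass(attrs map[string]string) uint64 {
	keys := make([]string, 0, len(attrs))
	for k := range attrs {
		if strings.HasPrefix(k, "unique.") {
			continue // unique-per-node values must not influence the class
		}
		keys = append(keys, k)
	}
	sort.Strings(keys) // deterministic ordering before hashing

	h := fnv.New64a()
	for _, k := range keys {
		h.Write([]byte(k))
		h.Write([]byte{0})
		h.Write([]byte(attrs[k]))
		h.Write([]byte{0})
	}
	return h.Sum64()
}

// constraintTargetEscapes mirrors the check this patch extends: a constraint
// that targets a unique attribute means the node "escapes" its computed
// class, so a feasibility result cannot be reused for other nodes of the
// same class.
func constraintTargetEscapes(target string) bool {
	switch {
	case strings.HasPrefix(target, "${unique."):
		return true
	case strings.HasPrefix(target, "${node.unique."):
		return true
	case strings.HasPrefix(target, "${attr.unique."):
		return true
	default:
		return false
	}
}

func main() {
	nodeA := map[string]string{
		"consul.dns.port":          "8600",
		"unique.consul.dns.addr":   "192.168.1.117",
		"unique.advertise.address": "192.168.1.117:4646",
	}
	nodeB := map[string]string{
		"consul.dns.port":          "8600",
		"unique.consul.dns.addr":   "192.168.1.118",
		"unique.advertise.address": "192.168.1.118:4646",
	}

	// With the addresses under "unique.", both nodes hash to the same class
	// even though their addresses differ.
	fmt.Println(computedClass(nodeA) == computedClass(nodeB)) // true

	// A constraint on a unique attribute escapes the class optimization.
	fmt.Println(constraintTargetEscapes("${attr.unique.consul.dns.addr}")) // true
	fmt.Println(constraintTargetEscapes("${attr.consul.dns.port}"))        // false
}
```

Before this change, `consul.dns.addr` and `nomad.advertise.address` sat outside the `unique.` namespace, so every node hashed to a distinct class and the scheduler's map of already-failed classes never matched another node.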