From 55fe05d353ca2e37e135d0670c20773ff5c4e52d Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Thu, 17 Oct 2024 13:48:20 -0400
Subject: [PATCH] heartbeat: use leader's ACL token when failing heartbeat (#24241)

In #23838 we updated the `Node.Update` RPC handler we use for heartbeats
to be more strict about requiring node secrets. But when a node goes down,
it's the leader that sends the request to mark the node down via
`Node.Update` (to itself), and this request was missing the leader ACL
needed to authenticate to itself. Add the leader ACL to the request and
update the RPC handler test for disconnected-clients to use ACLs, which
would have detected this bug. Also added a note to the `Authenticate`
comment about how that authentication path requires the leader ACL.

Fixes: https://github.com/hashicorp/nomad/issues/24231
Ref: https://hashicorp.atlassian.net/browse/NET-11384
---
 .changelog/24241.txt        |  3 +++
 nomad/auth/auth.go          |  4 ++++
 nomad/heartbeat.go          |  3 ++-
 nomad/node_endpoint_test.go | 38 ++++++++++++++++++++++---------------
 testutil/wait.go            | 18 ++++++++++++++----
 5 files changed, 46 insertions(+), 20 deletions(-)
 create mode 100644 .changelog/24241.txt

diff --git a/.changelog/24241.txt b/.changelog/24241.txt
new file mode 100644
index 000000000..6ed70d5e4
--- /dev/null
+++ b/.changelog/24241.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+heartbeat: Fixed a bug where failed nodes would not be marked down
+```
diff --git a/nomad/auth/auth.go b/nomad/auth/auth.go
index b777f7c5d..38fd64455 100644
--- a/nomad/auth/auth.go
+++ b/nomad/auth/auth.go
@@ -93,6 +93,10 @@ func NewAuthenticator(cfg *AuthenticatorConfig) *Authenticator {
 // an ephemeral ACLToken makes the origin of the credential clear to RPC
 // handlers, who may have different behavior for internal vs external origins.
 //
+// Note: when making a server-to-server RPC that authenticates with this method,
+// the RPC *must* include the leader's ACL token. Use AuthenticateServerOnly for
+// requests that don't have access to the leader's ACL token.
+//
 // Note: when called on the follower we'll be making stale queries, so it's
 // possible if the follower is behind that the leader will get a different value
 // if an ACL token or allocation's WI has just been created.
diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go
index 6b3948118..be282b70d 100644
--- a/nomad/heartbeat.go
+++ b/nomad/heartbeat.go
@@ -163,7 +163,8 @@ func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
 		Status:    structs.NodeStatusDown,
 		NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),
 		WriteRequest: structs.WriteRequest{
-			Region: h.srv.config.Region,
+			Region:    h.srv.config.Region,
+			AuthToken: h.srv.getLeaderAcl(),
 		},
 	}

diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go
index 2f8c001a6..3a3d3acc2 100644
--- a/nomad/node_endpoint_test.go
+++ b/nomad/node_endpoint_test.go
@@ -952,7 +952,7 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	// Setup server with tighter heartbeat so we don't have to wait so long
 	// for nodes to go down.
 	heartbeatTTL := time.Duration(500*testutil.TestMultiplier()) * time.Millisecond
-	s, cleanupS := TestServer(t, func(c *Config) {
+	s, rootToken, cleanupS := TestACLServer(t, func(c *Config) {
 		c.MinHeartbeatTTL = heartbeatTTL
 		c.HeartbeatGrace = 2 * heartbeatTTL
 	})
@@ -1001,7 +1001,8 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	go heartbeat(heartbeatCtx)

 	// Wait for node to be ready.
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusReady)
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusReady, rootToken.SecretID)

 	// Register job with Disconnect.LostAfter
 	job := version.jobSpec(time.Hour)
@@ -1018,6 +1019,7 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 		WriteRequest: structs.WriteRequest{
 			Region:    "global",
 			Namespace: job.Namespace,
+			AuthToken: rootToken.SecretID,
 		},
 	}
 	var jobResp structs.JobRegisterResponse
@@ -1025,15 +1027,16 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	must.NoError(t, err)

 	// Wait for alloc to be pending in the server.
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusPending: 1,
-	})
+	}, rootToken.SecretID)

 	// Get allocs that node should run.
 	allocsReq := &structs.NodeSpecificRequest{
 		NodeID: node.ID,
 		QueryOptions: structs.QueryOptions{
-			Region: "global",
+			Region:    "global",
+			AuthToken: rootToken.SecretID,
 		},
 	}
 	var allocsResp structs.NodeAllocsResponse
@@ -1058,17 +1061,18 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	must.NoError(t, err)

 	// Wait for alloc to be running in the server.
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusRunning: 1,
-	})
+	}, rootToken.SecretID)

 	// Stop heartbeat and wait for the client to be disconnected and the alloc
 	// to be unknown.
 	cancelHeartbeat()
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusDisconnected)
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusDisconnected, rootToken.SecretID)
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusUnknown: 1,
-	})
+	}, rootToken.SecretID)

 	// Restart heartbeat to reconnect node.
 	heartbeatCtx, cancelHeartbeat = context.WithCancel(context.Background())
@@ -1081,7 +1085,8 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	// allocs status with the server so the scheduler have the necessary
 	// information to avoid unnecessary placements.
 	time.Sleep(3 * heartbeatTTL)
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusInit)
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusInit, rootToken.SecretID)

 	// Get allocs that node should run.
 	// The node should only have one alloc assigned until it updates its allocs
@@ -1089,7 +1094,8 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	allocsReq = &structs.NodeSpecificRequest{
 		NodeID: node.ID,
 		QueryOptions: structs.QueryOptions{
-			Region: "global",
+			Region:    "global",
+			AuthToken: rootToken.SecretID,
 		},
 	}
 	err = msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", allocsReq, &allocsResp)
@@ -1104,10 +1110,12 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	// - client status is ready.
 	// - only 1 alloc and the alloc is running.
 	// - all evals are terminal, so cluster is in a stable state.
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusReady)
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusReady, rootToken.SecretID)
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusRunning: 1,
-	})
+	}, rootToken.SecretID)
+
 	testutil.WaitForResult(func() (bool, error) {
 		state := s.fsm.State()
 		ws := memdb.NewWatchSet()
diff --git a/testutil/wait.go b/testutil/wait.go
index 00d0c0232..2a51c6b90 100644
--- a/testutil/wait.go
+++ b/testutil/wait.go
@@ -193,8 +193,15 @@ func WaitForClient(t testing.TB, rpc rpcFn, nodeID string, region string) {
 	WaitForClientStatus(t, rpc, nodeID, region, structs.NodeStatusReady)
 }

-// WaitForClientStatus blocks until the client is in the expected status.
-func WaitForClientStatus(t testing.TB, rpc rpcFn, nodeID string, region string, status string) {
+// WaitForClientStatus blocks until the client is in the expected status
+func WaitForClientStatus(t testing.TB, rpc rpcFn, nodeID, region, status string) {
+	t.Helper()
+	WaitForClientStatusWithToken(t, rpc, nodeID, region, status, "")
+}
+
+// WaitForClientStatusWithToken blocks until the client is in the expected
+// status, for use with ACLs enabled
+func WaitForClientStatusWithToken(t testing.TB, rpc rpcFn, nodeID, region, status, token string) {
 	t.Helper()

 	if region == "" {
@@ -202,8 +209,11 @@ func WaitForClientStatus(t testing.TB, rpc rpcFn, nodeID string, region string,
 	}
 	WaitForResult(func() (bool, error) {
 		req := structs.NodeSpecificRequest{
-			NodeID:       nodeID,
-			QueryOptions: structs.QueryOptions{Region: region},
+			NodeID: nodeID,
+			QueryOptions: structs.QueryOptions{
+				Region:    region,
+				AuthToken: token,
+			},
 		}
 		var out structs.SingleNodeResponse
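
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained Go illustration of the shape of the fix in `invalidateHeartbeat`, using hypothetical trimmed-down stand-ins for `structs.WriteRequest`, `structs.NodeUpdateStatusRequest`, and the server's `getLeaderAcl` accessor; only the field names follow the diff above. It shows the one substantive change: the node-down request the leader issues to itself now carries the leader's ACL token, so the stricter `Node.Update` authentication introduced in #23838 accepts it.

// Sketch only; the real types and RPC plumbing live in the Nomad codebase.
package main

import "fmt"

// WriteRequest mirrors just the two fields the heartbeat path touches.
type WriteRequest struct {
	Region    string
	AuthToken string
}

// NodeUpdateStatusRequest mirrors the fields used when marking a node down.
type NodeUpdateStatusRequest struct {
	NodeID       string
	Status       string
	WriteRequest WriteRequest
}

// server is a stand-in for the Nomad server; leaderACL mimics what the real
// getLeaderAcl() returns when this server is the leader.
type server struct {
	region    string
	leaderACL string
}

func (s *server) getLeaderAcl() string { return s.leaderACL }

// invalidateHeartbeat builds the request the leader sends to itself when a
// node misses its heartbeat. Attaching AuthToken is the substance of the fix:
// without it the request fails authentication and the node is never marked down.
func (s *server) invalidateHeartbeat(nodeID string) NodeUpdateStatusRequest {
	return NodeUpdateStatusRequest{
		NodeID: nodeID,
		Status: "down", // structs.NodeStatusDown in the real code
		WriteRequest: WriteRequest{
			Region:    s.region,
			AuthToken: s.getLeaderAcl(),
		},
	}
}

func main() {
	s := &server{region: "global", leaderACL: "example-leader-acl"}
	fmt.Printf("%+v\n", s.invalidateHeartbeat("node-1"))
}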