heartbeat: use leader's ACL token when failing heartbeat (#24241)

In #23838 we updated the `Node.Update` RPC handler we use for heartbeats to be
stricter about requiring node secrets. But when a node goes down, it's the
leader that sends itself the `Node.Update` request to mark the node down, and
that request was missing the leader ACL needed to authenticate.

Add the leader ACL to the request and update the RPC handler test for
disconnected clients to use ACLs, which would have caught this bug. Also add a
note to the `Authenticate` comment explaining that this authentication path
requires the leader ACL.

Fixes: https://github.com/hashicorp/nomad/issues/24231
Ref: https://hashicorp.atlassian.net/browse/NET-11384
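
The gist of the fix, as a self-contained sketch: the leader attaches its own
ACL token to the request it sends itself. The stub types below stand in for
Nomad's internals; only `structs.WriteRequest`, the `AuthToken` field, and
`getLeaderAcl` appear in the diff (inside `invalidateHeartbeat`), and the
helper name `markNodeDown` is invented for illustration.

```go
package main

import "fmt"

// Minimal stand-ins for the Nomad types involved; illustrative stubs only,
// not the real definitions from the Nomad codebase.
type WriteRequest struct {
    Region    string
    AuthToken string
}

type NodeUpdateStatusRequest struct {
    NodeID       string
    Status       string
    WriteRequest WriteRequest
}

type Server struct {
    region    string
    leaderACL string
}

// getLeaderAcl mirrors the accessor used in the diff to fetch the leader's
// ACL token; the real method lives on Nomad's server struct.
func (s *Server) getLeaderAcl() string { return s.leaderACL }

// markNodeDown sketches the fixed request construction: the request the
// leader sends to itself now carries the leader's ACL token, so the
// stricter handler introduced in #23838 can authenticate it.
func (s *Server) markNodeDown(nodeID string) NodeUpdateStatusRequest {
    return NodeUpdateStatusRequest{
        NodeID: nodeID,
        Status: "down",
        WriteRequest: WriteRequest{
            Region:    s.region,
            AuthToken: s.getLeaderAcl(), // the one-line fix
        },
    }
}

func main() {
    s := &Server{region: "global", leaderACL: "leader-secret"}
    req := s.markNodeDown("node-1")
    fmt.Printf("marking %s down with token %q\n", req.NodeID, req.WriteRequest.AuthToken)
}
```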
Author:  Tim Gross
Date:    2024-10-17 13:48:20 -04:00 (committed by GitHub)
Parent:  e440e1d1db
Commit:  55fe05d353

5 changed files with 46 additions and 20 deletions

.changelog/24241.txt (new file)

@@ -0,0 +1,3 @@
+```release-note:bug
+heartbeat: Fixed a bug where failed nodes would not be marked down
+```

View File

@@ -93,6 +93,10 @@ func NewAuthenticator(cfg *AuthenticatorConfig) *Authenticator {
 // an ephemeral ACLToken makes the origin of the credential clear to RPC
 // handlers, who may have different behavior for internal vs external origins.
 //
+// Note: when making a server-to-server RPC that authenticates with this method,
+// the RPC *must* include the leader's ACL token. Use AuthenticateServerOnly for
+// requests that don't have access to the leader's ACL token.
+//
 // Note: when called on the follower we'll be making stale queries, so it's
 // possible if the follower is behind that the leader will get a different value
 // if an ACL token or allocation's WI has just been created.
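
A rule-of-thumb sketch of the contract this new note describes;
`chooseAuthMethod` is a hypothetical helper for illustration, not part of
Nomad's auth package:

```go
package authexample

// chooseAuthMethod illustrates the contract from the new comment: requests
// authenticated with Authenticate must carry the leader's ACL token, while
// handlers that servers may call without access to that token should
// authenticate with AuthenticateServerOnly instead.
func chooseAuthMethod(callerHasLeaderToken bool) string {
    if callerHasLeaderToken {
        return "Authenticate" // resolves the caller's full identity
    }
    return "AuthenticateServerOnly" // server-level check, no leader token needed
}
```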


@@ -163,7 +163,8 @@ func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
 		Status:       structs.NodeStatusDown,
 		NodeEvent:    structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),
 		WriteRequest: structs.WriteRequest{
-			Region: h.srv.config.Region,
+			Region:    h.srv.config.Region,
+			AuthToken: h.srv.getLeaderAcl(),
 		},
 	}


@@ -952,7 +952,7 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	// Setup server with tighter heartbeat so we don't have to wait so long
 	// for nodes to go down.
 	heartbeatTTL := time.Duration(500*testutil.TestMultiplier()) * time.Millisecond
-	s, cleanupS := TestServer(t, func(c *Config) {
+	s, rootToken, cleanupS := TestACLServer(t, func(c *Config) {
 		c.MinHeartbeatTTL = heartbeatTTL
 		c.HeartbeatGrace = 2 * heartbeatTTL
 	})
@@ -1001,7 +1001,8 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	go heartbeat(heartbeatCtx)

 	// Wait for node to be ready.
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusReady)
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusReady, rootToken.SecretID)

 	// Register job with Disconnect.LostAfter
 	job := version.jobSpec(time.Hour)
@@ -1018,6 +1019,7 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 		WriteRequest: structs.WriteRequest{
 			Region:    "global",
 			Namespace: job.Namespace,
+			AuthToken: rootToken.SecretID,
 		},
 	}
 	var jobResp structs.JobRegisterResponse
@@ -1025,15 +1027,16 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	must.NoError(t, err)

 	// Wait for alloc to be pending in the server.
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusPending: 1,
-	})
+	}, rootToken.SecretID)

 	// Get allocs that node should run.
 	allocsReq := &structs.NodeSpecificRequest{
 		NodeID: node.ID,
 		QueryOptions: structs.QueryOptions{
-			Region: "global",
+			Region:    "global",
+			AuthToken: rootToken.SecretID,
 		},
 	}
 	var allocsResp structs.NodeAllocsResponse
@@ -1058,17 +1061,18 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	must.NoError(t, err)

 	// Wait for alloc to be running in the server.
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusRunning: 1,
-	})
+	}, rootToken.SecretID)

 	// Stop heartbeat and wait for the client to be disconnected and the alloc
 	// to be unknown.
 	cancelHeartbeat()
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusDisconnected)
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusDisconnected, rootToken.SecretID)
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusUnknown: 1,
-	})
+	}, rootToken.SecretID)

 	// Restart heartbeat to reconnect node.
 	heartbeatCtx, cancelHeartbeat = context.WithCancel(context.Background())
@@ -1081,7 +1085,8 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	// allocs status with the server so the scheduler has the necessary
 	// information to avoid unnecessary placements.
 	time.Sleep(3 * heartbeatTTL)
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusInit)
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusInit, rootToken.SecretID)

 	// Get allocs that node should run.
 	// The node should only have one alloc assigned until it updates its allocs
@@ -1089,7 +1094,8 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	allocsReq = &structs.NodeSpecificRequest{
 		NodeID: node.ID,
 		QueryOptions: structs.QueryOptions{
-			Region: "global",
+			Region:    "global",
+			AuthToken: rootToken.SecretID,
 		},
 	}
 	err = msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", allocsReq, &allocsResp)
@@ -1104,10 +1110,12 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
 	// - client status is ready.
 	// - only 1 alloc and the alloc is running.
 	// - all evals are terminal, so cluster is in a stable state.
-	testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusReady)
-	testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+	testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
+		structs.NodeStatusReady, rootToken.SecretID)
+	testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
 		structs.AllocClientStatusRunning: 1,
-	})
+	}, rootToken.SecretID)
 	testutil.WaitForResult(func() (bool, error) {
 		state := s.fsm.State()
 		ws := memdb.NewWatchSet()


@@ -193,8 +193,15 @@ func WaitForClient(t testing.TB, rpc rpcFn, nodeID string, region string) {
 	WaitForClientStatus(t, rpc, nodeID, region, structs.NodeStatusReady)
 }

-// WaitForClientStatus blocks until the client is in the expected status.
-func WaitForClientStatus(t testing.TB, rpc rpcFn, nodeID string, region string, status string) {
+// WaitForClientStatus blocks until the client is in the expected status
+func WaitForClientStatus(t testing.TB, rpc rpcFn, nodeID, region, status string) {
+	t.Helper()
+	WaitForClientStatusWithToken(t, rpc, nodeID, region, status, "")
+}
+
+// WaitForClientStatusWithToken blocks until the client is in the expected
+// status, for use with ACLs enabled
+func WaitForClientStatusWithToken(t testing.TB, rpc rpcFn, nodeID, region, status, token string) {
 	t.Helper()
 	if region == "" {
@@ -202,8 +209,11 @@ func WaitForClientStatus(t testing.TB, rpc rpcFn, nodeID string, region string,
 	}
 	WaitForResult(func() (bool, error) {
 		req := structs.NodeSpecificRequest{
-			NodeID:       nodeID,
-			QueryOptions: structs.QueryOptions{Region: region},
+			NodeID: nodeID,
+			QueryOptions: structs.QueryOptions{
+				Region:    region,
+				AuthToken: token,
+			},
 		}
 		var out structs.SingleNodeResponse
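
The helper change keeps the old `WaitForClientStatus` signature as a thin
wrapper that delegates with an empty token, so existing non-ACL tests keep
working unchanged. Condensed usage under ACLs, pieced together from the test
changes above (not a complete test; `heartbeatTTL`, `node`, and `job` come
from the surrounding test):

```go
s, rootToken, cleanupS := TestACLServer(t, func(c *Config) {
    c.MinHeartbeatTTL = heartbeatTTL
    c.HeartbeatGrace = 2 * heartbeatTTL
})
defer cleanupS()

// With ACLs enabled, every status check passes the root token so the
// underlying node and job RPCs can authenticate.
testutil.WaitForClientStatusWithToken(t, s.RPC, node.ID, "global",
    structs.NodeStatusReady, rootToken.SecretID)
testutil.WaitForJobAllocStatusWithToken(t, s.RPC, job, map[string]int{
    structs.AllocClientStatusRunning: 1,
}, rootToken.SecretID)
```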