From 5d09d7ad0723c133f71a742b1e143d86f9332f92 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 7 Feb 2025 08:45:56 -0500 Subject: [PATCH] reset max query time of blocking queries in client after retries (#25039) When a blocking query on the client hits a retryable error, we change the max query time so that it falls within the `RPCHoldTimeout` timeout. But when the retry succeeds we don't reset it to the original value. Because the calls to `Node.GetClientAllocs` reuse the same request struct instead of reallocating it, any retry will cause the agent to poll at a faster frequency until the agent restarts. No other current RPC on the client has this behavior, but we'll fix this in the `rpc` method rather than in the caller so that any future users of the `rpc` method don't have to remember this detail. Fixes: https://github.com/hashicorp/nomad/issues/25033 --- .changelog/25039.txt | 3 +++ client/rpc.go | 20 +++++++++++----- client/rpc_test.go | 55 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 .changelog/25039.txt diff --git a/.changelog/25039.txt b/.changelog/25039.txt new file mode 100644 index 000000000..c625a67dd --- /dev/null +++ b/.changelog/25039.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where temporary RPC errors cause the client to poll for changes more frequently thereafter +``` diff --git a/client/rpc.go b/client/rpc.go index 20708fa8e..9d3441119 100644 --- a/client/rpc.go +++ b/client/rpc.go @@ -91,7 +91,9 @@ func (c *Client) rpc(method string, args any, reply any) error { // If its a blocking query, allow the time specified by the request if info, ok := args.(structs.RPCInfo); ok { - deadline = deadline.Add(info.TimeToBlock()) + oldBlockTime := info.TimeToBlock() + deadline = deadline.Add(oldBlockTime) + defer info.SetTimeToBlock(oldBlockTime) } TRY: @@ -127,9 +129,12 @@ TRY: } if time.Now().After(deadline) { - // Blocking queries are tricky. 
jitters and rpcholdtimes in multiple places can result in our server call taking longer than we wanted it to. For example: - // a block time of 5s may easily turn into the server blocking for 10s since it applies its own RPCHoldTime. If the server dies at t=7s we still want to retry - // so before we give up on blocking queries make one last attempt for an immediate answer + // Blocking queries are tricky. Jitters and rpcholdtimes in multiple + // places can result in our server call taking longer than we wanted it + // to. For example: a block time of 5s may easily turn into the server + // blocking for 10s since it applies its own RPCHoldTime. If the server + // dies at t=7s we still want to retry so before we give up on blocking + // queries make one last attempt for an immediate answer if info, ok := args.(structs.RPCInfo); ok && info.TimeToBlock() > 0 { info.SetTimeToBlock(0) return c.RPC(method, args, reply) @@ -144,10 +149,13 @@ TRY: select { case <-timer.C: - // If we are going to retry a blocking query we need to update the time to block so it finishes by our deadline. + // If we are going to retry a blocking query we need to update the time + // to block so it finishes by our deadline. 
+ if info, ok := args.(structs.RPCInfo); ok && info.TimeToBlock() > 0 { newBlockTime := time.Until(deadline) - // We can get below 0 here on slow computers because we slept for jitter so at least try to get an immediate response + // We can get below 0 here on slow computers because we slept for + // jitter so at least try to get an immediate response if newBlockTime < 0 { newBlockTime = 0 } diff --git a/client/rpc_test.go b/client/rpc_test.go index 5b4e26b11..8700f5d5c 100644 --- a/client/rpc_test.go +++ b/client/rpc_test.go @@ -6,6 +6,7 @@ package client import ( "errors" "testing" + "time" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" @@ -191,3 +192,57 @@ func Test_resolveServer(t *testing.T) { } } + +func TestRpc_RetryBlockTime(t *testing.T) { + ci.Parallel(t) + + // Timeouts have to allow for multiple passes thru the recursive c.rpc + // call. Unconfigurable internal timeouts prevent us from using a shorter + // MaxQueryTime base for this test + expectMaxQueryTime := time.Second + rpcHoldTimeout := 5 * time.Second + unblockTimeout := 7 * time.Second + + srv, cleanupSrv := nomad.TestServer(t, func(c *nomad.Config) { + c.NumSchedulers = 0 + c.BootstrapExpect = 3 // we intentionally don't want a leader + }) + t.Cleanup(func() { cleanupSrv() }) + + c, cleanupC := TestClient(t, func(c *config.Config) { + c.Servers = []string{srv.GetConfig().RPCAddr.String()} + c.RPCHoldTimeout = rpcHoldTimeout + }) + t.Cleanup(func() { cleanupC() }) + + req := structs.NodeSpecificRequest{ + NodeID: c.NodeID(), + SecretID: c.secretNodeID(), + QueryOptions: structs.QueryOptions{ + Region: c.Region(), + AuthToken: c.secretNodeID(), + MinQueryIndex: 10000, // some far-flung index we know won't exist yet + MaxQueryTime: expectMaxQueryTime, + }, + } + + resp := structs.NodeClientAllocsResponse{} + errCh := make(chan error) + + go func() { + err := c.rpc("Node.GetClientAllocs", &req, &resp) + errCh <- err + }() + + // wait for the blocking query to run long 
enough for 2 passes thru, + // including jitter + select { + case err := <-errCh: + must.NoError(t, err) + case <-time.After(unblockTimeout): + cleanupC() // force unblock + } + + must.Eq(t, expectMaxQueryTime, req.MaxQueryTime, + must.Sprintf("MaxQueryTime was changed during retries but not reset")) +}