From 13b95b76858ec9f98f7f442182e3b22d7e07454b Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 19 Mar 2025 09:14:42 -0400 Subject: [PATCH] CSI: prevent extraneous GC attempts for plugins (#25432) We can't delete a CSI plugin when it has volumes in use. When periodic GC runs, we send the RPC unconditionally and then let the state store return an error. We accidentally fixed the excess logging this causes (#17025) in #20555, but we can also check if the plugin is empty first before sending the RPC to save a request and subsequent Raft write. Fixes: https://github.com/hashicorp/nomad/issues/17025 Ref: https://github.com/hashicorp/nomad/pull/20555 --- .changelog/25432.txt | 3 +++ nomad/core_sched.go | 3 +++ nomad/core_sched_test.go | 4 ++++ nomad/fsm.go | 3 ++- 4 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 .changelog/25432.txt diff --git a/.changelog/25432.txt b/.changelog/25432.txt new file mode 100644 index 000000000..c989003e0 --- /dev/null +++ b/.changelog/25432.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where GC would attempt and fail to delete plugins that had volumes +``` diff --git a/nomad/core_sched.go b/nomad/core_sched.go index ec3d4f020..ca25abcb4 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -833,6 +833,9 @@ func (c *CoreScheduler) csiPluginGC(eval *structs.Evaluation, customThreshold *t for i := iter.Next(); i != nil; i = iter.Next() { plugin := i.(*structs.CSIPlugin) + if !plugin.IsEmpty() { + continue + } // Ignore new plugins mt := time.Unix(0, plugin.ModifyTime) diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index bc2730642..e7bf1ff31 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -2346,6 +2346,10 @@ func TestCoreScheduler_CSIPluginGC(t *testing.T) { index++ must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, index, nil, job)) + snap, err = store.Snapshot() + must.NoError(t, err) + core = NewCoreScheduler(srv, snap) + // Retry index++ gc = srv.coreJobEval(structs.CoreJobCSIPluginGC, index) diff --git a/nomad/fsm.go b/nomad/fsm.go index 71f4b2fbf..6f735f94d 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -4,6 +4,7 @@ package nomad import ( + "errors" "fmt" "io" "reflect" @@ -1385,7 +1386,7 @@ func (n *nomadFSM) applyCSIPluginDelete(buf []byte, index uint64) interface{} { if err := n.state.DeleteCSIPlugin(index, req.ID); err != nil { // "plugin in use" is an error for the state store but not for typical // callers, so reduce log noise by not logging that case here - if err.Error() != "plugin in use" { + if !errors.Is(err, structs.ErrCSIPluginInUse) { n.logger.Error("DeleteCSIPlugin failed", "error", err) } return err