From b1657dd1fa22e472f1c7699dcadd2f9aa4ad6910 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Thu, 16 May 2024 09:45:07 -0400
Subject: [PATCH] CSI: track node claim before staging to prevent interleaved unstage (#20550)

The CSI hook for each allocation that claims a volume runs concurrently. If a
call to `MountVolume` happens at the same time as a call to `UnmountVolume`
for the same volume, it's possible for the second alloc to detect that the
volume has already been staged, then for the original alloc to unpublish and
unstage it, only for the second alloc to then attempt to publish a volume
that's been unstaged.

The usage tracker on the volume manager was intended to prevent this behavior,
but the call to claim the volume was made only after staging and publishing
were complete. Move the call to claim the volume for the usage tracker to the
top of the `MountVolume` workflow, so that the volume cannot be unstaged until
all consuming allocations have called `UnmountVolume`.

Fixes: https://github.com/hashicorp/nomad/issues/20424
---
 .changelog/20550.txt                        |  3 +
 client/pluginmanager/csimanager/volume.go   |  9 +--
 .../pluginmanager/csimanager/volume_test.go | 71 +++++++++++++++++++
 3 files changed, 79 insertions(+), 4 deletions(-)
 create mode 100644 .changelog/20550.txt

diff --git a/.changelog/20550.txt b/.changelog/20550.txt
new file mode 100644
index 000000000..4e282fc8e
--- /dev/null
+++ b/.changelog/20550.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+csi: Fixed a bug where concurrent mount and unmount operations could unstage volumes needed by another allocation
+```

diff --git a/client/pluginmanager/csimanager/volume.go b/client/pluginmanager/csimanager/volume.go
index 6396dfbad..18251afcb 100644
--- a/client/pluginmanager/csimanager/volume.go
+++ b/client/pluginmanager/csimanager/volume.go
@@ -253,6 +253,10 @@ func (v *volumeManager) MountVolume(ctx context.Context, vol *structs.CSIVolume,
 	logger := v.logger.With("volume_id", vol.ID, "alloc_id", alloc.ID)
 	ctx = hclog.WithContext(ctx, logger)
 
+	// Claim before we stage/publish to prevent an interleaved Unmount for
+	// another alloc from unstaging between the stage/publish steps below
+	v.usageTracker.Claim(alloc.ID, vol.ID, vol.Namespace, usage)
+
 	if v.requiresStaging {
 		err = v.stageVolume(ctx, vol, usage, publishContext)
 	}
@@ -261,10 +265,6 @@ func (v *volumeManager) MountVolume(ctx context.Context, vol *structs.CSIVolume,
 		mountInfo, err = v.publishVolume(ctx, vol, alloc, usage, publishContext)
 	}
 
-	if err == nil {
-		v.usageTracker.Claim(alloc.ID, vol.ID, vol.Namespace, usage)
-	}
-
 	event := structs.NewNodeEvent().
 		SetSubsystem(structs.NodeEventSubsystemStorage).
 		SetMessage("Mount volume").
@@ -274,6 +274,7 @@ func (v *volumeManager) MountVolume(ctx context.Context, vol *structs.CSIVolume,
 	} else {
 		event.AddDetail("success", "false")
 		event.AddDetail("error", err.Error())
+		v.usageTracker.Free(alloc.ID, vol.ID, vol.Namespace, usage)
 	}
 
 	v.eventer(event)
diff --git a/client/pluginmanager/csimanager/volume_test.go b/client/pluginmanager/csimanager/volume_test.go
index 1b8fd4a69..1138d0e1f 100644
--- a/client/pluginmanager/csimanager/volume_test.go
+++ b/client/pluginmanager/csimanager/volume_test.go
@@ -9,7 +9,9 @@ import (
 	"os"
 	"runtime"
 	"testing"
+	"time"
 
+	"github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/helper/mount"
 	"github.com/hashicorp/nomad/helper/testlog"
@@ -17,6 +19,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/plugins/csi"
 	csifake "github.com/hashicorp/nomad/plugins/csi/fake"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/require"
 )
 
@@ -526,3 +529,71 @@ func TestVolumeManager_MountVolumeEvents(t *testing.T) {
 	require.Equal(t, "vol", e.Details["volume_id"])
 	require.Equal(t, "true", e.Details["success"])
 }
+
+// TestVolumeManager_InterleavedStaging tests that a volume cannot be unstaged
+// if another alloc has staged but not yet published
+func TestVolumeManager_InterleavedStaging(t *testing.T) {
+	ci.Parallel(t)
+
+	tmpPath := t.TempDir()
+	csiFake := &csifake.Client{}
+
+	logger := testlog.HCLogger(t)
+	ctx := hclog.WithContext(context.Background(), logger)
+
+	manager := newVolumeManager(logger,
+		func(e *structs.NodeEvent) {}, csiFake,
+		tmpPath, tmpPath, true, "i-example")
+
+	alloc0, alloc1 := mock.Alloc(), mock.Alloc()
+	vol := &structs.CSIVolume{ID: "vol", Namespace: "ns"}
+	usage := &UsageOptions{
+		AccessMode:     structs.CSIVolumeAccessModeMultiNodeMultiWriter,
+		AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
+	}
+	pubCtx := map[string]string{}
+
+	// first alloc has previously claimed the volume
+	manager.usageTracker.Claim(alloc0.ID, vol.ID, vol.Namespace, usage)
+
+	alloc0WaitCh := make(chan struct{})
+	alloc1WaitCh := make(chan struct{})
+
+	// this goroutine simulates MountVolume, but with control over interleaving
+	// by waiting for the other alloc to check if it should unstage before
+	// trying to publish
+	manager.usageTracker.Claim(alloc1.ID, vol.ID, vol.Namespace, usage)
+	must.NoError(t, manager.stageVolume(ctx, vol, usage, pubCtx))
+
+	go func() {
+		defer close(alloc1WaitCh)
+		<-alloc0WaitCh
+		_, err := manager.publishVolume(ctx, vol, alloc1, usage, pubCtx)
+		must.NoError(t, err)
+	}()
+
+	must.NoError(t, manager.UnmountVolume(ctx, vol.Namespace, vol.ID, "foo", alloc0.ID, usage))
+	close(alloc0WaitCh)
+
+	testTimeoutCtx, cancel := context.WithTimeout(context.TODO(), time.Second)
+	t.Cleanup(cancel)
+
+	select {
+	case <-alloc1WaitCh:
+	case <-testTimeoutCtx.Done():
+		t.Fatal("test timed out")
+	}
+
+	key := volumeUsageKey{
+		id:        vol.ID,
+		ns:        vol.Namespace,
+		usageOpts: *usage,
+	}
+
+	manager.usageTracker.stateMu.Lock()
+	t.Cleanup(manager.usageTracker.stateMu.Unlock)
+	must.Eq(t, []string{alloc1.ID}, manager.usageTracker.state[key])
+
+	must.Eq(t, 1, csiFake.NodeUnpublishVolumeCallCount, must.Sprint("expected 1 unpublish call"))
+	must.Eq(t, 0, csiFake.NodeUnstageVolumeCallCount, must.Sprint("expected no unstage call"))
+}
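
A note for readers outside the Nomad codebase: the fix works because the usage tracker is a per-node reference count of claims on each staged volume, and `UnmountVolume` only unstages once no claims remain. Below is a minimal, self-contained sketch of that pattern in Go. The `usageTracker` type and the `Claim`/`Free` signatures here are simplified stand-ins for illustration only, not Nomad's actual csimanager API; the real tracker keys claims by volume ID, namespace, and usage options (see `volumeUsageKey` in the test above).

```go
package main

import (
	"fmt"
	"sync"
)

// usageTracker counts the allocations holding a claim on each volume.
// Simplified stand-in: keyed by volume ID only, unlike Nomad's tracker.
type usageTracker struct {
	mu     sync.Mutex
	claims map[string]map[string]struct{} // volID -> set of alloc IDs
}

func newUsageTracker() *usageTracker {
	return &usageTracker{claims: map[string]map[string]struct{}{}}
}

// Claim records that an alloc is using the volume. The patch moves this
// call to the top of MountVolume, before staging or publishing.
func (t *usageTracker) Claim(allocID, volID string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.claims[volID] == nil {
		t.claims[volID] = map[string]struct{}{}
	}
	t.claims[volID][allocID] = struct{}{}
}

// Free drops an alloc's claim and reports whether the volume is now
// unused, i.e. whether it is safe to unstage it from the node.
func (t *usageTracker) Free(allocID, volID string) bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	delete(t.claims[volID], allocID)
	return len(t.claims[volID]) == 0
}

func main() {
	tracker := newUsageTracker()

	// alloc0 has already mounted the volume.
	tracker.Claim("alloc0", "vol")

	// alloc1's mount claims *before* staging/publishing; the old code
	// claimed only after a successful publish, leaving a window where
	// alloc0's unmount saw no other users and unstaged the volume.
	tracker.Claim("alloc1", "vol")

	// alloc0 unmounts concurrently: the volume still has a user, so the
	// unmount unpublishes for alloc0 but must skip the unstage step.
	if last := tracker.Free("alloc0", "vol"); !last {
		fmt.Println("alloc0 freed; volume still claimed, skipping unstage")
	}

	// Once alloc1 also unmounts, no claims remain and unstaging is safe.
	if last := tracker.Free("alloc1", "vol"); last {
		fmt.Println("alloc1 freed; no claims left, safe to unstage")
	}
}
```

This also shows why the patch adds the `usageTracker.Free` call on the mount error path: a failed `MountVolume` must drop the claim it took up front, otherwise the count for that volume never drains and it could never be unstaged.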