From 24a575ab800d4bc68b9e5a0747d09c2f166b627e Mon Sep 17 00:00:00 2001
From: Yorick Gersie <6005868+ygersie@users.noreply.github.com>
Date: Thu, 26 Jan 2023 15:14:47 +0100
Subject: [PATCH] Allow per_alloc to be used with host volumes (#15780)

Disallowing per_alloc for host volumes can make life much harder for
Nomad users. When configuration that relies on NOMAD_ALLOC_INDEX must
be re-used across restarts, allocation placement has to stay
consistent. CSI volumes already support this through the `per_alloc`
feature, but it was explicitly disabled for host volumes.

Ensure host volumes understand the concept of per_alloc.
---
 .changelog/15780.txt                          |  3 +++
 .../allocrunner/interfaces/task_lifecycle.go  |  3 +++
 .../taskrunner/task_runner_hooks.go           |  3 +++
 client/allocrunner/taskrunner/volume_hook.go  | 25 +++++++++++++------
 nomad/structs/volume_test.go                  |  4 +--
 nomad/structs/volumes.go                      |  4 +--
 scheduler/feasible.go                         | 11 ++++++--
 scheduler/feasible_test.go                    | 21 +++++++++++++---
 scheduler/stack.go                            |  4 +--
 .../content/docs/job-specification/update.mdx |  2 +-
 .../content/docs/job-specification/volume.mdx | 14 +++++------
 11 files changed, 66 insertions(+), 28 deletions(-)
 create mode 100644 .changelog/15780.txt

diff --git a/.changelog/15780.txt b/.changelog/15780.txt
new file mode 100644
index 000000000..1efc18966
--- /dev/null
+++ b/.changelog/15780.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+volumes: Allow `per_alloc` to be used with host_volumes
+```
diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go
index 9ab7eb5fc..1bf61bd5a 100644
--- a/client/allocrunner/interfaces/task_lifecycle.go
+++ b/client/allocrunner/interfaces/task_lifecycle.go
@@ -64,6 +64,9 @@ type TaskPrestartRequest struct {
 
 	// TaskEnv is the task's environment
 	TaskEnv *taskenv.TaskEnv
+
+	// Alloc is the current version of the allocation
+	Alloc *structs.Allocation
 }
 
 type TaskPrestartResponse struct {
diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go
index 089e834d7..98ff3f011 100644
--- a/client/allocrunner/taskrunner/task_runner_hooks.go
+++ b/client/allocrunner/taskrunner/task_runner_hooks.go
@@ -208,6 +208,8 @@ func (tr *TaskRunner) prestart() error {
 	joinedCtx, joinedCancel := joincontext.Join(tr.killCtx, tr.shutdownCtx)
 	defer joinedCancel()
 
+	alloc := tr.Alloc()
+
 	for _, hook := range tr.runnerHooks {
 		pre, ok := hook.(interfaces.TaskPrestartHook)
 		if !ok {
@@ -218,6 +220,7 @@ func (tr *TaskRunner) prestart() error {
 
 		// Build the request
 		req := interfaces.TaskPrestartRequest{
+			Alloc:   alloc,
 			Task:    tr.Task(),
 			TaskDir: tr.taskDir,
 			TaskEnv: tr.envBuilder.Build(),
diff --git a/client/allocrunner/taskrunner/volume_hook.go b/client/allocrunner/taskrunner/volume_hook.go
index 7d33d74e3..7aeab512d 100644
--- a/client/allocrunner/taskrunner/volume_hook.go
+++ b/client/allocrunner/taskrunner/volume_hook.go
@@ -32,7 +32,7 @@ func (*volumeHook) Name() string {
 	return "volumes"
 }
 
-func validateHostVolumes(requestedByAlias map[string]*structs.VolumeRequest, clientVolumesByName map[string]*structs.ClientHostVolumeConfig) error {
+func validateHostVolumes(requestedByAlias map[string]*structs.VolumeRequest, clientVolumesByName map[string]*structs.ClientHostVolumeConfig, allocName string) error {
 	var result error
 
 	for _, req := range requestedByAlias {
@@ -42,9 +42,14 @@ func validateHostVolumes(requestedByAlias map[string]*structs.VolumeRequest, cli
 			continue
 		}
 
-		_, ok := clientVolumesByName[req.Source]
+		source := req.Source
+		if req.PerAlloc {
+			source = source + structs.AllocSuffix(allocName)
+		}
+
+		_, ok := clientVolumesByName[source]
 		if !ok {
-			result = multierror.Append(result, fmt.Errorf("missing %s", req.Source))
+			result = multierror.Append(result, fmt.Errorf("missing %s", source))
 		}
 	}
 
@@ -54,7 +59,7 @@ func validateHostVolumes(requestedByAlias map[string]*structs.VolumeRequest, cli
 // hostVolumeMountConfigurations takes the users requested volume mounts,
 // volumes, and the client host volume configuration and converts them into a
 // format that can be used by drivers.
-func (h *volumeHook) hostVolumeMountConfigurations(taskMounts []*structs.VolumeMount, taskVolumesByAlias map[string]*structs.VolumeRequest, clientVolumesByName map[string]*structs.ClientHostVolumeConfig) ([]*drivers.MountConfig, error) {
+func (h *volumeHook) hostVolumeMountConfigurations(taskMounts []*structs.VolumeMount, taskVolumesByAlias map[string]*structs.VolumeRequest, clientVolumesByName map[string]*structs.ClientHostVolumeConfig, allocName string) ([]*drivers.MountConfig, error) {
 	var mounts []*drivers.MountConfig
 	for _, m := range taskMounts {
 		req, ok := taskVolumesByAlias[m.Volume]
@@ -71,11 +76,15 @@ func (h *volumeHook) hostVolumeMountConfigurations(taskMounts []*structs.VolumeM
 			continue
 		}
 
-		hostVolume, ok := clientVolumesByName[req.Source]
+		source := req.Source
+		if req.PerAlloc {
+			source = source + structs.AllocSuffix(allocName)
+		}
+		hostVolume, ok := clientVolumesByName[source]
 		if !ok {
 			// Should never happen, but unless the client volumes were mutated during
 			// the execution of this hook.
-			return nil, fmt.Errorf("No host volume named: %s", req.Source)
+			return nil, fmt.Errorf("no host volume named: %s", source)
 		}
 
 		mcfg := &drivers.MountConfig{
@@ -110,12 +119,12 @@ func (h *volumeHook) prepareHostVolumes(req *interfaces.TaskPrestartRequest, vol
 	// Always validate volumes to ensure that we do not allow volumes to be used
 	// if a host is restarted and loses the host volume configuration.
- if err := validateHostVolumes(volumes, hostVolumes); err != nil { + if err := validateHostVolumes(volumes, hostVolumes, req.Alloc.Name); err != nil { h.logger.Error("Requested Host Volume does not exist", "existing", hostVolumes, "requested", volumes) return nil, fmt.Errorf("host volume validation error: %v", err) } - hostVolumeMounts, err := h.hostVolumeMountConfigurations(req.Task.VolumeMounts, volumes, hostVolumes) + hostVolumeMounts, err := h.hostVolumeMountConfigurations(req.Task.VolumeMounts, volumes, hostVolumes, req.Alloc.Name) if err != nil { h.logger.Error("Failed to generate host volume mounts", "error", err) return nil, err diff --git a/nomad/structs/volume_test.go b/nomad/structs/volume_test.go index cbe9ed6fe..fa86ec2fb 100644 --- a/nomad/structs/volume_test.go +++ b/nomad/structs/volume_test.go @@ -30,8 +30,9 @@ func TestVolumeRequest_Validate(t *testing.T) { "host volumes cannot have an access mode", "host volumes cannot have an attachment mode", "host volumes cannot have mount options", - "host volumes do not support per_alloc", + "volume cannot be per_alloc when canaries are in use", }, + canariesCount: 1, req: &VolumeRequest{ Type: VolumeTypeHost, ReadOnly: false, @@ -79,7 +80,6 @@ func TestVolumeRequest_Validate(t *testing.T) { } for _, tc := range testCases { - tc = tc t.Run(tc.name, func(t *testing.T) { err := tc.req.Validate(tc.taskGroupCount, tc.canariesCount) for _, expected := range tc.expected { diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index 0f8b040de..563d7b1cf 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -129,8 +129,8 @@ func (v *VolumeRequest) Validate(taskGroupCount, canaries int) error { if v.MountOptions != nil { addErr("host volumes cannot have mount options") } - if v.PerAlloc { - addErr("host volumes do not support per_alloc") + if v.PerAlloc && canaries > 0 { + addErr("volume cannot be per_alloc when canaries are in use") } case VolumeTypeCSI: diff --git a/scheduler/feasible.go b/scheduler/feasible.go index eb8a1045d..c3a405885 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -149,7 +149,7 @@ func NewHostVolumeChecker(ctx Context) *HostVolumeChecker { } // SetVolumes takes the volumes required by a task group and updates the checker. -func (h *HostVolumeChecker) SetVolumes(volumes map[string]*structs.VolumeRequest) { +func (h *HostVolumeChecker) SetVolumes(allocName string, volumes map[string]*structs.VolumeRequest) { lookupMap := make(map[string][]*structs.VolumeRequest) // Convert the map from map[DesiredName]Request to map[Source][]Request to improve // lookup performance. Also filter non-host volumes. 
@@ -158,7 +158,14 @@ func (h *HostVolumeChecker) SetVolumes(volumes map[string]*structs.VolumeRequest continue } - lookupMap[req.Source] = append(lookupMap[req.Source], req) + if req.PerAlloc { + // provide a unique volume source per allocation + copied := req.Copy() + copied.Source = copied.Source + structs.AllocSuffix(allocName) + lookupMap[copied.Source] = append(lookupMap[copied.Source], copied) + } else { + lookupMap[req.Source] = append(lookupMap[req.Source], req) + } } h.volumes = lookupMap } diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 30fc0df6d..8775f85fb 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -103,8 +103,9 @@ func TestHostVolumeChecker(t *testing.T) { } nodes[1].HostVolumes = map[string]*structs.ClientHostVolumeConfig{"foo": {Name: "foo"}} nodes[2].HostVolumes = map[string]*structs.ClientHostVolumeConfig{ - "foo": {}, - "bar": {}, + "foo": {}, + "bar": {}, + "unique-volume[0]": {}, } nodes[3].HostVolumes = map[string]*structs.ClientHostVolumeConfig{ "foo": {}, @@ -130,6 +131,11 @@ func TestHostVolumeChecker(t *testing.T) { Type: "nothost", Source: "baz", }, + "unique": { + Type: "host", + Source: "unique-volume[0]", + PerAlloc: true, + }, } checker := NewHostVolumeChecker(ctx) @@ -165,8 +171,11 @@ func TestHostVolumeChecker(t *testing.T) { }, } + alloc := mock.Alloc() + alloc.NodeID = nodes[2].ID + for i, c := range cases { - checker.SetVolumes(c.RequestedVolumes) + checker.SetVolumes(alloc.Name, c.RequestedVolumes) if act := checker.Feasible(c.Node); act != c.Result { t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) } @@ -235,8 +244,12 @@ func TestHostVolumeChecker_ReadOnly(t *testing.T) { Result: true, }, } + + alloc := mock.Alloc() + alloc.NodeID = nodes[1].ID + for i, c := range cases { - checker.SetVolumes(c.RequestedVolumes) + checker.SetVolumes(alloc.Name, c.RequestedVolumes) if act := checker.Feasible(c.Node); act != c.Result { t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) } diff --git a/scheduler/stack.go b/scheduler/stack.go index 6e362f637..c48fc78f3 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -144,7 +144,7 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ra s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, tg.Volumes) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) @@ -321,7 +321,7 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, tg.Volumes) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) diff --git a/website/content/docs/job-specification/update.mdx b/website/content/docs/job-specification/update.mdx index fb4b461fe..1a3fd8725 100644 --- a/website/content/docs/job-specification/update.mdx +++ b/website/content/docs/job-specification/update.mdx @@ -105,7 +105,7 @@ a future release. stopping any previous allocations. 
Once the operator determines the canaries are healthy, they can be promoted which unblocks a rolling update of the remaining allocations at a rate of `max_parallel`. Canary deployments cannot - be used with CSI volumes when `per_alloc = true`. + be used with volumes when `per_alloc = true`. - `stagger` `(string: "30s")` - Specifies the delay between each set of [`max_parallel`](#max_parallel) updates when updating system jobs. This diff --git a/website/content/docs/job-specification/volume.mdx b/website/content/docs/job-specification/volume.mdx index cc263df11..14311d44e 100644 --- a/website/content/docs/job-specification/volume.mdx +++ b/website/content/docs/job-specification/volume.mdx @@ -75,6 +75,13 @@ the [volume_mount][volume_mount] stanza in the `task` configuration. used for validating `host_volume` ACLs and for scheduling when a matching `host_volume` requires `read_only` usage. +- `per_alloc` `(bool: false)` - Specifies that the `source` of the volume + should have the suffix `[n]`, where `n` is the allocation index. This allows + mounting a unique volume per allocation, so long as the volume's source is + named appropriately. For example, with the source `myvolume` and `per_alloc + = true`, the allocation named `myjob.mygroup.mytask[0]` will require a + volume ID `myvolume[0]`. + The following fields are only valid for volumes with `type = "csi"`: - `access_mode` `(string: )` - Defines whether a volume should be @@ -92,13 +99,6 @@ The following fields are only valid for volumes with `type = "csi"`: storage providers will support `"block-device"`, which will mount the volume with the CSI block device API within the container. -- `per_alloc` `(bool: false)` - Specifies that the `source` of the volume - should have the suffix `[n]`, where `n` is the allocation index. This allows - mounting a unique volume per allocation, so long as the volume's source is - named appropriately. For example, with the source `myvolume` and `per_alloc - = true`, the allocation named `myjob.mygroup.mytask[0]` will require a - volume ID `myvolume[0]`. - - `mount_options` - Options for mounting CSI volumes that have the `file-system` [attachment mode]. These options override the `mount_options` field from [volume registration]. Consult the documentation for your storage
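
End to end, the feature reads as follows. This job sketch is illustrative
only and not part of the patch: the names `myjob`, `mygroup`, `mytask`,
`myvolume`, the image, and the mount destination are hypothetical, chosen
to mirror the `volume.mdx` example above.

```hcl
job "myjob" {
  group "mygroup" {
    count = 2

    # With per_alloc = true, the allocation index suffix is appended to
    # the source: the allocation named myjob.mygroup.mytask[0] requires
    # host volume "myvolume[0]", and mytask[1] requires "myvolume[1]",
    # the same suffix logic added to HostVolumeChecker.SetVolumes above.
    volume "data" {
      type      = "host"
      source    = "myvolume"
      per_alloc = true
    }

    task "mytask" {
      driver = "docker"

      config {
        image   = "busybox:1.36"
        command = "sleep"
        args    = ["infinity"]
      }

      volume_mount {
        volume      = "data"
        destination = "/srv/data"
      }
    }
  }
}
```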
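For placement to be feasible, an eligible client must expose the suffixed
volumes, as in the `unique-volume[0]` fixture in `feasible_test.go`. A
minimal agent configuration sketch, with hypothetical paths:

```hcl
client {
  enabled = true

  # One host volume per allocation index; the bracketed suffix must
  # match the name the checker looks up (source plus alloc suffix).
  host_volume "myvolume[0]" {
    path = "/opt/volumes/myvolume-0"
  }

  host_volume "myvolume[1]" {
    path = "/opt/volumes/myvolume-1"
  }
}
```

One restriction remains after this patch: per the updated `Validate`,
`per_alloc` volumes cannot be combined with canary deployments,
presumably because a canary would contend for the same per-index volume
as the allocation it replaces.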