Fix preemption panic (#11346)
Fix a bug where the scheduler may panic when preemption is enabled. The conditions are a bit involved: a higher-priority job schedules multiple allocations that preempt multiple existing allocations on the same node, because of port/network/device assignments.

The root cause is incidental mutation of internal cached data. `RankedNode` computes and caches the proposed allocations in https://github.com/hashicorp/nomad/blob/v1.1.6/scheduler/rank.go#L42-L53, but the scheduler then mutates that list to remove preemptable allocs in https://github.com/hashicorp/nomad/blob/v1.1.6/scheduler/rank.go#L293-L294. `RemoveAllocs` modifies the cached slice in place and fills its tail with `nil`s, which later triggers a nil-pointer dereference.

I fixed the issue by avoiding the mutation in `RemoveAllocs`; the micro-optimization there doesn't seem necessary.

Fixes https://github.com/hashicorp/nomad/issues/11342
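The failure mode is easiest to see in isolation. Below is a minimal, self-contained sketch of the pre-fix swap-delete behavior, using a toy `Alloc` type rather than Nomad's `structs.Allocation`: the function returns a shortened slice, but the caller's cached slice still spans the full backing array, whose tail has just been overwritten with `nil`s.

```go
package main

import "fmt"

// Alloc is a stand-in for structs.Allocation; only the ID matters here.
type Alloc struct{ ID string }

// removeInPlace mirrors the pre-fix RemoveAllocs: it swap-deletes matches and
// nils out the tail of the same backing array the caller still holds.
func removeInPlace(allocs []*Alloc, remove []*Alloc) []*Alloc {
	removeSet := make(map[string]struct{})
	for _, r := range remove {
		removeSet[r.ID] = struct{}{}
	}

	n := len(allocs)
	for i := 0; i < n; i++ {
		if _, ok := removeSet[allocs[i].ID]; ok {
			allocs[i], allocs[n-1] = allocs[n-1], nil
			i--
			n--
		}
	}
	return allocs[:n]
}

func main() {
	// Think of this as RankedNode's cached slice of proposed allocations.
	cached := []*Alloc{{ID: "a"}, {ID: "b"}, {ID: "c"}}

	// The scheduler trims the preemptable allocs from the proposed list...
	_ = removeInPlace(cached, []*Alloc{{ID: "a"}})

	// ...but the cached slice still has length 3 and its tail is now nil.
	fmt.Println(cached[0], cached[1], cached[2]) // &{c} &{b} <nil>
	// Anything that later walks the cache and dereferences the tail panics.
}
```

The diff below removes the in-place trick entirely: the new `RemoveAllocs` appends the surviving allocs to a freshly allocated slice, so the cached list is never written to.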
@@ -44,24 +44,23 @@ func warningsFormatter(es []error) string {
 // RemoveAllocs is used to remove any allocs with the given IDs
 // from the list of allocations
-func RemoveAllocs(alloc []*Allocation, remove []*Allocation) []*Allocation {
+func RemoveAllocs(allocs []*Allocation, remove []*Allocation) []*Allocation {
+	if len(remove) == 0 {
+		return allocs
+	}
+
 	// Convert remove into a set
 	removeSet := make(map[string]struct{})
 	for _, remove := range remove {
 		removeSet[remove.ID] = struct{}{}
 	}
 
-	n := len(alloc)
-	for i := 0; i < n; i++ {
-		if _, ok := removeSet[alloc[i].ID]; ok {
-			alloc[i], alloc[n-1] = alloc[n-1], nil
-			i--
-			n--
+	r := make([]*Allocation, 0, len(allocs))
+	for _, alloc := range allocs {
+		if _, ok := removeSet[alloc.ID]; !ok {
+			r = append(r, alloc)
 		}
 	}
-
-	alloc = alloc[:n]
-	return alloc
+	return r
 }
 
 // FilterTerminalAllocs filters out all allocations in a terminal state and
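A sketch (not in this diff) of the property the rewrite provides: `RemoveAllocs` now returns a freshly allocated result and leaves its argument untouched, so callers holding a cached slice cannot be corrupted. Assuming the usual test context of the `nomad/structs` package, where `Allocation` and testify's `require` are in scope, a regression check could look like this:

```go
func TestRemoveAllocsDoesNotMutateInput(t *testing.T) {
	a := &Allocation{ID: "a"}
	b := &Allocation{ID: "b"}
	c := &Allocation{ID: "c"}
	input := []*Allocation{a, b, c}

	out := RemoveAllocs(input, []*Allocation{b})

	// The removed alloc is filtered from the result, order preserved.
	require.Equal(t, []*Allocation{a, c}, out)

	// The caller's slice is untouched: same length, no nil holes.
	require.Equal(t, []*Allocation{a, b, c}, input)
}
```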
@@ -1381,6 +1381,136 @@ func TestPreemption(t *testing.T) {
 	}
 }
 
+// TestPreemptionMultiple tests evicting multiple allocations at the same time
+func TestPreemptionMultiple(t *testing.T) {
+	// The test setup:
+	// * a node with 4 GPUs
+	// * a low priority job with 4 allocs, each using 1 GPU
+	//
+	// Then schedule a high priority job needing 2 allocs, using 2 GPUs each.
+	// Expectation:
+	// All low priority allocs should be preempted to accommodate the high priority job
+	h := NewHarness(t)
+
+	// node with 4 GPUs
+	node := mock.Node()
+	node.NodeResources = &structs.NodeResources{
+		Cpu: structs.NodeCpuResources{
+			CpuShares: 4000,
+		},
+		Memory: structs.NodeMemoryResources{
+			MemoryMB: 8192,
+		},
+		Disk: structs.NodeDiskResources{
+			DiskMB: 100 * 1024,
+		},
+		Networks: []*structs.NetworkResource{
+			{
+				Device: "eth0",
+				CIDR:   "192.168.0.100/32",
+				MBits:  1000,
+			},
+		},
+		Devices: []*structs.NodeDeviceResource{
+			{
+				Type:   "gpu",
+				Vendor: "nvidia",
+				Name:   "1080ti",
+				Attributes: map[string]*psstructs.Attribute{
+					"memory":           psstructs.NewIntAttribute(11, psstructs.UnitGiB),
+					"cuda_cores":       psstructs.NewIntAttribute(3584, ""),
+					"graphics_clock":   psstructs.NewIntAttribute(1480, psstructs.UnitMHz),
+					"memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS),
+				},
+				Instances: []*structs.NodeDevice{
+					{
+						ID:      "dev0",
+						Healthy: true,
+					},
+					{
+						ID:      "dev1",
+						Healthy: true,
+					},
+					{
+						ID:      "dev2",
+						Healthy: true,
+					},
+					{
+						ID:      "dev3",
+						Healthy: true,
+					},
+				},
+			},
+		},
+	}
+
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// low priority job with 4 allocs using all 4 GPUs
+	lowPrioJob := mock.Job()
+	lowPrioJob.Priority = 5
+	lowPrioJob.TaskGroups[0].Count = 4
+	lowPrioJob.TaskGroups[0].Networks = nil
+	lowPrioJob.TaskGroups[0].Tasks[0].Services = nil
+	lowPrioJob.TaskGroups[0].Tasks[0].Resources.Networks = nil
+	lowPrioJob.TaskGroups[0].Tasks[0].Resources.Devices = structs.ResourceDevices{{
+		Name:  "gpu",
+		Count: 1,
+	}}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), lowPrioJob))
+
+	allocs := []*structs.Allocation{}
+	allocIDs := map[string]struct{}{}
+	for i := 0; i < 4; i++ {
+		alloc := createAllocWithDevice(uuid.Generate(), lowPrioJob, lowPrioJob.TaskGroups[0].Tasks[0].Resources, &structs.AllocatedDeviceResource{
+			Type:      "gpu",
+			Vendor:    "nvidia",
+			Name:      "1080ti",
+			DeviceIDs: []string{fmt.Sprintf("dev%d", i)},
+		})
+		alloc.NodeID = node.ID
+
+		allocs = append(allocs, alloc)
+		allocIDs[alloc.ID] = struct{}{}
+	}
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+
+	// new high priority job with 2 allocs, each using 2 GPUs
+	highPrioJob := mock.Job()
+	highPrioJob.Priority = 100
+	highPrioJob.TaskGroups[0].Count = 2
+	highPrioJob.TaskGroups[0].Networks = nil
+	highPrioJob.TaskGroups[0].Tasks[0].Services = nil
+	highPrioJob.TaskGroups[0].Tasks[0].Resources.Networks = nil
+	highPrioJob.TaskGroups[0].Tasks[0].Resources.Devices = structs.ResourceDevices{{
+		Name:  "gpu",
+		Count: 2,
+	}}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), highPrioJob))
+
+	// schedule
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    highPrioJob.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       highPrioJob.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	require.NoError(t, h.Process(NewServiceScheduler, eval))
+	require.Len(t, h.Plans, 1)
+	require.Contains(t, h.Plans[0].NodePreemptions, node.ID)
+
+	preempted := map[string]struct{}{}
+	for _, alloc := range h.Plans[0].NodePreemptions[node.ID] {
+		preempted[alloc.ID] = struct{}{}
+	}
+	require.Equal(t, allocIDs, preempted)
+}
+
 // helper method to create allocations with given jobs and resources
 func createAlloc(id string, job *structs.Job, resource *structs.Resources) *structs.Allocation {
 	return createAllocInner(id, job, resource, nil, nil)