Fix preemption panic (#11346)
Fix a bug where the scheduler may panic when preemption is enabled. The conditions are a bit involved: a higher-priority job schedules multiple allocations that preempt multiple existing allocations on the same node, because of port/network/device assignments.

The root cause is incidental mutation of internal cached data. `RankedNode` computes and caches the proposed allocations in https://github.com/hashicorp/nomad/blob/v1.1.6/scheduler/rank.go#L42-L53, but the scheduler then mutates that list to remove preemptable allocs in https://github.com/hashicorp/nomad/blob/v1.1.6/scheduler/rank.go#L293-L294. `RemoveAllocs` modifies the cached slice in place and fills its tail with `nil`s, which later triggers a nil-pointer dereference.

I fixed the issue by avoiding the mutation in `RemoveAllocs`; the micro-optimization there doesn't seem necessary.

Fixes https://github.com/hashicorp/nomad/issues/11342
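The failure mode is easiest to see in isolation. Below is a minimal, self-contained sketch of the pre-fix swap-delete behavior, using a toy `Alloc` type rather than Nomad's `structs.Allocation`: the function returns a shortened slice, but the caller's cached slice still spans the full backing array, whose tail has just been overwritten with `nil`s.

```go
package main

import "fmt"

// Alloc is a stand-in for structs.Allocation; only the ID matters here.
type Alloc struct{ ID string }

// removeInPlace mirrors the pre-fix RemoveAllocs: it swap-deletes matches and
// nils out the tail of the same backing array the caller still holds.
func removeInPlace(allocs []*Alloc, remove []*Alloc) []*Alloc {
	removeSet := make(map[string]struct{})
	for _, r := range remove {
		removeSet[r.ID] = struct{}{}
	}

	n := len(allocs)
	for i := 0; i < n; i++ {
		if _, ok := removeSet[allocs[i].ID]; ok {
			allocs[i], allocs[n-1] = allocs[n-1], nil
			i--
			n--
		}
	}
	return allocs[:n]
}

func main() {
	// Think of this as RankedNode's cached slice of proposed allocations.
	cached := []*Alloc{{ID: "a"}, {ID: "b"}, {ID: "c"}}

	// The scheduler trims the preemptable allocs from the proposed list...
	_ = removeInPlace(cached, []*Alloc{{ID: "a"}})

	// ...but the cached slice still has length 3 and its tail is now nil.
	fmt.Println(cached[0], cached[1], cached[2]) // &{c} &{b} <nil>
	// Anything that later walks the cache and dereferences the tail panics.
}
```

The diff below removes the in-place trick entirely: the new `RemoveAllocs` appends the surviving allocs to a freshly allocated slice, so the cached list is never written to.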
@@ -44,24 +44,23 @@ func warningsFormatter(es []error) string {
 // RemoveAllocs is used to remove any allocs with the given IDs
 // from the list of allocations
-func RemoveAllocs(alloc []*Allocation, remove []*Allocation) []*Allocation {
+func RemoveAllocs(allocs []*Allocation, remove []*Allocation) []*Allocation {
+	if len(remove) == 0 {
+		return allocs
+	}
+
 	// Convert remove into a set
 	removeSet := make(map[string]struct{})
 	for _, remove := range remove {
 		removeSet[remove.ID] = struct{}{}
 	}
 
-	n := len(alloc)
-	for i := 0; i < n; i++ {
-		if _, ok := removeSet[alloc[i].ID]; ok {
-			alloc[i], alloc[n-1] = alloc[n-1], nil
-			i--
-			n--
+	r := make([]*Allocation, 0, len(allocs))
+	for _, alloc := range allocs {
+		if _, ok := removeSet[alloc.ID]; !ok {
+			r = append(r, alloc)
 		}
 	}
-
-	alloc = alloc[:n]
-	return alloc
+	return r
 }
 
 // FilterTerminalAllocs filters out all allocations in a terminal state and
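A sketch (not in this diff) of the property the rewrite provides: `RemoveAllocs` now returns a freshly allocated result and leaves its argument untouched, so callers holding a cached slice cannot be corrupted. Assuming the usual test context of the `nomad/structs` package, where `Allocation` and testify's `require` are in scope, a regression check could look like this:

```go
func TestRemoveAllocsDoesNotMutateInput(t *testing.T) {
	a := &Allocation{ID: "a"}
	b := &Allocation{ID: "b"}
	c := &Allocation{ID: "c"}
	input := []*Allocation{a, b, c}

	out := RemoveAllocs(input, []*Allocation{b})

	// The removed alloc is filtered from the result, order preserved.
	require.Equal(t, []*Allocation{a, c}, out)

	// The caller's slice is untouched: same length, no nil holes.
	require.Equal(t, []*Allocation{a, b, c}, input)
}
```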
@@ -1381,6 +1381,136 @@ func TestPreemption(t *testing.T) {
 	}
 }
 
+// TestPreemptionMultiple tests evicting multiple allocations at the same time
+func TestPreemptionMultiple(t *testing.T) {
+	// The test setup:
+	// * a node with 4 GPUs
+	// * a low priority job with 4 allocs, each using 1 GPU
+	//
+	// Then schedule a high priority job needing 2 allocs, using 2 GPUs each.
+	// Expectation:
+	// All low priority allocs should be preempted to accommodate the high priority job
+	h := NewHarness(t)
+
+	// node with 4 GPUs
+	node := mock.Node()
+	node.NodeResources = &structs.NodeResources{
+		Cpu: structs.NodeCpuResources{
+			CpuShares: 4000,
+		},
+		Memory: structs.NodeMemoryResources{
+			MemoryMB: 8192,
+		},
+		Disk: structs.NodeDiskResources{
+			DiskMB: 100 * 1024,
+		},
+		Networks: []*structs.NetworkResource{
+			{
+				Device: "eth0",
+				CIDR:   "192.168.0.100/32",
+				MBits:  1000,
+			},
+		},
+		Devices: []*structs.NodeDeviceResource{
+			{
+				Type:   "gpu",
+				Vendor: "nvidia",
+				Name:   "1080ti",
+				Attributes: map[string]*psstructs.Attribute{
+					"memory":           psstructs.NewIntAttribute(11, psstructs.UnitGiB),
+					"cuda_cores":       psstructs.NewIntAttribute(3584, ""),
+					"graphics_clock":   psstructs.NewIntAttribute(1480, psstructs.UnitMHz),
+					"memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS),
+				},
+				Instances: []*structs.NodeDevice{
+					{
+						ID:      "dev0",
+						Healthy: true,
+					},
+					{
+						ID:      "dev1",
+						Healthy: true,
+					},
+					{
+						ID:      "dev2",
+						Healthy: true,
+					},
+					{
+						ID:      "dev3",
+						Healthy: true,
+					},
+				},
+			},
+		},
+	}
+
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// low priority job with 4 allocs using all 4 GPUs
+	lowPrioJob := mock.Job()
+	lowPrioJob.Priority = 5
+	lowPrioJob.TaskGroups[0].Count = 4
+	lowPrioJob.TaskGroups[0].Networks = nil
+	lowPrioJob.TaskGroups[0].Tasks[0].Services = nil
+	lowPrioJob.TaskGroups[0].Tasks[0].Resources.Networks = nil
+	lowPrioJob.TaskGroups[0].Tasks[0].Resources.Devices = structs.ResourceDevices{{
+		Name:  "gpu",
+		Count: 1,
+	}}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), lowPrioJob))
+
+	allocs := []*structs.Allocation{}
+	allocIDs := map[string]struct{}{}
+	for i := 0; i < 4; i++ {
+		alloc := createAllocWithDevice(uuid.Generate(), lowPrioJob, lowPrioJob.TaskGroups[0].Tasks[0].Resources, &structs.AllocatedDeviceResource{
+			Type:      "gpu",
+			Vendor:    "nvidia",
+			Name:      "1080ti",
+			DeviceIDs: []string{fmt.Sprintf("dev%d", i)},
+		})
+		alloc.NodeID = node.ID
+
+		allocs = append(allocs, alloc)
+		allocIDs[alloc.ID] = struct{}{}
+	}
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+
+	// new high priority job with 2 allocs, each using 2 GPUs
+	highPrioJob := mock.Job()
+	highPrioJob.Priority = 100
+	highPrioJob.TaskGroups[0].Count = 2
+	highPrioJob.TaskGroups[0].Networks = nil
+	highPrioJob.TaskGroups[0].Tasks[0].Services = nil
+	highPrioJob.TaskGroups[0].Tasks[0].Resources.Networks = nil
+	highPrioJob.TaskGroups[0].Tasks[0].Resources.Devices = structs.ResourceDevices{{
+		Name:  "gpu",
+		Count: 2,
+	}}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), highPrioJob))
+
+	// schedule
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    highPrioJob.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       highPrioJob.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	require.NoError(t, h.Process(NewServiceScheduler, eval))
+	require.Len(t, h.Plans, 1)
+	require.Contains(t, h.Plans[0].NodePreemptions, node.ID)
+
+	preempted := map[string]struct{}{}
+	for _, alloc := range h.Plans[0].NodePreemptions[node.ID] {
+		preempted[alloc.ID] = struct{}{}
+	}
+	require.Equal(t, allocIDs, preempted)
+}
+
 // helper method to create allocations with given jobs and resources
 func createAlloc(id string, job *structs.Job, resource *structs.Resources) *structs.Allocation {
 	return createAllocInner(id, job, resource, nil, nil)