Files
nomad/client/lib/numalib/detect_generic.go
Tim Gross 7d73065066 numa: fix scheduler panic due to topology serialization bug (#23284)
The NUMA topology struct field `NodeIDs` is an `idset.Set`, which has no public
members. As a result, this field is never serialized via msgpack and persisted
in state. When `numa.affinity = "prefer"`, the scheduler dereferences this nil
field and panics the scheduler worker.

Ideally we would fix this by adding a msgpack serialization extension, but
because the field already exists and is always empty, doing so would break RPC wire
compatibility across upgrades. Instead, create a new field that's populated at
the same time we populate the more useful `idset.Set`, and repopulate the set on
demand.

Fixes: https://hashicorp.atlassian.net/browse/NET-9924
2024-06-11 08:55:00 -04:00

49 lines
1.1 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package numalib
import (
"context"
"time"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/shirou/gopsutil/v3/cpu"
)
// Fixed values scanGeneric uses when it builds a topology: every core is
// registered under the same node and socket, with the same max speed value.
const (
	// genericNodeID is the only node ID scanGeneric records.
	genericNodeID hw.NodeID = 0

	// genericSocketID is the socket ID scanGeneric passes for every core.
	genericSocketID hw.SocketID = 0

	// genericMaxSpeed is the max speed scanGeneric passes for every core.
	// NOTE(review): zero presumably means "unknown"; confirm against the
	// consumers of Core.
	genericMaxSpeed hw.KHz = 0
)
// scanGeneric populates top with a single-node layout built from the CPU
// count and CPU info reported by gopsutil.
//
// It always sets top.nodeIDs to a set holding only genericNodeID and mirrors
// that set into top.Nodes. It then sizes top.Cores to the logical CPU count
// and registers each core through top.insert, using the frequency of the
// first CPU info entry as the speed of every core.
//
// Failures are not reported: if the CPU count cannot be read, top.Cores is
// left untouched; if the CPU info cannot be read (or is empty), top.Cores
// keeps its freshly allocated zero-valued entries.
func scanGeneric(top *Topology) {
	// hardware may or may not be NUMA, but for now we only
	// detect such topology on linux systems
	top.nodeIDs = idset.Empty[hw.NodeID]()
	top.nodeIDs.Insert(genericNodeID)
	top.Nodes = top.nodeIDs.Slice()

	// both gopsutil queries below share one 10 second deadline
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// cores (the true argument asks gopsutil for logical CPUs)
	numCores, err := cpu.CountsWithContext(ctx, true)
	if err != nil {
		return
	}
	top.Cores = make([]Core, numCores)

	stats, err := cpu.InfoWithContext(ctx)
	if err != nil {
		return
	}
	if len(stats) == 0 {
		return
	}

	// every core is given the speed of the first info entry, converted
	// from MHz to KHz
	baseSpeed := hw.KHz(hw.MHz(stats[0].Mhz) * 1000)
	for id := 0; id < numCores; id++ {
		top.insert(genericNodeID, genericSocketID, hw.CoreID(id), Performance, genericMaxSpeed, baseSpeed)
	}
}