mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
numa: fix scheduler panic due to topology serialization bug (#23284)
The NUMA topology struct field `NodeIDs` is an `idset.Set`, which has no public members. As a result, this field is never serialized via msgpack and persisted in state. When `numa.affinity = "prefer"`, the scheduler dereferences this nil field, which causes the scheduler worker to panic. Ideally we would fix this by adding a msgpack serialization extension, but because the field already exists and is always empty on the wire, doing so would break RPC wire compatibility across upgrades. Instead, create a new field that's populated at the same time we populate the more useful `idset.Set`, and repopulate the set on demand. Fixes: https://hashicorp.atlassian.net/browse/NET-9924
This commit is contained in:
@@ -118,13 +118,15 @@ func nomadTopologyFromProto(pb *proto.ClientTopology) *numalib.Topology {
|
||||
if pb == nil {
|
||||
return nil
|
||||
}
|
||||
return &numalib.Topology{
|
||||
NodeIDs: idset.FromFunc(pb.NodeIds, func(i uint32) hw.NodeID { return hw.NodeID(i) }),
|
||||
t := &numalib.Topology{
|
||||
Distances: nomadTopologyDistancesFromProto(pb.Distances),
|
||||
Cores: nomadTopologyCoresFromProto(pb.Cores),
|
||||
OverrideTotalCompute: hw.MHz(pb.OverrideTotalCompute),
|
||||
OverrideWitholdCompute: hw.MHz(pb.OverrideWitholdCompute),
|
||||
}
|
||||
t.SetNodes(idset.FromFunc(pb.NodeIds, func(i uint32) hw.NodeID { return hw.NodeID(i) }))
|
||||
|
||||
return t
|
||||
}
|
||||
|
||||
func nomadTopologyDistancesFromProto(pb *proto.ClientTopologySLIT) numalib.SLIT {
|
||||
@@ -166,7 +168,7 @@ func nomadTopologyToProto(top *numalib.Topology) *proto.ClientTopology {
|
||||
return nil
|
||||
}
|
||||
return &proto.ClientTopology{
|
||||
NodeIds: helper.ConvertSlice(top.NodeIDs.Slice(), func(id hw.NodeID) uint32 { return uint32(id) }),
|
||||
NodeIds: helper.ConvertSlice(top.GetNodes().Slice(), func(id hw.NodeID) uint32 { return uint32(id) }),
|
||||
Distances: nomadTopologyDistancesToProto(top.Distances),
|
||||
Cores: nomadTopologyCoresToProto(top.Cores),
|
||||
OverrideTotalCompute: uint64(top.OverrideTotalCompute),
|
||||
|
||||
@@ -15,7 +15,6 @@ import (
|
||||
|
||||
func Test_nomadTopologyToProto(t *testing.T) {
|
||||
top := &numalib.Topology{
|
||||
NodeIDs: idset.From[hw.NodeID]([]hw.NodeID{0, 1}),
|
||||
Distances: numalib.SLIT{{10, 20}, {20, 10}},
|
||||
Cores: []numalib.Core{
|
||||
{
|
||||
@@ -32,6 +31,7 @@ func Test_nomadTopologyToProto(t *testing.T) {
|
||||
OverrideTotalCompute: 90_000,
|
||||
OverrideWitholdCompute: 2000,
|
||||
}
|
||||
top.SetNodes(idset.From[hw.NodeID]([]hw.NodeID{0, 1}))
|
||||
|
||||
pb := nomadTopologyToProto(top)
|
||||
must.Eq(t, &proto.ClientTopology{
|
||||
@@ -80,8 +80,7 @@ func Test_nomadTopologyFromProto(t *testing.T) {
|
||||
OverrideWitholdCompute: 2000,
|
||||
}
|
||||
top := nomadTopologyFromProto(pb)
|
||||
must.Eq(t, &numalib.Topology{
|
||||
NodeIDs: idset.From[hw.NodeID]([]hw.NodeID{0, 1}),
|
||||
expect := &numalib.Topology{
|
||||
Distances: numalib.SLIT{{10, 20}, {20, 10}},
|
||||
Cores: []numalib.Core{
|
||||
{
|
||||
@@ -97,7 +96,9 @@ func Test_nomadTopologyFromProto(t *testing.T) {
|
||||
},
|
||||
OverrideTotalCompute: 90_000,
|
||||
OverrideWitholdCompute: 2000,
|
||||
}, top)
|
||||
}
|
||||
expect.SetNodes(idset.From[hw.NodeID]([]hw.NodeID{0, 1}))
|
||||
must.Eq(t, expect, top)
|
||||
}
|
||||
|
||||
func Test_nomadTopologyDistancesToProto(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user