numa: fix scheduler panic due to topology serialization bug (#23284)

The NUMA topology struct field `NodeIDs` is an `idset.Set`, which has no public
members. As a result, this field is never serialized via msgpack or persisted
in state. When `numa.affinity = "prefer"`, the scheduler dereferences this nil
field, which panics the scheduler worker.

Ideally we would fix this by adding a msgpack serialization extension, but
because the field already exists and is currently always empty, doing so would
break RPC wire compatibility across upgrades. Instead, create a new field that's
populated at the same time we populate the more useful `idset.Set`, and
repopulate the set from that field on demand.

Fixes: https://hashicorp.atlassian.net/browse/NET-9924
This commit is contained in:
Tim Gross
2024-06-11 08:55:00 -04:00
committed by GitHub
parent 288a048a2e
commit 7d73065066
19 changed files with 134 additions and 61 deletions

View File

@@ -118,13 +118,15 @@ func nomadTopologyFromProto(pb *proto.ClientTopology) *numalib.Topology {
if pb == nil {
return nil
}
return &numalib.Topology{
NodeIDs: idset.FromFunc(pb.NodeIds, func(i uint32) hw.NodeID { return hw.NodeID(i) }),
t := &numalib.Topology{
Distances: nomadTopologyDistancesFromProto(pb.Distances),
Cores: nomadTopologyCoresFromProto(pb.Cores),
OverrideTotalCompute: hw.MHz(pb.OverrideTotalCompute),
OverrideWitholdCompute: hw.MHz(pb.OverrideWitholdCompute),
}
t.SetNodes(idset.FromFunc(pb.NodeIds, func(i uint32) hw.NodeID { return hw.NodeID(i) }))
return t
}
func nomadTopologyDistancesFromProto(pb *proto.ClientTopologySLIT) numalib.SLIT {
@@ -166,7 +168,7 @@ func nomadTopologyToProto(top *numalib.Topology) *proto.ClientTopology {
return nil
}
return &proto.ClientTopology{
NodeIds: helper.ConvertSlice(top.NodeIDs.Slice(), func(id hw.NodeID) uint32 { return uint32(id) }),
NodeIds: helper.ConvertSlice(top.GetNodes().Slice(), func(id hw.NodeID) uint32 { return uint32(id) }),
Distances: nomadTopologyDistancesToProto(top.Distances),
Cores: nomadTopologyCoresToProto(top.Cores),
OverrideTotalCompute: uint64(top.OverrideTotalCompute),

View File

@@ -15,7 +15,6 @@ import (
func Test_nomadTopologyToProto(t *testing.T) {
top := &numalib.Topology{
NodeIDs: idset.From[hw.NodeID]([]hw.NodeID{0, 1}),
Distances: numalib.SLIT{{10, 20}, {20, 10}},
Cores: []numalib.Core{
{
@@ -32,6 +31,7 @@ func Test_nomadTopologyToProto(t *testing.T) {
OverrideTotalCompute: 90_000,
OverrideWitholdCompute: 2000,
}
top.SetNodes(idset.From[hw.NodeID]([]hw.NodeID{0, 1}))
pb := nomadTopologyToProto(top)
must.Eq(t, &proto.ClientTopology{
@@ -80,8 +80,7 @@ func Test_nomadTopologyFromProto(t *testing.T) {
OverrideWitholdCompute: 2000,
}
top := nomadTopologyFromProto(pb)
must.Eq(t, &numalib.Topology{
NodeIDs: idset.From[hw.NodeID]([]hw.NodeID{0, 1}),
expect := &numalib.Topology{
Distances: numalib.SLIT{{10, 20}, {20, 10}},
Cores: []numalib.Core{
{
@@ -97,7 +96,9 @@ func Test_nomadTopologyFromProto(t *testing.T) {
},
OverrideTotalCompute: 90_000,
OverrideWitholdCompute: 2000,
}, top)
}
expect.SetNodes(idset.From[hw.NodeID]([]hw.NodeID{0, 1}))
must.Eq(t, expect, top)
}
func Test_nomadTopologyDistancesToProto(t *testing.T) {