From c20f854d162efe9cbe820e8a92de639778c32278 Mon Sep 17 00:00:00 2001 From: Michael Smithhisler Date: Thu, 11 Sep 2025 13:10:21 -0400 Subject: [PATCH] client: set network status on tasks when restoring allocations (#26699) The allocation network hook was not properly restoring network status from state when the network had previously been setup. This led to missing environment variables, misconfigured hosts file, and resolv.conf when a task was restarted after the nomad agent has restarted. --------- Co-authored-by: Daniel Bennett --- .changelog/26699.txt | 3 +++ client/allocrunner/network_hook.go | 24 ++++++++++++------- client/allocrunner/network_hook_linux_test.go | 16 ++++++++++++- client/allocrunner/network_hook_test.go | 20 ++++++++++------ 4 files changed, 47 insertions(+), 16 deletions(-) create mode 100644 .changelog/26699.txt diff --git a/.changelog/26699.txt b/.changelog/26699.txt new file mode 100644 index 000000000..12ebcf9ec --- /dev/null +++ b/.changelog/26699.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: restore task network status on client restart so restarted tasks receive proper networking environment variables, hosts file, and resolv.conf. +``` diff --git a/client/allocrunner/network_hook.go b/client/allocrunner/network_hook.go index 1ac34d678..e3e34af96 100644 --- a/client/allocrunner/network_hook.go +++ b/client/allocrunner/network_hook.go @@ -49,8 +49,9 @@ func (a *allocNetworkIsolationSetter) SetNetworkIsolation(n *drivers.NetworkIsol } } -type networkStatusSetter interface { +type networkStatus interface { SetNetworkStatus(*structs.AllocNetworkStatus) + NetworkStatus() *structs.AllocNetworkStatus } // networkHook is an alloc lifecycle hook that manages the network namespace @@ -60,9 +61,9 @@ type networkHook struct { // network is created isolationSetter networkIsolationSetter - // statusSetter is a callback to the alloc runner to set the network status once - // network setup is complete - networkStatusSetter networkStatusSetter + // statusSetter is a callback to the alloc runner to get or set the network + // status once network setup is complete + networkStatus networkStatus // manager is used when creating the network namespace. This defaults to // bind mounting a network namespace descritor under /var/run/netns but @@ -87,11 +88,11 @@ func newNetworkHook(logger hclog.Logger, alloc *structs.Allocation, netManager drivers.DriverNetworkManager, netConfigurator NetworkConfigurator, - networkStatusSetter networkStatusSetter, + networkStatus networkStatus, ) *networkHook { return &networkHook{ isolationSetter: ns, - networkStatusSetter: networkStatusSetter, + networkStatus: networkStatus, alloc: alloc, manager: netManager, networkConfigurator: netConfigurator, @@ -168,8 +169,14 @@ CREATE: return fmt.Errorf("failed to configure networking for alloc: %v", err) } + // A nil status indicates a netns already exists and is configured correctly. + // It should have been saved to the local state store. if status == nil { - return nil // netns already existed and was correctly configured + stateStatus := h.networkStatus.NetworkStatus() + if stateStatus == nil { + return errors.New("network already configured but not found in state") + } + status = stateStatus } // If the driver set the sandbox hostname label, then we will use that @@ -196,7 +203,8 @@ CREATE: } } - h.networkStatusSetter.SetNetworkStatus(status) + // Saves the network status state, and also propagates status to task runners. + h.networkStatus.SetNetworkStatus(status) } return nil } diff --git a/client/allocrunner/network_hook_linux_test.go b/client/allocrunner/network_hook_linux_test.go index 0f9dc98e6..284796d2b 100644 --- a/client/allocrunner/network_hook_linux_test.go +++ b/client/allocrunner/network_hook_linux_test.go @@ -39,7 +39,7 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { Labels: map[string]string{"abc": "123"}, } isolationSetter := &mockNetworkIsolationSetter{t: t, expectedSpec: spec} - statusSetter := &mockNetworkStatusSetter{t: t, expectedStatus: nil} + statusSetter := &mockNetworkStatus{t: t, expectedStatus: mock.AllocNetworkStatus()} callCounts := testutil.NewCallCounter() @@ -84,6 +84,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { expectSetupCalls int expectPostrunDestroyNetworkCalls int expectPrerunError string + expectGetStatusCalls int + expectSetStatusCalls int }{ { name: "good check", @@ -92,6 +94,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { expectPrerunDestroyNetworkCalls: 0, expectCheckCalls: 1, expectSetupCalls: 0, + expectGetStatusCalls: 1, + expectSetStatusCalls: 1, expectPostrunDestroyNetworkCalls: 1, }, { @@ -102,6 +106,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { expectPrerunDestroyNetworkCalls: 1, expectCheckCalls: 2, expectSetupCalls: 0, + expectGetStatusCalls: 1, + expectSetStatusCalls: 1, expectPostrunDestroyNetworkCalls: 2, }, { @@ -116,6 +122,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { expectCheckCalls: 2, expectSetupCalls: 0, expectPostrunDestroyNetworkCalls: 2, + expectGetStatusCalls: 0, + expectSetStatusCalls: 0, expectPrerunError: "failed to configure networking for alloc: network namespace already exists but was misconfigured: whatever", }, { @@ -125,6 +133,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { expectPrerunDestroyNetworkCalls: 0, expectCheckCalls: 0, expectSetupCalls: 0, + expectGetStatusCalls: 1, + expectSetStatusCalls: 1, expectPostrunDestroyNetworkCalls: 1, }, } @@ -135,6 +145,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { callCounts.Reset() fakePlugin.counter.Reset() fakePlugin.checkErrors = tc.checkErrs + statusSetter.getCalls = 0 + statusSetter.setCalls = 0 configurator.nodeAttrs["plugins.cni.version.bridge"] = tc.cniVersion hook := newNetworkHook(testlog.HCLogger(t), isolationSetter, alloc, nm, configurator, statusSetter) @@ -158,6 +170,8 @@ func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { test.Eq(t, tc.expectPostrunDestroyNetworkCalls, callCounts.Get()["DestroyNetwork"], test.Sprint("DestroyNetwork calls after postrun")) + test.Eq(t, tc.expectGetStatusCalls, statusSetter.getCalls, test.Sprint("NetworkStatus Calls")) + test.Eq(t, tc.expectSetStatusCalls, statusSetter.setCalls, test.Sprint("SetNetworkStatus Calls")) }) } } diff --git a/client/allocrunner/network_hook_test.go b/client/allocrunner/network_hook_test.go index ae13977fe..182dcbe3e 100644 --- a/client/allocrunner/network_hook_test.go +++ b/client/allocrunner/network_hook_test.go @@ -28,17 +28,23 @@ func (m *mockNetworkIsolationSetter) SetNetworkIsolation(spec *drivers.NetworkIs test.Eq(m.t, m.expectedSpec, spec) } -type mockNetworkStatusSetter struct { +type mockNetworkStatus struct { t *testing.T expectedStatus *structs.AllocNetworkStatus - called bool + getCalls int + setCalls int } -func (m *mockNetworkStatusSetter) SetNetworkStatus(status *structs.AllocNetworkStatus) { - m.called = true +func (m *mockNetworkStatus) SetNetworkStatus(status *structs.AllocNetworkStatus) { + m.setCalls++ test.Eq(m.t, m.expectedStatus, status) } +func (m *mockNetworkStatus) NetworkStatus() *structs.AllocNetworkStatus { + m.getCalls++ + return m.expectedStatus +} + // Test that the prerun and postrun hooks call the setter with the expected // NetworkIsolationSpec for group bridge network. func TestNetworkHook_Prerun_Postrun_group(t *testing.T) { @@ -75,9 +81,9 @@ func TestNetworkHook_Prerun_Postrun_group(t *testing.T) { t: t, expectedSpec: spec, } - statusSetter := &mockNetworkStatusSetter{ + statusSetter := &mockNetworkStatus{ t: t, - expectedStatus: nil, + expectedStatus: mock.AllocNetworkStatus(), } logger := testlog.HCLogger(t) @@ -119,7 +125,7 @@ func TestNetworkHook_Prerun_Postrun_host(t *testing.T) { t: t, expectedSpec: nil, } - statusSetter := &mockNetworkStatusSetter{ + statusSetter := &mockNetworkStatus{ t: t, expectedStatus: nil, }