mirror of
https://github.com/kemko/nomad.git
synced 2026-01-03 17:05:43 +03:00
When the Nomad client restarts and restores allocations, the network namespace for an allocation may exist but no longer be correctly configured. For example, if the host is rebooted and the task was a Docker task using a pause container, the network namespace may be recreated by the docker daemon. When we restore an allocation, use the CNI "check" command to verify that any existing network namespace matches the expected configuration. This requires CNI plugins of at least version 1.2.0 to avoid a bug in older plugin versions that would cause the check to fail. If the check fails, destroy the network namespace and try to recreate it from scratch once. If that fails in the second pass, fail the restore so that the allocation can be recreated (rather than silently having networking fail). This should fix the gap left #24650 for Docker task drivers and any other drivers with the `MustInitiateNetwork` capability. Fixes: https://github.com/hashicorp/nomad/issues/24292 Ref: https://github.com/hashicorp/nomad/pull/24650
284 lines
8.7 KiB
Go
284 lines
8.7 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package allocrunner
|
|
|
|
import (
|
|
"fmt"
|
|
"testing"
|
|
|
|
"github.com/hashicorp/nomad/ci"
|
|
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
|
"github.com/hashicorp/nomad/client/taskenv"
|
|
"github.com/hashicorp/nomad/helper/testlog"
|
|
"github.com/hashicorp/nomad/nomad/mock"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/nomad/plugins/drivers"
|
|
"github.com/hashicorp/nomad/plugins/drivers/testutils"
|
|
"github.com/hashicorp/nomad/testutil"
|
|
"github.com/shoenig/test"
|
|
"github.com/shoenig/test/must"
|
|
)
|
|
|
|
// statically assert network hook implements the expected interfaces
|
|
var _ interfaces.RunnerPrerunHook = (*networkHook)(nil)
|
|
var _ interfaces.RunnerPostrunHook = (*networkHook)(nil)
|
|
|
|
type mockNetworkIsolationSetter struct {
|
|
t *testing.T
|
|
expectedSpec *drivers.NetworkIsolationSpec
|
|
called bool
|
|
}
|
|
|
|
func (m *mockNetworkIsolationSetter) SetNetworkIsolation(spec *drivers.NetworkIsolationSpec) {
|
|
m.called = true
|
|
test.Eq(m.t, m.expectedSpec, spec)
|
|
}
|
|
|
|
type mockNetworkStatusSetter struct {
|
|
t *testing.T
|
|
expectedStatus *structs.AllocNetworkStatus
|
|
called bool
|
|
}
|
|
|
|
func (m *mockNetworkStatusSetter) SetNetworkStatus(status *structs.AllocNetworkStatus) {
|
|
m.called = true
|
|
test.Eq(m.t, m.expectedStatus, status)
|
|
}
|
|
|
|
// Test that the prerun and postrun hooks call the setter with the expected
|
|
// NetworkIsolationSpec for group bridge network.
|
|
func TestNetworkHook_Prerun_Postrun_group(t *testing.T) {
|
|
ci.Parallel(t)
|
|
|
|
alloc := mock.Alloc()
|
|
alloc.Job.TaskGroups[0].Networks = []*structs.NetworkResource{
|
|
{Mode: "bridge"},
|
|
}
|
|
|
|
spec := &drivers.NetworkIsolationSpec{
|
|
Mode: drivers.NetIsolationModeGroup,
|
|
Path: "test",
|
|
Labels: map[string]string{"abc": "123"},
|
|
}
|
|
|
|
destroyCalled := false
|
|
nm := &testutils.MockDriver{
|
|
MockNetworkManager: testutils.MockNetworkManager{
|
|
CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) {
|
|
test.Eq(t, alloc.ID, allocID)
|
|
return spec, true, nil
|
|
},
|
|
|
|
DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error {
|
|
destroyCalled = true
|
|
test.Eq(t, alloc.ID, allocID)
|
|
test.Eq(t, spec, netSpec)
|
|
return nil
|
|
},
|
|
},
|
|
}
|
|
setter := &mockNetworkIsolationSetter{
|
|
t: t,
|
|
expectedSpec: spec,
|
|
}
|
|
statusSetter := &mockNetworkStatusSetter{
|
|
t: t,
|
|
expectedStatus: nil,
|
|
}
|
|
|
|
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, nil, alloc.Job.Region)
|
|
logger := testlog.HCLogger(t)
|
|
hook := newNetworkHook(logger, setter, alloc, nm, &hostNetworkConfigurator{}, statusSetter, envBuilder.Build())
|
|
must.NoError(t, hook.Prerun())
|
|
must.True(t, setter.called)
|
|
must.False(t, destroyCalled)
|
|
must.NoError(t, hook.Postrun())
|
|
must.True(t, destroyCalled)
|
|
}
|
|
|
|
// Test that prerun and postrun hooks do not expect a NetworkIsolationSpec
|
|
func TestNetworkHook_Prerun_Postrun_host(t *testing.T) {
|
|
ci.Parallel(t)
|
|
|
|
alloc := mock.Alloc()
|
|
alloc.Job.TaskGroups[0].Networks = []*structs.NetworkResource{
|
|
{Mode: "host"},
|
|
}
|
|
|
|
destroyCalled := false
|
|
nm := &testutils.MockDriver{
|
|
MockNetworkManager: testutils.MockNetworkManager{
|
|
CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) {
|
|
test.Unreachable(t, test.Sprintf("should not call CreateNetwork for host network"))
|
|
return nil, false, nil
|
|
},
|
|
|
|
DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error {
|
|
destroyCalled = true
|
|
test.Nil(t, netSpec)
|
|
return nil
|
|
},
|
|
},
|
|
}
|
|
setter := &mockNetworkIsolationSetter{
|
|
t: t,
|
|
expectedSpec: nil,
|
|
}
|
|
statusSetter := &mockNetworkStatusSetter{
|
|
t: t,
|
|
expectedStatus: nil,
|
|
}
|
|
|
|
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, nil, alloc.Job.Region)
|
|
logger := testlog.HCLogger(t)
|
|
hook := newNetworkHook(logger, setter, alloc, nm, &hostNetworkConfigurator{}, statusSetter, envBuilder.Build())
|
|
must.NoError(t, hook.Prerun())
|
|
must.False(t, setter.called)
|
|
must.False(t, destroyCalled)
|
|
must.NoError(t, hook.Postrun())
|
|
must.True(t, destroyCalled)
|
|
}
|
|
|
|
// TestNetworkHook_Prerun_Postrun_ExistingNetNS tests that the prerun and
|
|
// postrun hooks call the Setup and Destroy with the expected behaviors when the
|
|
// network namespace already exists (typical of agent restarts and host reboots)
|
|
func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) {
|
|
ci.Parallel(t)
|
|
|
|
alloc := mock.Alloc()
|
|
alloc.Job.TaskGroups[0].Networks = []*structs.NetworkResource{
|
|
{Mode: "bridge"},
|
|
}
|
|
|
|
spec := &drivers.NetworkIsolationSpec{
|
|
Mode: drivers.NetIsolationModeGroup,
|
|
Path: "test",
|
|
Labels: map[string]string{"abc": "123"},
|
|
}
|
|
isolationSetter := &mockNetworkIsolationSetter{t: t, expectedSpec: spec}
|
|
statusSetter := &mockNetworkStatusSetter{t: t, expectedStatus: nil}
|
|
|
|
callCounts := testutil.NewCallCounter()
|
|
|
|
nm := &testutils.MockDriver{
|
|
MockNetworkManager: testutils.MockNetworkManager{
|
|
CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) {
|
|
test.Eq(t, alloc.ID, allocID)
|
|
callCounts.Inc("CreateNetwork")
|
|
return spec, false, nil
|
|
},
|
|
|
|
DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error {
|
|
test.Eq(t, alloc.ID, allocID)
|
|
test.Eq(t, spec, netSpec)
|
|
callCounts.Inc("DestroyNetwork")
|
|
return nil
|
|
},
|
|
},
|
|
}
|
|
|
|
fakePlugin := newMockCNIPlugin()
|
|
|
|
configurator := &cniNetworkConfigurator{
|
|
nodeAttrs: map[string]string{
|
|
"plugins.cni.version.bridge": "1.6.1",
|
|
},
|
|
nodeMeta: map[string]string{},
|
|
logger: testlog.HCLogger(t),
|
|
cni: fakePlugin,
|
|
nsOpts: &nsOpts{},
|
|
}
|
|
|
|
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, nil, alloc.Job.Region)
|
|
|
|
testCases := []struct {
|
|
name string
|
|
cniVersion string
|
|
checkErrs []error
|
|
setupErrs []string
|
|
expectPrerunCreateNetworkCalls int
|
|
expectPrerunDestroyNetworkCalls int
|
|
expectCheckCalls int
|
|
expectSetupCalls int
|
|
expectPostrunDestroyNetworkCalls int
|
|
expectPrerunError string
|
|
}{
|
|
{
|
|
name: "good check",
|
|
cniVersion: "1.6.1",
|
|
expectPrerunCreateNetworkCalls: 1,
|
|
expectPrerunDestroyNetworkCalls: 0,
|
|
expectCheckCalls: 1,
|
|
expectSetupCalls: 0,
|
|
expectPostrunDestroyNetworkCalls: 1,
|
|
},
|
|
{
|
|
name: "initial check fails",
|
|
cniVersion: "1.6.1",
|
|
checkErrs: []error{fmt.Errorf("whatever")},
|
|
expectPrerunCreateNetworkCalls: 2,
|
|
expectPrerunDestroyNetworkCalls: 1,
|
|
expectCheckCalls: 2,
|
|
expectSetupCalls: 0,
|
|
expectPostrunDestroyNetworkCalls: 2,
|
|
},
|
|
{
|
|
name: "check fails twice",
|
|
cniVersion: "1.6.1",
|
|
checkErrs: []error{
|
|
fmt.Errorf("whatever"),
|
|
fmt.Errorf("whatever"),
|
|
},
|
|
expectPrerunCreateNetworkCalls: 2,
|
|
expectPrerunDestroyNetworkCalls: 1,
|
|
expectCheckCalls: 2,
|
|
expectSetupCalls: 0,
|
|
expectPostrunDestroyNetworkCalls: 2,
|
|
expectPrerunError: "failed to configure networking for alloc: network namespace already exists but was misconfigured: whatever",
|
|
},
|
|
{
|
|
name: "old CNI version skips check",
|
|
cniVersion: "1.2.0",
|
|
expectPrerunCreateNetworkCalls: 1,
|
|
expectPrerunDestroyNetworkCalls: 0,
|
|
expectCheckCalls: 0,
|
|
expectSetupCalls: 0,
|
|
expectPostrunDestroyNetworkCalls: 1,
|
|
},
|
|
}
|
|
|
|
for _, tc := range testCases {
|
|
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
callCounts.Reset()
|
|
fakePlugin.counter.Reset()
|
|
fakePlugin.checkErrors = tc.checkErrs
|
|
configurator.nodeAttrs["plugins.cni.version.bridge"] = tc.cniVersion
|
|
hook := newNetworkHook(testlog.HCLogger(t), isolationSetter,
|
|
alloc, nm, configurator, statusSetter, envBuilder.Build())
|
|
|
|
err := hook.Prerun()
|
|
if tc.expectPrerunError == "" {
|
|
must.NoError(t, err)
|
|
} else {
|
|
must.EqError(t, err, tc.expectPrerunError)
|
|
}
|
|
|
|
test.Eq(t, tc.expectPrerunDestroyNetworkCalls,
|
|
callCounts.Get()["DestroyNetwork"], test.Sprint("DestroyNetwork calls after prerun"))
|
|
test.Eq(t, tc.expectPrerunCreateNetworkCalls,
|
|
callCounts.Get()["CreateNetwork"], test.Sprint("CreateNetwork calls after prerun"))
|
|
|
|
test.Eq(t, tc.expectCheckCalls, fakePlugin.counter.Get()["Check"], test.Sprint("Check calls"))
|
|
test.Eq(t, tc.expectSetupCalls, fakePlugin.counter.Get()["Setup"], test.Sprint("Setup calls"))
|
|
|
|
must.NoError(t, hook.Postrun())
|
|
test.Eq(t, tc.expectPostrunDestroyNetworkCalls,
|
|
callCounts.Get()["DestroyNetwork"], test.Sprint("DestroyNetwork calls after postrun"))
|
|
|
|
})
|
|
|
|
}
|
|
}
|