diff --git a/.changelog/17455.txt b/.changelog/17455.txt new file mode 100644 index 000000000..6f5ffc3be --- /dev/null +++ b/.changelog/17455.txt @@ -0,0 +1,3 @@ +```release-note:bug +docker: Fixed a bug where network pause container would not be removed after node restart +``` diff --git a/client/allocrunner/network_hook.go b/client/allocrunner/network_hook.go index d822f6508..bc85e9ce4 100644 --- a/client/allocrunner/network_hook.go +++ b/client/allocrunner/network_hook.go @@ -178,12 +178,14 @@ func (h *networkHook) Prerun() error { } func (h *networkHook) Postrun() error { - if h.spec == nil { - return nil + + // we need the spec for network teardown + if h.spec != nil { + if err := h.networkConfigurator.Teardown(context.TODO(), h.alloc, h.spec); err != nil { + h.logger.Error("failed to cleanup network for allocation, resources may have leaked", "alloc", h.alloc.ID, "error", err) + } } - if err := h.networkConfigurator.Teardown(context.TODO(), h.alloc, h.spec); err != nil { - h.logger.Error("failed to cleanup network for allocation, resources may have leaked", "alloc", h.alloc.ID, "error", err) - } + // issue driver destroy regardless if we have a spec (e.g. cleanup pause container) return h.manager.DestroyNetwork(h.alloc.ID, h.spec) } diff --git a/client/allocrunner/network_hook_test.go b/client/allocrunner/network_hook_test.go index 07d9cabd7..bda7f0ed0 100644 --- a/client/allocrunner/network_hook_test.go +++ b/client/allocrunner/network_hook_test.go @@ -14,7 +14,8 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/plugins/drivers/testutils" - "github.com/stretchr/testify/require" + "github.com/shoenig/test" + "github.com/shoenig/test/must" ) // statically assert network hook implements the expected interfaces @@ -29,7 +30,7 @@ type mockNetworkIsolationSetter struct { func (m *mockNetworkIsolationSetter) SetNetworkIsolation(spec *drivers.NetworkIsolationSpec) { m.called = true - require.Exactly(m.t, m.expectedSpec, spec) + test.Eq(m.t, m.expectedSpec, spec) } type mockNetworkStatusSetter struct { @@ -40,20 +41,19 @@ type mockNetworkStatusSetter struct { func (m *mockNetworkStatusSetter) SetNetworkStatus(status *structs.AllocNetworkStatus) { m.called = true - require.Exactly(m.t, m.expectedStatus, status) + test.Eq(m.t, m.expectedStatus, status) } -// Test that the prerun and postrun hooks call the setter with the expected spec when -// the network mode is not host -func TestNetworkHook_Prerun_Postrun(t *testing.T) { +// Test that the prerun and postrun hooks call the setter with the expected +// NetworkIsolationSpec for group bridge network. +func TestNetworkHook_Prerun_Postrun_group(t *testing.T) { ci.Parallel(t) alloc := mock.Alloc() alloc.Job.TaskGroups[0].Networks = []*structs.NetworkResource{ - { - Mode: "bridge", - }, + {Mode: "bridge"}, } + spec := &drivers.NetworkIsolationSpec{ Mode: drivers.NetIsolationModeGroup, Path: "test", @@ -64,14 +64,14 @@ func TestNetworkHook_Prerun_Postrun(t *testing.T) { nm := &testutils.MockDriver{ MockNetworkManager: testutils.MockNetworkManager{ CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) { - require.Equal(t, alloc.ID, allocID) + test.Eq(t, alloc.ID, allocID) return spec, false, nil }, DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error { destroyCalled = true - require.Equal(t, alloc.ID, allocID) - require.Exactly(t, spec, netSpec) + test.Eq(t, alloc.ID, allocID) + test.Eq(t, spec, netSpec) return nil }, }, @@ -84,26 +84,56 @@ func TestNetworkHook_Prerun_Postrun(t *testing.T) { t: t, expectedStatus: nil, } - require := require.New(t) envBuilder := taskenv.NewBuilder(mock.Node(), alloc, nil, alloc.Job.Region) - logger := testlog.HCLogger(t) hook := newNetworkHook(logger, setter, alloc, nm, &hostNetworkConfigurator{}, statusSetter, envBuilder.Build()) - require.NoError(hook.Prerun()) - require.True(setter.called) - require.False(destroyCalled) - require.NoError(hook.Postrun()) - require.True(destroyCalled) - - // reset and use host network mode - setter.called = false - destroyCalled = false - alloc.Job.TaskGroups[0].Networks[0].Mode = "host" - hook = newNetworkHook(logger, setter, alloc, nm, &hostNetworkConfigurator{}, statusSetter, envBuilder.Build()) - require.NoError(hook.Prerun()) - require.False(setter.called) - require.False(destroyCalled) - require.NoError(hook.Postrun()) - require.False(destroyCalled) + must.NoError(t, hook.Prerun()) + must.True(t, setter.called) + must.False(t, destroyCalled) + must.NoError(t, hook.Postrun()) + must.True(t, destroyCalled) +} + +// Test that prerun and postrun hooks do not expect a NetworkIsolationSpec +func TestNetworkHook_Prerun_Postrun_host(t *testing.T) { + ci.Parallel(t) + + alloc := mock.Alloc() + alloc.Job.TaskGroups[0].Networks = []*structs.NetworkResource{ + {Mode: "host"}, + } + + destroyCalled := false + nm := &testutils.MockDriver{ + MockNetworkManager: testutils.MockNetworkManager{ + CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) { + test.Unreachable(t, test.Sprintf("should not call CreateNetwork for host network")) + return nil, false, nil + }, + + DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error { + destroyCalled = true + test.Nil(t, netSpec) + return nil + }, + }, + } + setter := &mockNetworkIsolationSetter{ + t: t, + expectedSpec: nil, + } + statusSetter := &mockNetworkStatusSetter{ + t: t, + expectedStatus: nil, + } + + envBuilder := taskenv.NewBuilder(mock.Node(), alloc, nil, alloc.Job.Region) + logger := testlog.HCLogger(t) + hook := newNetworkHook(logger, setter, alloc, nm, &hostNetworkConfigurator{}, statusSetter, envBuilder.Build()) + must.NoError(t, hook.Prerun()) + must.False(t, setter.called) + must.False(t, destroyCalled) + must.NoError(t, hook.Postrun()) + must.True(t, destroyCalled) } diff --git a/client/allocrunner/network_manager_linux.go b/client/allocrunner/network_manager_linux.go index f86f39d62..0b7d4cc0a 100644 --- a/client/allocrunner/network_manager_linux.go +++ b/client/allocrunner/network_manager_linux.go @@ -152,6 +152,9 @@ func (*defaultNetworkManager) CreateNetwork(allocID string, _ *drivers.NetworkCr } func (*defaultNetworkManager) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSpec) error { + if spec == nil { + return nil + } return nsutil.UnmountNS(spec.Path) } diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index ccc3aa0cd..0c3227de1 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -35,6 +35,7 @@ import ( "github.com/hashicorp/nomad/plugins/drivers" pstructs "github.com/hashicorp/nomad/plugins/shared/structs" "github.com/ryanuber/go-glob" + "golang.org/x/exp/slices" ) var ( @@ -760,6 +761,39 @@ func (d *Driver) containerBinds(task *drivers.TaskConfig, driverConfig *TaskConf return binds, nil } +func (d *Driver) findPauseContainer(allocID string) (string, error) { + + _, dockerClient, err := d.dockerClients() + if err != nil { + return "", err + } + + containers, listErr := dockerClient.ListContainers(docker.ListContainersOptions{ + Context: d.ctx, + All: false, // running only + Filters: map[string][]string{ + "label": {dockerLabelAllocID}, + }, + }) + if listErr != nil { + d.logger.Error("failed to list pause containers for recovery", "error", listErr) + return "", listErr + } + + for _, c := range containers { + if !slices.ContainsFunc(c.Names, func(s string) bool { + return strings.HasPrefix(s, "/nomad_init_") + }) { + continue + } + if c.Labels[dockerLabelAllocID] == allocID { + return c.ID, nil + } + } + + return "", nil +} + // recoverPauseContainers gets called when we start up the plugin. On client // restarts we need to rebuild the set of pause containers we are // tracking. Basically just scan all containers and pull the ID from anything @@ -779,7 +813,7 @@ func (d *Driver) recoverPauseContainers(ctx context.Context) { }, }) if listErr != nil && listErr != ctx.Err() { - d.logger.Error("failed to list pause containers", "error", listErr) + d.logger.Error("failed to list pause containers for recovery", "error", listErr) return } diff --git a/drivers/docker/network.go b/drivers/docker/network.go index 41b458493..e92acd3e5 100644 --- a/drivers/docker/network.go +++ b/drivers/docker/network.go @@ -93,7 +93,30 @@ func (d *Driver) CreateNetwork(allocID string, createSpec *drivers.NetworkCreate } func (d *Driver) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSpec) error { - id := spec.Labels[dockerNetSpecLabelKey] + + var ( + id string + err error + ) + + if spec != nil { + // if we have the spec we can just read the container id + id = spec.Labels[dockerNetSpecLabelKey] + } else { + // otherwise we need to scan all the containers and find the pause container + // associated with this allocation - this happens when the client is + // restarted since we do not persist the network spec + id, err = d.findPauseContainer(allocID) + } + + if err != nil { + return err + } + + if id == "" { + d.logger.Debug("failed to find pause container to cleanup", "alloc_id", allocID) + return nil + } // no longer tracking this pause container; even if we fail here we should // let the background reconciliation keep trying @@ -104,11 +127,16 @@ func (d *Driver) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSp return fmt.Errorf("failed to connect to docker daemon: %s", err) } + timeout := uint(1) // this is the pause container, just kill it fast + if err := client.StopContainerWithContext(id, timeout, d.ctx); err != nil { + d.logger.Warn("failed to stop pause container", "id", id, "error", err) + } + if err := client.RemoveContainer(docker.RemoveContainerOptions{ Force: true, ID: id, }); err != nil { - return err + return fmt.Errorf("failed to remove pause container: %w", err) } if d.config.GC.Image { diff --git a/plugins/drivers/client.go b/plugins/drivers/client.go index bfa1bc427..c5447dc7c 100644 --- a/plugins/drivers/client.go +++ b/plugins/drivers/client.go @@ -490,6 +490,10 @@ func (d *driverPluginClient) CreateNetwork(allocID string, _ *NetworkCreateReque } func (d *driverPluginClient) DestroyNetwork(allocID string, spec *NetworkIsolationSpec) error { + if spec == nil { + return nil + } + req := &proto.DestroyNetworkRequest{ AllocId: allocID, IsolationSpec: NetworkIsolationSpecToProto(spec),