mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 02:15:43 +03:00
When the Nomad client restarts and restores allocations, the network namespace for an allocation may exist but no longer be correctly configured. For example, if the host is rebooted and the task was a Docker task using a pause container, the network namespace may be recreated by the docker daemon. When we restore an allocation, use the CNI "check" command to verify that any existing network namespace matches the expected configuration. This requires CNI plugins of at least version 1.2.0 to avoid a bug in older plugin versions that would cause the check to fail. If the check fails, destroy the network namespace and try to recreate it from scratch once. If that fails in the second pass, fail the restore so that the allocation can be recreated (rather than silently having networking fail). This should fix the gap left by #24650 for Docker task drivers and any other drivers with the `MustInitiateNetwork` capability. Fixes: https://github.com/hashicorp/nomad/issues/24292 Ref: https://github.com/hashicorp/nomad/pull/24650
213 lines
6.6 KiB
Go
213 lines
6.6 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package allocrunner
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
|
|
hclog "github.com/hashicorp/go-hclog"
|
|
"github.com/hashicorp/nomad/client/taskenv"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/nomad/plugins/drivers"
|
|
"github.com/miekg/dns"
|
|
)
|
|
|
|
const (
	// dockerNetSpecLabelKey is the label added when we create a pause
	// container to own the network namespace, and the NetworkIsolationSpec we
	// get back from CreateNetwork has this label set as the container ID.
	// We'll use this to generate a hostname for the task in the event the user
	// did not specify a custom one. Please see dockerNetSpecHostnameKey.
	dockerNetSpecLabelKey = "docker_sandbox_container_id"

	// dockerNetSpecHostnameKey is the label added when we create a pause
	// container and the task group network includes a user supplied hostname
	// parameter.
	dockerNetSpecHostnameKey = "docker_sandbox_hostname"
)
// ErrCNICheckFailed is the sentinel error returned when a restored network
// namespace exists but the CNI "check" command reports that it no longer
// matches its expected configuration. Prerun matches against this error with
// errors.Is to decide whether to destroy and recreate the namespace.
var ErrCNICheckFailed = errors.New("network namespace already exists but was misconfigured")
// networkIsolationSetter is the callback interface used by the network hook
// to report the network isolation spec once the network namespace has been
// created.
type networkIsolationSetter interface {
	SetNetworkIsolation(*drivers.NetworkIsolationSpec)
}
// allocNetworkIsolationSetter is a shim to allow the alloc network hook to
// set the alloc network isolation configuration without giving the hook full
// access to the alloc runner.
type allocNetworkIsolationSetter struct {
	ar *allocRunner
}
func (a *allocNetworkIsolationSetter) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
|
|
for _, tr := range a.ar.tasks {
|
|
tr.SetNetworkIsolation(n)
|
|
}
|
|
}
|
|
|
|
// networkStatusSetter is the callback interface used by the network hook to
// report the allocation's network status back to the alloc runner once
// network setup is complete.
type networkStatusSetter interface {
	SetNetworkStatus(*structs.AllocNetworkStatus)
}
// networkHook is an alloc lifecycle hook that manages the network namespace
// for an alloc
type networkHook struct {
	// isolationSetter is a callback to set the network isolation spec after
	// the network is created
	isolationSetter networkIsolationSetter

	// networkStatusSetter is a callback to the alloc runner to set the network
	// status once network setup is complete
	networkStatusSetter networkStatusSetter

	// manager is used when creating the network namespace. This defaults to
	// bind mounting a network namespace descriptor under /var/run/netns but
	// can be created by a driver if necessary
	manager drivers.DriverNetworkManager

	// alloc should only be read from
	alloc *structs.Allocation

	// spec describes the network namespace; it is set by Prerun when the
	// network is created and read by Postrun for teardown
	spec *drivers.NetworkIsolationSpec

	// networkConfigurator configures the network interfaces, routes, etc once
	// the alloc network has been created
	networkConfigurator NetworkConfigurator

	// taskEnv is used to perform interpolation within the network blocks.
	taskEnv *taskenv.TaskEnv

	logger hclog.Logger
}
func newNetworkHook(logger hclog.Logger,
|
|
ns networkIsolationSetter,
|
|
alloc *structs.Allocation,
|
|
netManager drivers.DriverNetworkManager,
|
|
netConfigurator NetworkConfigurator,
|
|
networkStatusSetter networkStatusSetter,
|
|
taskEnv *taskenv.TaskEnv,
|
|
) *networkHook {
|
|
return &networkHook{
|
|
isolationSetter: ns,
|
|
networkStatusSetter: networkStatusSetter,
|
|
alloc: alloc,
|
|
manager: netManager,
|
|
networkConfigurator: netConfigurator,
|
|
taskEnv: taskEnv,
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
func (h *networkHook) Name() string {
|
|
return "network"
|
|
}
|
|
|
|
// Prerun creates and configures the network namespace for the alloc when its
// task group uses a non-host network mode. On success it publishes the
// isolation spec to the task runners and the network status to the alloc
// runner. If an existing namespace fails the CNI check (ErrCNICheckFailed),
// the namespace is destroyed and recreated from scratch exactly once before
// the error is returned.
func (h *networkHook) Prerun() error {
	// Nothing to do for host-mode or unset networking.
	tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
	if len(tg.Networks) == 0 || tg.Networks[0].Mode == "host" || tg.Networks[0].Mode == "" {
		return nil
	}

	// On platforms without shared network namespace support, manager and/or
	// configurator are nil and this hook is a no-op.
	if h.manager == nil || h.networkConfigurator == nil {
		h.logger.Trace("shared network namespaces are not supported on this platform, skipping network hook")
		return nil
	}

	// Perform our networks block interpolation.
	interpolatedNetworks := taskenv.InterpolateNetworks(h.taskEnv, tg.Networks)

	// Interpolated values need to be validated. It is also possible a user
	// supplied hostname avoids the validation on job registrations because it
	// looks like it includes interpolation, when it doesn't.
	if interpolatedNetworks[0].Hostname != "" {
		if _, ok := dns.IsDomainName(interpolatedNetworks[0].Hostname); !ok {
			return fmt.Errorf("network hostname %q is not a valid DNS name", interpolatedNetworks[0].Hostname)
		}
	}

	// Our network create request.
	networkCreateReq := drivers.NetworkCreateRequest{
		Hostname: interpolatedNetworks[0].Hostname,
	}

	// checkedOnce bounds the destroy-and-recreate retry below to one attempt.
	var checkedOnce bool

CREATE:
	spec, created, err := h.manager.CreateNetwork(h.alloc.ID, &networkCreateReq)
	if err != nil {
		return fmt.Errorf("failed to create network for alloc: %v", err)
	}

	// Publish the spec to the task runners before configuring it, so tasks
	// know which namespace to join.
	if spec != nil {
		h.spec = spec
		h.isolationSetter.SetNetworkIsolation(spec)
	}

	if spec != nil {
		status, err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec, created)
		if err != nil {
			// if the netns already existed but is invalid, we get
			// ErrCNICheckFailed. We'll try to recover from this one time by
			// recreating the netns from scratch before giving up
			if errors.Is(err, ErrCNICheckFailed) && !checkedOnce {
				checkedOnce = true
				destroyErr := h.manager.DestroyNetwork(h.alloc.ID, spec)
				if destroyErr != nil {
					return fmt.Errorf("%w: destroying network to retry failed: %v", err, destroyErr)
				}
				goto CREATE
			}

			return fmt.Errorf("failed to configure networking for alloc: %v", err)
		}
		if status == nil {
			return nil // netns already existed and was correctly configured
		}

		// If the driver set the sandbox hostname label, then we will use that
		// to set the HostsConfig.Hostname. Otherwise, identify the sandbox
		// container ID which will have been used to set the network namespace
		// hostname.
		if hostname, ok := spec.Labels[dockerNetSpecHostnameKey]; ok {
			h.spec.HostsConfig = &drivers.HostsConfig{
				Address:  status.Address,
				Hostname: hostname,
			}
		} else if hostname, ok := spec.Labels[dockerNetSpecLabelKey]; ok {

			// the docker_sandbox_container_id is the full ID of the pause
			// container, whereas we want the shortened name that dockerd sets
			// as the pause container's hostname.
			if len(hostname) > 12 {
				hostname = hostname[:12]
			}

			h.spec.HostsConfig = &drivers.HostsConfig{
				Address:  status.Address,
				Hostname: hostname,
			}
		}

		h.networkStatusSetter.SetNetworkStatus(status)
	}
	return nil
}
func (h *networkHook) Postrun() error {
|
|
|
|
// we need the spec for network teardown
|
|
if h.spec != nil {
|
|
if err := h.networkConfigurator.Teardown(context.TODO(), h.alloc, h.spec); err != nil {
|
|
h.logger.Error("failed to cleanup network for allocation, resources may have leaked", "alloc", h.alloc.ID, "error", err)
|
|
}
|
|
}
|
|
|
|
// issue driver destroy regardless if we have a spec (e.g. cleanup pause container)
|
|
return h.manager.DestroyNetwork(h.alloc.ID, h.spec)
|
|
}
|