Add network lifecycle management

Adds new Prerun and Postrun hooks to manage the setup of network namespaces
on Linux. Work still needs to be done to make the code platform agnostic and
to support Docker-style network initialization.
This commit is contained in:
Nick Ethier
2019-04-29 13:35:15 -04:00
parent bfe7841913
commit e20fa7ccc1
10 changed files with 1134 additions and 311 deletions

View File

@@ -7,8 +7,22 @@ import (
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)
// allocNetworkIsolationSetter is a narrow shim handed to the alloc network
// hook so that it can publish the alloc's network isolation configuration
// without being given the whole alloc runner.
type allocNetworkIsolationSetter struct {
	ar *allocRunner
}

// SetNetworkIsolation forwards the given isolation spec to every task runner
// tracked by the wrapped alloc runner.
func (a *allocNetworkIsolationSetter) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
	for _, runner := range a.ar.tasks {
		runner.SetNetworkIsolation(n)
	}
}
// allocHealthSetter is a shim to allow the alloc health watcher hook to set
// and clear the alloc health without full access to the alloc runner state
type allocHealthSetter struct {
@@ -82,6 +96,9 @@ func (ar *allocRunner) initRunnerHooks() {
// create health setting shim
hs := &allocHealthSetter{ar}
// determine how the network must be created
ns := &allocNetworkIsolationSetter{ar: ar}
// Create the alloc directory hook. This is run first to ensure the
// directory path exists for other hooks.
ar.runnerHooks = []interfaces.RunnerHook{
@@ -89,6 +106,7 @@ func (ar *allocRunner) initRunnerHooks() {
newUpstreamAllocsHook(hookLogger, ar.prevAllocWatcher),
newDiskMigrationHook(hookLogger, ar.prevAllocMigrator, ar.allocDir),
newAllocHealthWatcherHook(hookLogger, ar.Alloc(), hs, ar.Listener(), ar.consulClient),
newNetworkHook(ns, hookLogger, ar.Alloc(), &defaultNetworkManager{}),
}
}

View File

@@ -0,0 +1,234 @@
package allocrunner
import (
"crypto/rand"
"fmt"
"os"
"path"
"runtime"
"strings"
"sync"
"github.com/containernetworking/plugins/pkg/ns"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
"golang.org/x/sys/unix"
)
const (
// NsRunDir is the directory in which the per-allocation network namespace
// mount points are created (see netNSPath and newNS).
NsRunDir = "/var/run/netns"
)
// networkManager abstracts creation and teardown of an allocation's network
// so that the network hook does not depend on a concrete implementation.
type networkManager interface {
// CreateNetwork sets up the network for the allocation with the given ID
// and returns the isolation spec describing it.
CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, error)
// DestroyNetwork tears down the network described by spec for the
// allocation with the given ID.
DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSpec) error
}
// netNSPath returns the location under NsRunDir of the network namespace
// mount point for this runner's allocation.
func (ar *allocRunner) netNSPath() string {
	name := netNSName(ar.Alloc().ID)
	return path.Join(NsRunDir, name)
}
// netNSName builds the network namespace name for the given ID by prefixing
// it with "nomad-".
func netNSName(id string) string {
	name := fmt.Sprintf("nomad-%s", id)
	return name
}
// networkHook is the alloc runner hook that creates the allocation's network
// in Prerun and destroys it again in Postrun.
type networkHook struct {
// setter pushes the created isolation spec to the alloc's task runners.
setter *allocNetworkIsolationSetter
// manager creates and destroys the network.
manager networkManager
// alloc is the allocation the hook operates on.
alloc *structs.Allocation
// spec is the isolation spec returned by manager.CreateNetwork; it stays
// nil when Prerun does not create a network.
spec *drivers.NetworkIsolationSpec
// specLock is held by Prerun and Postrun while they read or write spec.
specLock sync.Mutex
logger hclog.Logger
}
// newNetworkHook builds a networkHook for alloc that creates and destroys
// its network through netManager and reports the resulting isolation spec
// through ns.
func newNetworkHook(ns *allocNetworkIsolationSetter, logger hclog.Logger, alloc *structs.Allocation, netManager networkManager) *networkHook {
	hook := new(networkHook)
	hook.setter = ns
	hook.alloc = alloc
	hook.manager = netManager
	hook.logger = logger
	return hook
}
// Name returns the name of the hook.
func (h *networkHook) Name() string {
	const hookName = "network"
	return hookName
}
// Prerun creates the allocation's network and hands the resulting isolation
// spec to the task runners. Nothing is created when the task group declares
// no networks or when its first network uses the "host" or empty mode.
func (h *networkHook) Prerun() error {
	h.specLock.Lock()
	defer h.specLock.Unlock()

	group := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
	if len(group.Networks) == 0 {
		return nil
	}

	// Only the first network of the group decides whether isolation is needed.
	switch group.Networks[0].Mode {
	case "host", "":
		return nil
	}

	created, err := h.manager.CreateNetwork(h.alloc.ID)
	if err != nil {
		return fmt.Errorf("failed to create network for alloc: %v", err)
	}

	// Remember the spec so Postrun can destroy the network later.
	h.spec = created
	h.setter.SetNetworkIsolation(created)
	return nil
}
// Postrun destroys the network created by Prerun. When no spec was recorded
// it only logs a debug message and returns nil.
func (h *networkHook) Postrun() error {
	h.specLock.Lock()
	defer h.specLock.Unlock()

	if h.spec != nil {
		return h.manager.DestroyNetwork(h.alloc.ID, h.spec)
	}

	h.logger.Debug("spec was nil")
	return nil
}
// defaultNetworkManager is a networkManager backed by persistent
// (bind-mounted) network namespaces created with newNS and removed with
// unmountNS.
type defaultNetworkManager struct{}

// CreateNetwork creates a network namespace for the allocation and returns a
// group-mode isolation spec whose Path points at the namespace mount point.
func (*defaultNetworkManager) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, error) {
	netns, err := newNS(allocID)
	if err != nil {
		return nil, err
	}

	spec := &drivers.NetworkIsolationSpec{
		Mode:   drivers.NetIsolationModeGroup,
		Path:   netns.Path(),
		Labels: make(map[string]string),
	}

	// The bind mount made by newNS keeps the namespace alive, so the handle
	// is no longer needed once its path is recorded. Close it here instead
	// of leaving the descriptor open until garbage collection. A close
	// failure is deliberately ignored: the namespace itself was created.
	_ = netns.Close()

	return spec, nil
}

// DestroyNetwork unmounts and removes the namespace referenced by spec.Path.
// A nil spec means there is nothing to destroy.
func (*defaultNetworkManager) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSpec) error {
	if spec == nil {
		return nil
	}
	return unmountNS(spec.Path)
}
// newNS creates a new persistent (bind-mounted) network namespace named after
// id under NsRunDir and returns an object representing that namespace,
// without switching the caller to it.
//
// The steps are: make sure NsRunDir exists and is a shared mountpoint, create
// an empty file to act as the mount point, then, on a dedicated locked OS
// thread, unshare a new network namespace and bind-mount it onto that file.
func newNS(id string) (ns.NetNS, error) {
// NOTE(review): b is filled with random bytes but never used afterwards; the
// namespace name below is derived from id, not from these bytes.
b := make([]byte, 16)
_, err := rand.Reader.Read(b)
if err != nil {
return nil, fmt.Errorf("failed to generate random netns name: %v", err)
}
// Create the directory for mounting network namespaces
// This needs to be a shared mountpoint in case it is mounted in to
// other namespaces (containers)
err = os.MkdirAll(NsRunDir, 0755)
if err != nil {
return nil, err
}
// Remount the namespace directory shared. This will fail if it is not
// already a mountpoint, so bind-mount it on to itself to "upgrade" it
// to a mountpoint.
err = unix.Mount("", NsRunDir, "none", unix.MS_SHARED|unix.MS_REC, "")
if err != nil {
// Only EINVAL triggers the bind-mount fallback below; any other error is
// returned to the caller.
if err != unix.EINVAL {
return nil, fmt.Errorf("mount --make-rshared %s failed: %q", NsRunDir, err)
}
// Recursively remount /var/run/netns on itself. The recursive flag is
// so that any existing netns bindmounts are carried over.
err = unix.Mount(NsRunDir, NsRunDir, "none", unix.MS_BIND|unix.MS_REC, "")
if err != nil {
return nil, fmt.Errorf("mount --rbind %s %s failed: %q", NsRunDir, NsRunDir, err)
}
// Now we can make it shared
err = unix.Mount("", NsRunDir, "none", unix.MS_SHARED|unix.MS_REC, "")
if err != nil {
return nil, fmt.Errorf("mount --make-rshared %s failed: %q", NsRunDir, err)
}
}
nsName := netNSName(id)
// create an empty file at the mount point
nsPath := path.Join(NsRunDir, nsName)
mountPointFd, err := os.Create(nsPath)
if err != nil {
return nil, err
}
mountPointFd.Close()
// Ensure the mount point is cleaned up on errors; if the namespace
// was successfully mounted this will have no effect because the file
// is in-use
defer os.RemoveAll(nsPath)
var wg sync.WaitGroup
wg.Add(1)
// do namespace work in a dedicated goroutine, so that we can safely
// Lock/Unlock OSThread without upsetting the lock/unlock state of
// the caller of this function
//
// The goroutine reports failure through the enclosing function's err
// variable, which is only read again after wg.Wait() returns.
go (func() {
defer wg.Done()
runtime.LockOSThread()
// Don't unlock. By not unlocking, golang will kill the OS thread when the
// goroutine is done (for go1.10+)
var origNS ns.NetNS
origNS, err = ns.GetNS(getCurrentThreadNetNSPath())
if err != nil {
return
}
defer origNS.Close()
// create a new netns on the current thread
err = unix.Unshare(unix.CLONE_NEWNET)
if err != nil {
return
}
// Put this thread back to the orig ns, since it might get reused (pre go1.10)
defer origNS.Set()
// bind mount the netns from the current thread (from /proc) onto the
// mount point. This causes the namespace to persist, even when there
// are no threads in the ns.
err = unix.Mount(getCurrentThreadNetNSPath(), nsPath, "none", unix.MS_BIND, "")
if err != nil {
err = fmt.Errorf("failed to bind mount ns at %s: %v", nsPath, err)
}
})()
wg.Wait()
if err != nil {
return nil, fmt.Errorf("failed to create namespace: %v", err)
}
// Open a handle on the now-persistent namespace through its mount point.
return ns.GetNS(nsPath)
}
// unmountNS unmounts the network namespace bind-mounted at nsPath and removes
// the mount point file. Paths that do not live inside NsRunDir are left
// untouched and nil is returned.
func unmountNS(nsPath string) error {
	// Only unmount if it's been bind-mounted beneath NsRunDir (don't touch
	// namespaces in /proc...). The trailing separator keeps NsRunDir itself
	// and sibling paths that merely share the prefix (e.g. NsRunDir + "-x")
	// from matching.
	if !strings.HasPrefix(nsPath, NsRunDir+"/") {
		return nil
	}

	if err := unix.Unmount(nsPath, 0); err != nil {
		return fmt.Errorf("failed to unmount NS: at %s: %v", nsPath, err)
	}

	if err := os.Remove(nsPath); err != nil {
		return fmt.Errorf("failed to remove ns path %s: %v", nsPath, err)
	}

	return nil
}
// getCurrentThreadNetNSPath (copied from pkg/ns) returns the /proc path of
// the network namespace of the calling OS thread.
func getCurrentThreadNetNSPath() string {
	// /proc/self/ns/net would report the main thread's namespace rather than
	// that of the thread this goroutine runs on, so address the thread
	// explicitly by pid and tid.
	pid := os.Getpid()
	tid := unix.Gettid()
	return fmt.Sprintf("/proc/%d/task/%d/ns/net", pid, tid)
}

View File

@@ -0,0 +1,28 @@
package allocrunner
import "github.com/hashicorp/nomad/nomad/structs"
// networkHook is a placeholder network hook: its Prerun performs no network
// setup.
// NOTE(review): presumably the variant used where namespace support is not
// implemented yet; confirm against the file's build constraints.
type networkHook struct {
	// alloc is the allocation the hook belongs to.
	alloc *structs.Allocation

	// networks holds the network resources inspected by Prerun.
	networks []*structs.NetworkResource
}

// newNetworkHook currently returns a nil hook, so the methods of networkHook
// must be safe to call on a nil receiver.
func newNetworkHook() *networkHook {
	return nil
}

// Name returns the name of the hook.
func (n *networkHook) Name() string {
	return "network"
}

// Prerun is a no-op: it returns nil for every network configuration.
func (n *networkHook) Prerun() error {
	// Guard the receiver before reading its fields: newNetworkHook returns
	// nil, and dereferencing it here would panic.
	if n == nil || len(n.networks) == 0 || n.networks[0].Mode == "host" {
		return nil
	}

	return nil
}

View File

@@ -202,6 +202,9 @@ type TaskRunner struct {
// fails and the Run method should wait until serversContactedCh is
// closed.
waitOnServers bool
networkIsolationLock sync.Mutex
networkIsolationSpec *drivers.NetworkIsolationSpec
}
type Config struct {
@@ -895,6 +898,8 @@ func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
invocationid := uuid.Generate()[:8]
taskResources := tr.taskResources
env := tr.envBuilder.Build()
tr.networkIsolationLock.Lock()
defer tr.networkIsolationLock.Unlock()
return &drivers.TaskConfig{
ID: fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
@@ -909,15 +914,16 @@ func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
PercentTicks: float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
},
},
Devices: tr.hookResources.getDevices(),
Mounts: tr.hookResources.getMounts(),
Env: env.Map(),
DeviceEnv: env.DeviceEnv(),
User: task.User,
AllocDir: tr.taskDir.AllocDir,
StdoutPath: tr.logmonHookConfig.stdoutFifo,
StderrPath: tr.logmonHookConfig.stderrFifo,
AllocID: tr.allocID,
Devices: tr.hookResources.getDevices(),
Mounts: tr.hookResources.getMounts(),
Env: env.Map(),
DeviceEnv: env.DeviceEnv(),
User: task.User,
AllocDir: tr.taskDir.AllocDir,
StdoutPath: tr.logmonHookConfig.stdoutFifo,
StderrPath: tr.logmonHookConfig.stderrFifo,
AllocID: tr.allocID,
NetworkIsolation: tr.networkIsolationSpec,
}
}
@@ -1181,6 +1187,14 @@ func (tr *TaskRunner) Update(update *structs.Allocation) {
}
}
// SetNetworkIsolation records the network isolation spec that
// buildTaskConfig passes to the driver. The PreRun allocation hook calls it
// once the allocation's network isolation has been configured.
func (tr *TaskRunner) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
	tr.networkIsolationLock.Lock()
	defer tr.networkIsolationLock.Unlock()

	tr.networkIsolationSpec = n
}
// triggerUpdate if there isn't already an update pending. Should be called
// instead of calling updateHooks directly to serialize runs of update hooks.
// TaskRunner state should be updated prior to triggering update hooks.

View File

@@ -79,6 +79,14 @@ type ExecOptions struct {
ResizeCh <-chan TerminalSize
}
// DriverNetworkManager is the interface which exposes functions for creating
// a network namespace that tasks can join. It only needs to be implemented
// if the driver MUST create the network namespace.
type DriverNetworkManager interface {
// CreateNetwork creates the network for the allocation with the given ID
// and returns the isolation spec describing it.
CreateNetwork(allocID string) (*NetworkIsolationSpec, error)
// DestroyNetwork destroys the network described by spec for the
// allocation with the given ID.
DestroyNetwork(allocID string, spec *NetworkIsolationSpec) error
}
// InternalDriverPlugin is an interface that exposes functions that are only
// implemented by internal driver plugins.
type InternalDriverPlugin interface {
@@ -148,6 +156,36 @@ type Capabilities struct {
//FSIsolation indicates what kind of filesystem isolation the driver supports.
FSIsolation FSIsolation
//NetIsolationModes lists the set of isolation modes supported by the driver
NetIsolationModes []NetIsolationMode
// MustInitiateNetwork tells Nomad that the driver must create the network
// namespace and that the CreateNetwork and DestroyNetwork RPCs are implemented.
MustInitiateNetwork bool
}
// NetIsolationMode names a network isolation mode.
type NetIsolationMode string
var (
// NetIsolationModeHost disables network isolation and uses the host network
NetIsolationModeHost = NetIsolationMode("host")
// NetIsolationModeGroup uses the group network namespace for isolation
NetIsolationModeGroup = NetIsolationMode("group")
// NetIsolationModeTask isolates the network to just the task
NetIsolationModeTask = NetIsolationMode("task")
// NetIsolationModeNone indicates that there is no network to isolate and is
// intended to be used for tasks that the client manages remotely
NetIsolationModeNone = NetIsolationMode("none")
)
// NetworkIsolationSpec describes the network isolation configuration that is
// handed to a task through TaskConfig.NetworkIsolation.
type NetworkIsolationSpec struct {
// Mode is the isolation mode in effect.
Mode NetIsolationMode
// Path is the path of the network namespace.
Path string
// Labels carries string key/value pairs alongside the spec.
// NOTE(review): intended use is not visible here — confirm with drivers.
Labels map[string]string
}
type TerminalSize struct {
@@ -156,21 +194,22 @@ type TerminalSize struct {
}
type TaskConfig struct {
ID string
JobName string
TaskGroupName string
Name string
Env map[string]string
DeviceEnv map[string]string
Resources *Resources
Devices []*DeviceConfig
Mounts []*MountConfig
User string
AllocDir string
rawDriverConfig []byte
StdoutPath string
StderrPath string
AllocID string
ID string
JobName string
TaskGroupName string
Name string
Env map[string]string
DeviceEnv map[string]string
Resources *Resources
Devices []*DeviceConfig
Mounts []*MountConfig
User string
AllocDir string
rawDriverConfig []byte
StdoutPath string
StderrPath string
AllocID string
NetworkIsolation *NetworkIsolationSpec
}
func (tc *TaskConfig) Copy() *TaskConfig {

File diff suppressed because it is too large Load Diff

View File

@@ -74,6 +74,14 @@ service Driver {
// ExecTaskStreaming executes a command inside the tasks execution context
// and streams back results
rpc ExecTaskStreaming(stream ExecTaskStreamingRequest) returns (stream ExecTaskStreamingResponse) {}
// CreateNetwork is implemented when the driver needs to create the network
// namespace instead of allowing the Nomad client to do so.
rpc CreateNetwork(CreateNetworkRequest) returns (CreateNetworkResponse) {}
// DestroyNetwork destroys a previously created network. This rpc is only
// implemented if the driver needs to manage network namespace creation.
rpc DestroyNetwork(DestroyNetworkRequest) returns (DestroyNetworkResponse) {}
}
message TaskConfigSchemaRequest {}
@@ -314,6 +322,27 @@ message ExecTaskStreamingResponse {
ExitResult result = 4;
}
// CreateNetworkRequest is the request message of the CreateNetwork RPC.
message CreateNetworkRequest {
// AllocID of the allocation the network is associated with
string alloc_id = 1;
}
// CreateNetworkResponse is the response message of the CreateNetwork RPC.
message CreateNetworkResponse {
// IsolationSpec describes the network that was created
NetworkIsolationSpec isolation_spec = 1;
}
// DestroyNetworkRequest is the request message of the DestroyNetwork RPC.
message DestroyNetworkRequest {
// AllocID of the allocation the network is associated with
string alloc_id = 1;
// IsolationSpec describes the network to destroy
NetworkIsolationSpec isolation_spec = 2;
}
// DestroyNetworkResponse is the (empty) response message of the
// DestroyNetwork RPC.
message DestroyNetworkResponse {}
message DriverCapabilities {
// SendSignals indicates that the driver can send process signals (ex. SIGUSR1)
@@ -331,6 +360,24 @@ message DriverCapabilities {
}
// FsIsolation indicates what kind of filesystem isolation a driver supports.
FSIsolation fs_isolation = 3;
repeated NetworkIsolationSpec.NetworkIsolationMode network_isolation_modes = 4;
bool must_create_network = 5;
}
// NetworkIsolationSpec describes the network isolation configuration of a
// task; it is carried in TaskConfig and in the network RPC messages.
message NetworkIsolationSpec {
// NetworkIsolationMode enumerates the isolation modes. HOST is the zero
// value and therefore the default when the field is unset.
enum NetworkIsolationMode {
HOST = 0;
GROUP = 1;
TASK = 2;
NONE = 3;
}
// Mode is the isolation mode in effect
NetworkIsolationMode mode = 1;
// Path is the path of the network namespace
string path = 2;
// Labels carries string key/value pairs alongside the spec
map<string,string> labels = 3;
}
message TaskConfig {
@@ -384,6 +431,10 @@ message TaskConfig {
// AllocId is the ID of the associated allocation
string alloc_id = 15;
// NetworkIsolationSpec specifies the configuration for the network namespace
// to use for the task. *Only supported on Linux
NetworkIsolationSpec network_isolation_spec = 16;
}
message Resources {

View File

@@ -39,8 +39,10 @@ func (b *driverPluginServer) Capabilities(ctx context.Context, req *proto.Capabi
}
resp := &proto.CapabilitiesResponse{
Capabilities: &proto.DriverCapabilities{
SendSignals: caps.SendSignals,
Exec: caps.Exec,
SendSignals: caps.SendSignals,
Exec: caps.Exec,
MustCreateNetwork: caps.MustInitiateNetwork,
NetworkIsolationModes: []proto.NetworkIsolationSpec_NetworkIsolationMode{},
},
}
@@ -54,6 +56,10 @@ func (b *driverPluginServer) Capabilities(ctx context.Context, req *proto.Capabi
default:
resp.Capabilities.FsIsolation = proto.DriverCapabilities_NONE
}
for _, mode := range caps.NetIsolationModes {
resp.Capabilities.NetworkIsolationModes = append(resp.Capabilities.NetworkIsolationModes, netIsolationModeToProto(mode))
}
return resp, nil
}
@@ -374,3 +380,11 @@ func (b *driverPluginServer) TaskEvents(req *proto.TaskEventsRequest, srv proto.
}
return nil
}
// CreateNetwork is currently a stub: it ignores the request and returns a
// nil response together with a nil error.
func (b *driverPluginServer) CreateNetwork(ctx context.Context, req *proto.CreateNetworkRequest) (*proto.CreateNetworkResponse, error) {
	var resp *proto.CreateNetworkResponse
	return resp, nil
}
// DestroyNetwork is currently a stub: it ignores the request and returns a
// nil response together with a nil error.
func (b *driverPluginServer) DestroyNetwork(ctx context.Context, req *proto.DestroyNetworkRequest) (*proto.DestroyNetworkResponse, error) {
	var resp *proto.DestroyNetworkResponse
	return resp, nil
}

View File

@@ -571,3 +571,55 @@ func memoryUsageMeasuredFieldsFromProto(fields []proto.MemoryUsage_Fields) []str
return r
}
// netIsolationModeToProto maps a NetIsolationMode onto its protobuf enum
// value. The host mode and any unrecognized mode map to HOST.
func netIsolationModeToProto(mode NetIsolationMode) proto.NetworkIsolationSpec_NetworkIsolationMode {
	switch mode {
	case NetIsolationModeGroup:
		return proto.NetworkIsolationSpec_GROUP
	case NetIsolationModeTask:
		return proto.NetworkIsolationSpec_TASK
	case NetIsolationModeNone:
		return proto.NetworkIsolationSpec_NONE
	}

	// NetIsolationModeHost and everything unknown.
	return proto.NetworkIsolationSpec_HOST
}
// netIsolationModeFromProto maps a protobuf isolation mode enum value onto
// the corresponding NetIsolationMode. HOST and any unrecognized value map to
// NetIsolationModeHost.
func netIsolationModeFromProto(pb proto.NetworkIsolationSpec_NetworkIsolationMode) NetIsolationMode {
	switch pb {
	case proto.NetworkIsolationSpec_GROUP:
		return NetIsolationModeGroup
	case proto.NetworkIsolationSpec_TASK:
		return NetIsolationModeTask
	case proto.NetworkIsolationSpec_NONE:
		return NetIsolationModeNone
	}

	// proto.NetworkIsolationSpec_HOST and everything unknown.
	return NetIsolationModeHost
}
// NetworkIsolationSpecToProto converts spec into its protobuf form. A nil
// spec yields nil. The Labels map is shared with the input, not copied.
func NetworkIsolationSpecToProto(spec *NetworkIsolationSpec) *proto.NetworkIsolationSpec {
	if spec == nil {
		return nil
	}

	pb := new(proto.NetworkIsolationSpec)
	pb.Mode = netIsolationModeToProto(spec.Mode)
	pb.Path = spec.Path
	pb.Labels = spec.Labels
	return pb
}
// NetworkIsolationSpecFromProto converts the protobuf form pb back into a
// NetworkIsolationSpec. A nil message yields nil. The Labels map is shared
// with the input, not copied.
func NetworkIsolationSpecFromProto(pb *proto.NetworkIsolationSpec) *NetworkIsolationSpec {
	if pb == nil {
		return nil
	}

	spec := new(NetworkIsolationSpec)
	spec.Mode = netIsolationModeFromProto(pb.Mode)
	spec.Path = pb.Path
	spec.Labels = pb.Labels
	return spec
}

View File

@@ -351,6 +351,11 @@ func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
return true
}
// Check that the network resources haven't changed
if networkUpdated(a.Networks, b.Networks) {
return true
}
// Check each task
for _, at := range a.Tasks {
bt := b.LookupTask(at.Name)
@@ -387,22 +392,9 @@ func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
}
// Inspect the network to see if the dynamic ports are different
if len(at.Resources.Networks) != len(bt.Resources.Networks) {
if networkUpdated(at.Resources.Networks, at.Resources.Networks) {
return true
}
for idx := range at.Resources.Networks {
an := at.Resources.Networks[idx]
bn := bt.Resources.Networks[idx]
if an.MBits != bn.MBits {
return true
}
aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
if !reflect.DeepEqual(aPorts, bPorts) {
return true
}
}
// Inspect the non-network resources
if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
@@ -414,6 +406,26 @@ func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
return false
}
// networkUpdated reports whether the two lists of network resources differ
// in a way that requires the tasks to be updated. The lists are compared
// position by position; a difference in length, network mode, MBits or in
// the port map produced by networkPortMap counts as an update.
func networkUpdated(netA, netB []*structs.NetworkResource) bool {
	if len(netA) != len(netB) {
		return true
	}

	for idx := range netA {
		an := netA[idx]
		bn := netB[idx]

		// The mode decides whether a network is created for the allocation
		// at all, so a mode change must be treated as an update too.
		if an.Mode != bn.Mode {
			return true
		}

		if an.MBits != bn.MBits {
			return true
		}

		aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
		if !reflect.DeepEqual(aPorts, bPorts) {
			return true
		}
	}

	return false
}
// networkPortMap takes a network resource and returns a map of port labels to
// values. The value for dynamic ports is disregarded even if it is set. This
// makes this function suitable for comparing two network resources for changes.