diff --git a/CHANGELOG.md b/CHANGELOG.md index 786380eef..118e0cfc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.2.0 (Unreleased) + +FEATURES: + + * Blocking queries supported in API [GH-366] + ## 0.1.2 (October 6, 2015) IMPROVEMENTS: diff --git a/api/compose_test.go b/api/compose_test.go index 68801519f..2a509bc55 100644 --- a/api/compose_test.go +++ b/api/compose_test.go @@ -69,6 +69,7 @@ func TestCompose(t *testing.T) { Operand: "=", }, }, + RestartPolicy: NewRestartPolicy(), Tasks: []*Task{ &Task{ Name: "task1", diff --git a/api/tasks.go b/api/tasks.go index c1d5bf2ff..2535d5ec5 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -1,19 +1,42 @@ package api +import ( + "time" +) + +// RestartPolicy defines how the Nomad client restarts +// tasks in a taskgroup when they fail +type RestartPolicy struct { + Interval time.Duration + Attempts int + Delay time.Duration +} + +func NewRestartPolicy() *RestartPolicy { + return &RestartPolicy{ + Attempts: 10, + Interval: 3 * time.Minute, + Delay: 5 * time.Second, + } +} + // TaskGroup is the unit of scheduling. type TaskGroup struct { - Name string - Count int - Constraints []*Constraint - Tasks []*Task - Meta map[string]string + Name string + Count int + Constraints []*Constraint + Tasks []*Task + RestartPolicy *RestartPolicy + Meta map[string]string } // NewTaskGroup creates a new TaskGroup. func NewTaskGroup(name string, count int) *TaskGroup { + restartPolicy := NewRestartPolicy() return &TaskGroup{ - Name: name, - Count: count, + Name: name, + Count: count, + RestartPolicy: restartPolicy, } } diff --git a/api/tasks_test.go b/api/tasks_test.go index 877f84d5c..945fdf9bf 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -8,8 +8,9 @@ import ( func TestTaskGroup_NewTaskGroup(t *testing.T) { grp := NewTaskGroup("grp1", 2) expect := &TaskGroup{ - Name: "grp1", - Count: 2, + Name: "grp1", + Count: 2, + RestartPolicy: NewRestartPolicy(), } if !reflect.DeepEqual(grp, expect) { t.Fatalf("expect: %#v, got: %#v", expect, grp) diff --git a/client/driver/exec.go b/client/driver/exec.go index 0324cad68..e48604894 100644 --- a/client/driver/exec.go +++ b/client/driver/exec.go @@ -12,7 +12,7 @@ import ( "github.com/hashicorp/go-getter" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/executor" + "github.com/hashicorp/nomad/client/driver/executor" "github.com/hashicorp/nomad/nomad/structs" ) @@ -35,8 +35,11 @@ func NewExecDriver(ctx *DriverContext) Driver { } func (d *ExecDriver) Fingerprint(cfg *config.Config, node *structs.Node) (bool, error) { - // Only enable if we are root when running on non-windows systems. - if runtime.GOOS != "windows" && syscall.Geteuid() != 0 { + // Only enable if we are root on linux. 
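For orientation on the api changes above: a minimal sketch of how a consumer of the api package could use the new restart policy, assuming the types exactly as defined in this diff (the group name, count, and override values are purely illustrative):

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// NewTaskGroup now pre-populates the default policy:
	// 10 attempts per 3-minute interval, with a 5-second delay.
	grp := api.NewTaskGroup("cache", 3)

	// A job author can still override it per task group.
	grp.RestartPolicy = &api.RestartPolicy{
		Attempts: 2,
		Interval: 1 * time.Minute,
		Delay:    10 * time.Second,
	}

	fmt.Printf("restart policy: %+v\n", grp.RestartPolicy)
}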
+ if runtime.GOOS != "linux" { + d.logger.Printf("[DEBUG] driver.exec: only available on linux, disabling") + return false, nil + } else if syscall.Geteuid() != 0 { d.logger.Printf("[DEBUG] driver.exec: must run as root user, disabling") return false, nil } @@ -73,10 +76,8 @@ func (d *ExecDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, } // Add execution permissions to the newly downloaded artifact - if runtime.GOOS != "windows" { - if err := syscall.Chmod(artifactFile, 0755); err != nil { - log.Printf("[ERR] driver.Exec: Error making artifact executable: %s", err) - } + if err := syscall.Chmod(artifactFile, 0755); err != nil { + log.Printf("[ERR] driver.exec: Error making artifact executable: %s", err) } } diff --git a/client/driver/exec_test.go b/client/driver/exec_test.go index ba8745176..1bb4adf36 100644 --- a/client/driver/exec_test.go +++ b/client/driver/exec_test.go @@ -5,7 +5,6 @@ import ( "io/ioutil" "path/filepath" "reflect" - "runtime" "testing" "time" @@ -123,13 +122,7 @@ func TestExecDriver_Start_Wait(t *testing.T) { func TestExecDriver_Start_Artifact_basic(t *testing.T) { ctestutils.ExecCompatible(t) - var file string - switch runtime.GOOS { - case "darwin": - file = "hi_darwin_amd64" - default: - file = "hi_linux_amd64" - } + file := "hi_linux_amd64" task := &structs.Task{ Name: "sleep", @@ -172,13 +165,7 @@ func TestExecDriver_Start_Artifact_basic(t *testing.T) { func TestExecDriver_Start_Artifact_expanded(t *testing.T) { ctestutils.ExecCompatible(t) - var file string - switch runtime.GOOS { - case "darwin": - file = "hi_darwin_amd64" - default: - file = "hi_linux_amd64" - } + file := "hi_linux_amd64" task := &structs.Task{ Name: "sleep", @@ -306,7 +293,7 @@ func TestExecDriver_Start_Kill_Wait(t *testing.T) { if err == nil { t.Fatal("should err") } - case <-time.After(2 * time.Second): + case <-time.After(8 * time.Second): t.Fatalf("timeout") } } diff --git a/client/executor/exec.go b/client/driver/executor/exec.go similarity index 100% rename from client/executor/exec.go rename to client/driver/executor/exec.go diff --git a/client/executor/exec_universal.go b/client/driver/executor/exec_basic.go similarity index 72% rename from client/executor/exec_universal.go rename to client/driver/executor/exec_basic.go index 6b1977d10..81f17d414 100644 --- a/client/executor/exec_universal.go +++ b/client/driver/executor/exec_basic.go @@ -1,5 +1,3 @@ -// +build !linux - package executor import ( @@ -14,24 +12,26 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -func NewExecutor() Executor { - return &UniversalExecutor{} -} - -// UniversalExecutor should work everywhere, and as a result does not include +// BasicExecutor should work everywhere, and as a result does not include // any resource restrictions or runas capabilities. -type UniversalExecutor struct { +type BasicExecutor struct { cmd } -func (e *UniversalExecutor) Limit(resources *structs.Resources) error { +// TODO: Update to use the Spawner. +// TODO: Have raw_exec use this as well. 
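BasicExecutor and LinuxExecutor both satisfy the Executor contract the drivers call into. A rough sketch of that call sequence, using only methods that appear in this diff; the Command constructor and its variadic signature are inferred from the tests further down, and the resources/alloc values are placeholders a real driver would supply:

package main

import (
	"log"

	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/driver/executor"
	"github.com/hashicorp/nomad/nomad/structs"
)

// runTask sketches the order a driver drives an Executor: apply resource
// limits, prepare the task directory (a chroot on linux), start the command,
// then block until it exits.
func runTask(taskName string, alloc *allocdir.AllocDir, res *structs.Resources) error {
	e := executor.Command("/bin/echo", "hello")

	if err := e.Limit(res); err != nil {
		return err
	}
	if err := e.ConfigureTaskDir(taskName, alloc); err != nil {
		return err
	}
	if err := e.Start(); err != nil {
		return err
	}
	return e.Wait()
}

func main() {
	log.Println("sketch only; a real driver passes its allocation's dir and resources")
}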
+func NewBasicExecutor() Executor { + return &BasicExecutor{} +} + +func (e *BasicExecutor) Limit(resources *structs.Resources) error { if resources == nil { return errNoResources } return nil } -func (e *UniversalExecutor) ConfigureTaskDir(taskName string, alloc *allocdir.AllocDir) error { +func (e *BasicExecutor) ConfigureTaskDir(taskName string, alloc *allocdir.AllocDir) error { taskDir, ok := alloc.TaskDirs[taskName] if !ok { return fmt.Errorf("Error finding task dir for (%s)", taskName) @@ -40,7 +40,7 @@ func (e *UniversalExecutor) ConfigureTaskDir(taskName string, alloc *allocdir.Al return nil } -func (e *UniversalExecutor) Start() error { +func (e *BasicExecutor) Start() error { // Parse the commands arguments and replace instances of Nomad environment // variables. envVars, err := environment.ParseFromList(e.cmd.Env) @@ -67,7 +67,7 @@ func (e *UniversalExecutor) Start() error { return e.cmd.Start() } -func (e *UniversalExecutor) Open(pid string) error { +func (e *BasicExecutor) Open(pid string) error { pidNum, err := strconv.Atoi(pid) if err != nil { return fmt.Errorf("Failed to parse pid %v: %v", pid, err) @@ -81,12 +81,12 @@ func (e *UniversalExecutor) Open(pid string) error { return nil } -func (e *UniversalExecutor) Wait() error { +func (e *BasicExecutor) Wait() error { // We don't want to call ourself. We want to call Start on our embedded Cmd return e.cmd.Wait() } -func (e *UniversalExecutor) ID() (string, error) { +func (e *BasicExecutor) ID() (string, error) { if e.cmd.Process != nil { return strconv.Itoa(e.cmd.Process.Pid), nil } else { @@ -94,14 +94,14 @@ func (e *UniversalExecutor) ID() (string, error) { } } -func (e *UniversalExecutor) Shutdown() error { +func (e *BasicExecutor) Shutdown() error { return e.ForceStop() } -func (e *UniversalExecutor) ForceStop() error { +func (e *BasicExecutor) ForceStop() error { return e.Process.Kill() } -func (e *UniversalExecutor) Command() *cmd { +func (e *BasicExecutor) Command() *cmd { return &e.cmd } diff --git a/client/driver/executor/exec_linux.go b/client/driver/executor/exec_linux.go new file mode 100644 index 000000000..1b4b312bf --- /dev/null +++ b/client/driver/executor/exec_linux.go @@ -0,0 +1,422 @@ +package executor + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "os" + "os/user" + "path/filepath" + "strings" + "syscall" + + "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/driver/args" + "github.com/hashicorp/nomad/client/driver/environment" + "github.com/hashicorp/nomad/client/driver/spawn" + "github.com/hashicorp/nomad/nomad/structs" + + "github.com/opencontainers/runc/libcontainer/cgroups" + cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + cgroupConfig "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + // A mapping of directories on the host OS to attempt to embed inside each + // task's chroot. + chrootEnv = map[string]string{ + "/bin": "/bin", + "/etc": "/etc", + "/lib": "/lib", + "/lib32": "/lib32", + "/lib64": "/lib64", + "/usr/bin": "/usr/bin", + "/usr/lib": "/usr/lib", + } +) + +func NewExecutor() Executor { + return &LinuxExecutor{} +} + +// Linux executor is designed to run on linux kernel 2.8+. +type LinuxExecutor struct { + cmd + user *user.User + + // Isolation configurations. + groups *cgroupConfig.Cgroup + taskName string + taskDir string + allocDir string + + // Spawn process. 
+ spawn *spawn.Spawner +} + +func (e *LinuxExecutor) Command() *cmd { + return &e.cmd +} + +func (e *LinuxExecutor) Limit(resources *structs.Resources) error { + if resources == nil { + return errNoResources + } + + return e.configureCgroups(resources) +} + +// execLinuxID contains the necessary information to reattach to an executed +// process and cleanup the created cgroups. +type ExecLinuxID struct { + Groups *cgroupConfig.Cgroup + Spawn *spawn.Spawner + TaskDir string +} + +func (e *LinuxExecutor) Open(id string) error { + // De-serialize the ID. + dec := json.NewDecoder(strings.NewReader(id)) + var execID ExecLinuxID + if err := dec.Decode(&execID); err != nil { + return fmt.Errorf("Failed to parse id: %v", err) + } + + // Setup the executor. + e.groups = execID.Groups + e.spawn = execID.Spawn + e.taskDir = execID.TaskDir + + return nil +} + +func (e *LinuxExecutor) ID() (string, error) { + if e.groups == nil || e.spawn == nil || e.taskDir == "" { + return "", fmt.Errorf("LinuxExecutor not properly initialized.") + } + + // Build the ID. + id := ExecLinuxID{ + Groups: e.groups, + Spawn: e.spawn, + TaskDir: e.taskDir, + } + + var buffer bytes.Buffer + enc := json.NewEncoder(&buffer) + if err := enc.Encode(id); err != nil { + return "", fmt.Errorf("Failed to serialize id: %v", err) + } + + return buffer.String(), nil +} + +// runAs takes a user id as a string and looks up the user. It stores the +// results in the executor and returns an error if the user could not be found. +func (e *LinuxExecutor) runAs(userid string) error { + errs := new(multierror.Error) + + // First, try to lookup the user by uid + u, err := user.LookupId(userid) + if err == nil { + e.user = u + return nil + } else { + errs = multierror.Append(errs, err) + } + + // Lookup failed, so try by username instead + u, err = user.Lookup(userid) + if err == nil { + e.user = u + return nil + } else { + errs = multierror.Append(errs, err) + } + + // If we got here we failed to lookup based on id and username, so we'll + // return those errors. + return fmt.Errorf("Failed to identify user to run as: %s", errs) +} + +func (e *LinuxExecutor) Start() error { + // Run as "nobody" user so we don't leak root privilege to the spawned + // process. + if err := e.runAs("nobody"); err == nil && e.user != nil { + e.cmd.SetUID(e.user.Uid) + e.cmd.SetGID(e.user.Gid) + } + + // Parse the commands arguments and replace instances of Nomad environment + // variables. + envVars, err := environment.ParseFromList(e.Cmd.Env) + if err != nil { + return err + } + + parsedPath, err := args.ParseAndReplace(e.cmd.Path, envVars.Map()) + if err != nil { + return err + } else if len(parsedPath) != 1 { + return fmt.Errorf("couldn't properly parse command path: %v", e.cmd.Path) + } + e.cmd.Path = parsedPath[0] + + combined := strings.Join(e.Cmd.Args, " ") + parsed, err := args.ParseAndReplace(combined, envVars.Map()) + if err != nil { + return err + } + e.Cmd.Args = parsed + + spawnState := filepath.Join(e.allocDir, fmt.Sprintf("%s_%s", e.taskName, "exit_status")) + e.spawn = spawn.NewSpawner(spawnState) + e.spawn.SetCommand(&e.cmd.Cmd) + e.spawn.SetChroot(e.taskDir) + e.spawn.SetLogs(&spawn.Logs{ + Stdout: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stdout", e.taskName)), + Stderr: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stderr", e.taskName)), + Stdin: os.DevNull, + }) + + enterCgroup := func(pid int) error { + // Join the spawn-daemon to the cgroup. 
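The ExecLinuxID handle above is what lets a restarted client find a task again: ID() serializes the cgroup config, spawner, and task dir, and Open() restores them onto a fresh executor. A hedged sketch of that round trip, mirroring the rewritten TestExecutorLinux_Open later in this diff; where the id string is stored between restarts is left to the caller:

package main

import (
	"log"

	"github.com/hashicorp/nomad/client/driver/executor"
)

// reattach rebuilds an executor from the handle a previous Nomad process
// stored before it restarted (the string returned by Executor.ID()).
func reattach(id string) error {
	e := executor.NewExecutor()

	// Open decodes the ExecLinuxID JSON and restores groups, spawn and taskDir.
	if err := e.Open(id); err != nil {
		return err
	}

	// Wait blocks on the spawn-daemon's exit-status file, then cleans up the
	// cgroup and the mounted task directory.
	return e.Wait()
}

func main() {
	log.Println("sketch only; id must come from a prior Executor.ID() call")
}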
+ manager := e.getCgroupManager(e.groups) + + // Apply will place the spawn dameon into the created cgroups. + if err := manager.Apply(pid); err != nil { + return fmt.Errorf("Failed to join spawn-daemon to the cgroup (%+v): %v", e.groups, err) + } + + return nil + } + + return e.spawn.Spawn(enterCgroup) +} + +// Wait waits til the user process exits and returns an error on non-zero exit +// codes. Wait also cleans up the task directory and created cgroups. +func (e *LinuxExecutor) Wait() error { + errs := new(multierror.Error) + code, err := e.spawn.Wait() + if err != nil { + errs = multierror.Append(errs, err) + } + + if code != 0 { + errs = multierror.Append(errs, fmt.Errorf("Task exited with code: %d", code)) + } + + if err := e.destroyCgroup(); err != nil { + errs = multierror.Append(errs, err) + } + + if err := e.cleanTaskDir(); err != nil { + errs = multierror.Append(errs, err) + } + + return errs.ErrorOrNil() +} + +func (e *LinuxExecutor) Shutdown() error { + return e.ForceStop() +} + +// ForceStop immediately exits the user process and cleans up both the task +// directory and the cgroups. +func (e *LinuxExecutor) ForceStop() error { + errs := new(multierror.Error) + if err := e.destroyCgroup(); err != nil { + errs = multierror.Append(errs, err) + } + + if err := e.cleanTaskDir(); err != nil { + errs = multierror.Append(errs, err) + } + + return errs.ErrorOrNil() +} + +// Task Directory related functions. + +// ConfigureTaskDir creates the necessary directory structure for a proper +// chroot. cleanTaskDir should be called after. +func (e *LinuxExecutor) ConfigureTaskDir(taskName string, alloc *allocdir.AllocDir) error { + e.taskName = taskName + e.allocDir = alloc.AllocDir + + taskDir, ok := alloc.TaskDirs[taskName] + if !ok { + fmt.Errorf("Couldn't find task directory for task %v", taskName) + } + e.taskDir = taskDir + + if err := alloc.MountSharedDir(taskName); err != nil { + return err + } + + if err := alloc.Embed(taskName, chrootEnv); err != nil { + return err + } + + // Mount dev + dev := filepath.Join(taskDir, "dev") + if err := os.Mkdir(dev, 0777); err != nil { + return fmt.Errorf("Mkdir(%v) failed: %v", dev, err) + } + + if err := syscall.Mount("", dev, "devtmpfs", syscall.MS_RDONLY, ""); err != nil { + return fmt.Errorf("Couldn't mount /dev to %v: %v", dev, err) + } + + // Mount proc + proc := filepath.Join(taskDir, "proc") + if err := os.Mkdir(proc, 0777); err != nil { + return fmt.Errorf("Mkdir(%v) failed: %v", proc, err) + } + + if err := syscall.Mount("", proc, "proc", syscall.MS_RDONLY, ""); err != nil { + return fmt.Errorf("Couldn't mount /proc to %v: %v", proc, err) + } + + // Set the tasks AllocDir environment variable. + env, err := environment.ParseFromList(e.Cmd.Env) + if err != nil { + return err + } + env.SetAllocDir(filepath.Join("/", allocdir.SharedAllocName)) + env.SetTaskLocalDir(filepath.Join("/", allocdir.TaskLocal)) + e.Cmd.Env = env.List() + + return nil +} + +// pathExists is a helper function to check if the path exists. +func (e *LinuxExecutor) pathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + return false + } + } + return true +} + +// cleanTaskDir is an idempotent operation to clean the task directory and +// should be called when tearing down the task. +func (e *LinuxExecutor) cleanTaskDir() error { + // Unmount dev. 
+ errs := new(multierror.Error) + dev := filepath.Join(e.taskDir, "dev") + if e.pathExists(dev) { + if err := syscall.Unmount(dev, 0); err != nil { + errs = multierror.Append(errs, fmt.Errorf("Failed to unmount dev (%v): %v", dev, err)) + } + + if err := os.RemoveAll(dev); err != nil { + errs = multierror.Append(errs, fmt.Errorf("Failed to delete dev directory (%v): %v", dev, err)) + } + } + + // Unmount proc. + proc := filepath.Join(e.taskDir, "proc") + if e.pathExists(proc) { + if err := syscall.Unmount(proc, 0); err != nil { + errs = multierror.Append(errs, fmt.Errorf("Failed to unmount proc (%v): %v", proc, err)) + } + + if err := os.RemoveAll(proc); err != nil { + errs = multierror.Append(errs, fmt.Errorf("Failed to delete proc directory (%v): %v", dev, err)) + } + } + + return errs.ErrorOrNil() +} + +// Cgroup related functions. + +// configureCgroups converts a Nomad Resources specification into the equivalent +// cgroup configuration. It returns an error if the resources are invalid. +func (e *LinuxExecutor) configureCgroups(resources *structs.Resources) error { + e.groups = &cgroupConfig.Cgroup{} + e.groups.Name = structs.GenerateUUID() + + // TODO: verify this is needed for things like network access + e.groups.AllowAllDevices = true + + if resources.MemoryMB > 0 { + // Total amount of memory allowed to consume + e.groups.Memory = int64(resources.MemoryMB * 1024 * 1024) + // Disable swap to avoid issues on the machine + e.groups.MemorySwap = int64(-1) + } + + if resources.CPU < 2 { + return fmt.Errorf("resources.CPU must be equal to or greater than 2: %v", resources.CPU) + } + + // Set the relative CPU shares for this cgroup. + e.groups.CpuShares = int64(resources.CPU) + + if resources.IOPS != 0 { + // Validate it is in an acceptable range. + if resources.IOPS < 10 || resources.IOPS > 1000 { + return fmt.Errorf("resources.IOPS must be between 10 and 1000: %d", resources.IOPS) + } + + e.groups.BlkioWeight = uint16(resources.IOPS) + } + + return nil +} + +// destroyCgroup kills all processes in the cgroup and removes the cgroup +// configuration from the host. +func (e *LinuxExecutor) destroyCgroup() error { + if e.groups == nil { + return errors.New("Can't destroy: cgroup configuration empty") + } + + manager := e.getCgroupManager(e.groups) + pids, err := manager.GetPids() + if err != nil { + return fmt.Errorf("Failed to get pids in the cgroup %v: %v", e.groups.Name, err) + } + + errs := new(multierror.Error) + for _, pid := range pids { + process, err := os.FindProcess(pid) + if err != nil { + multierror.Append(errs, fmt.Errorf("Failed to find Pid %v: %v", pid, err)) + continue + } + + if err := process.Kill(); err != nil { + multierror.Append(errs, fmt.Errorf("Failed to kill Pid %v: %v", pid, err)) + continue + } + } + + // Remove the cgroup. + if err := manager.Destroy(); err != nil { + multierror.Append(errs, fmt.Errorf("Failed to delete the cgroup directories: %v", err)) + } + + if len(errs.Errors) != 0 { + return fmt.Errorf("Failed to destroy cgroup: %v", errs) + } + + return nil +} + +// getCgroupManager returns the correct libcontainer cgroup manager. 
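configureCgroups maps a Nomad Resources block onto libcontainer's cgroup configuration. A small worked example of that mapping, assuming the conversion rules exactly as written above (the resource numbers are arbitrary):

package main

import "fmt"

// Resource numbers as a Nomad job would declare them.
const (
	memoryMB = 256 // MB
	cpu      = 500 // MHz, used directly as cpu.shares
	iops     = 100 // becomes the blkio weight (must be 10..1000)
)

func main() {
	memoryBytes := int64(memoryMB * 1024 * 1024) // memory.limit_in_bytes
	memorySwap := int64(-1)                      // swap disabled
	cpuShares := int64(cpu)                      // cpu.shares
	blkioWeight := uint16(iops)                  // blkio.weight

	fmt.Println(memoryBytes, memorySwap, cpuShares, blkioWeight)
	// 268435456 -1 500 100
}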
+func (e *LinuxExecutor) getCgroupManager(groups *cgroupConfig.Cgroup) cgroups.Manager { + var manager cgroups.Manager + manager = &cgroupFs.Manager{Cgroups: groups} + if systemd.UseSystemd() { + manager = &systemd.Manager{Cgroups: groups} + } + return manager +} diff --git a/client/executor/exec_linux_test.go b/client/driver/executor/exec_linux_test.go similarity index 87% rename from client/executor/exec_linux_test.go rename to client/driver/executor/exec_linux_test.go index 8f33b0da4..1b8307b02 100644 --- a/client/executor/exec_linux_test.go +++ b/client/driver/executor/exec_linux_test.go @@ -139,11 +139,6 @@ func TestExecutorLinux_Start_Kill(t *testing.T) { filePath := filepath.Join(taskDir, "output") e := Command("/bin/bash", "-c", "sleep 1 ; echo \"failure\" > "+filePath) - // This test can only be run if cgroups are enabled. - if !e.(*LinuxExecutor).cgroupEnabled { - t.SkipNow() - } - if err := e.Limit(constraint); err != nil { t.Fatalf("Limit() failed: %v", err) } @@ -178,13 +173,11 @@ func TestExecutorLinux_Open(t *testing.T) { t.Fatalf("No task directory found for task %v", task) } - filePath := filepath.Join(taskDir, "output") - e := Command("/bin/bash", "-c", "sleep 1 ; echo \"failure\" > "+filePath) - - // This test can only be run if cgroups are enabled. - if !e.(*LinuxExecutor).cgroupEnabled { - t.SkipNow() - } + expected := "hello world" + file := filepath.Join(allocdir.TaskLocal, "output.txt") + absFilePath := filepath.Join(taskDir, file) + cmd := fmt.Sprintf(`"%v \"%v\" > %v"`, "/bin/sleep 1 ; echo -n", expected, file) + e := Command("/bin/bash", "-c", cmd) if err := e.Limit(constraint); err != nil { t.Fatalf("Limit() failed: %v", err) @@ -203,14 +196,22 @@ func TestExecutorLinux_Open(t *testing.T) { t.Fatalf("ID() failed: %v", err) } - if _, err := OpenId(id); err == nil { - t.Fatalf("Open(%v) should have failed", id) + e2 := NewExecutor() + if err := e2.Open(id); err != nil { + t.Fatalf("Open(%v) failed: %v", id, err) } - time.Sleep(1500 * time.Millisecond) + if err := e2.Wait(); err != nil { + t.Fatalf("Wait() failed: %v", err) + } - // Check that the file doesn't exist, open should have killed the process. 
- if _, err := os.Stat(filePath); err == nil { - t.Fatalf("Stat(%v) should have failed: task not killed", filePath) + output, err := ioutil.ReadFile(absFilePath) + if err != nil { + t.Fatalf("Couldn't read file %v", absFilePath) + } + + act := string(output) + if act != expected { + t.Fatalf("Command output incorrectly: want %v; got %v", expected, act) } } diff --git a/client/driver/executor/exec_universal.go b/client/driver/executor/exec_universal.go new file mode 100644 index 000000000..318faea4b --- /dev/null +++ b/client/driver/executor/exec_universal.go @@ -0,0 +1,12 @@ +// +build !linux + +package executor + +func NewExecutor() Executor { + return &UniversalExecutor{BasicExecutor{}} +} + +// UniversalExecutor wraps the BasicExecutor +type UniversalExecutor struct { + BasicExecutor +} diff --git a/client/executor/setuid.go b/client/driver/executor/setuid.go similarity index 100% rename from client/executor/setuid.go rename to client/driver/executor/setuid.go diff --git a/client/executor/setuid_windows.go b/client/driver/executor/setuid_windows.go similarity index 100% rename from client/executor/setuid_windows.go rename to client/driver/executor/setuid_windows.go diff --git a/client/driver/java.go b/client/driver/java.go index ac2c3c6f3..e7563f6e2 100644 --- a/client/driver/java.go +++ b/client/driver/java.go @@ -14,7 +14,7 @@ import ( "github.com/hashicorp/go-getter" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/executor" + "github.com/hashicorp/nomad/client/driver/executor" "github.com/hashicorp/nomad/nomad/structs" ) @@ -38,8 +38,8 @@ func NewJavaDriver(ctx *DriverContext) Driver { func (d *JavaDriver) Fingerprint(cfg *config.Config, node *structs.Node) (bool, error) { // Only enable if we are root when running on non-windows systems. - if runtime.GOOS != "windows" && syscall.Geteuid() != 0 { - d.logger.Printf("[DEBUG] driver.java: must run as root user, disabling") + if runtime.GOOS == "linux" && syscall.Geteuid() != 0 { + d.logger.Printf("[DEBUG] driver.java: must run as root user on linux, disabling") return false, nil } diff --git a/client/driver/java_test.go b/client/driver/java_test.go index ad8f5e578..b4f2f2e15 100644 --- a/client/driver/java_test.go +++ b/client/driver/java_test.go @@ -19,7 +19,7 @@ func javaLocated() bool { // The fingerprinter test should always pass, even if Java is not installed. 
func TestJavaDriver_Fingerprint(t *testing.T) { - ctestutils.ExecCompatible(t) + ctestutils.JavaCompatible(t) d := NewJavaDriver(testDriverContext("")) node := &structs.Node{ Attributes: make(map[string]string), @@ -93,7 +93,7 @@ func TestJavaDriver_Start_Wait(t *testing.T) { t.Skip("Java not found; skipping") } - ctestutils.ExecCompatible(t) + ctestutils.JavaCompatible(t) task := &structs.Task{ Name: "demo-app", Config: map[string]string{ @@ -141,7 +141,7 @@ func TestJavaDriver_Start_Kill_Wait(t *testing.T) { t.Skip("Java not found; skipping") } - ctestutils.ExecCompatible(t) + ctestutils.JavaCompatible(t) task := &structs.Task{ Name: "demo-app", Config: map[string]string{ @@ -179,7 +179,7 @@ func TestJavaDriver_Start_Kill_Wait(t *testing.T) { if err == nil { t.Fatal("should err") } - case <-time.After(2 * time.Second): + case <-time.After(8 * time.Second): t.Fatalf("timeout") } diff --git a/client/driver/spawn/spawn.go b/client/driver/spawn/spawn.go new file mode 100644 index 000000000..ef160611e --- /dev/null +++ b/client/driver/spawn/spawn.go @@ -0,0 +1,285 @@ +package spawn + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "os" + "os/exec" + "strconv" + "time" + + "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/command" + "github.com/hashicorp/nomad/helper/discover" +) + +// Spawner is used to start a user command in an isolated fashion that is +// resistent to Nomad agent failure. +type Spawner struct { + spawn *os.Process + SpawnPid int + SpawnPpid int + StateFile string + + // User configuration + UserCmd *exec.Cmd + Logs *Logs + Chroot string +} + +// Logs is used to define the filepaths the user command's logs should be +// redirected to. The files do not need to exist. +type Logs struct { + Stdin, Stdout, Stderr string +} + +// NewSpawner takes a path to a state file. This state file can be used to +// create a new Spawner that can be used to wait on the exit status of a +// process even through Nomad restarts. +func NewSpawner(stateFile string) *Spawner { + return &Spawner{StateFile: stateFile} +} + +// SetCommand sets the user command to spawn. +func (s *Spawner) SetCommand(cmd *exec.Cmd) { + s.UserCmd = cmd +} + +// SetLogs sets the redirection of user command log files. +func (s *Spawner) SetLogs(l *Logs) { + s.Logs = l +} + +// SetChroot puts the user command into a chroot. +func (s *Spawner) SetChroot(root string) { + s.Chroot = root +} + +// Spawn does a double-fork to start and isolate the user command. It takes a +// call-back that is invoked with the pid of the intermediary process. If the +// call back returns an error, the user command is not started and the spawn is +// cancelled. This can be used to put the process into a cgroup or jail and +// cancel starting the user process if that was not successful. An error is +// returned if the call-back returns an error or the user-command couldn't be +// started. 
+func (s *Spawner) Spawn(cb func(pid int) error) error { + bin, err := discover.NomadExecutable() + if err != nil { + return fmt.Errorf("Failed to determine the nomad executable: %v", err) + } + + exitFile, err := os.OpenFile(s.StateFile, os.O_CREATE|os.O_WRONLY, 0666) + defer exitFile.Close() + if err != nil { + return fmt.Errorf("Error opening file to store exit status: %v", err) + } + + config, err := s.spawnConfig() + if err != nil { + return err + } + + spawn := exec.Command(bin, "spawn-daemon", config) + + // Capture stdout + spawnStdout, err := spawn.StdoutPipe() + defer spawnStdout.Close() + if err != nil { + return fmt.Errorf("Failed to capture spawn-daemon stdout: %v", err) + } + + // Capture stdin. + spawnStdin, err := spawn.StdinPipe() + defer spawnStdin.Close() + if err != nil { + return fmt.Errorf("Failed to capture spawn-daemon stdin: %v", err) + } + + if err := spawn.Start(); err != nil { + return fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err) + } + + if cb != nil { + cbErr := cb(spawn.Process.Pid) + if cbErr != nil { + errs := new(multierror.Error) + errs = multierror.Append(errs, cbErr) + if err := s.sendAbortCommand(spawnStdin); err != nil { + errs = multierror.Append(errs, err) + } + + return errs + } + } + + if err := s.sendStartCommand(spawnStdin); err != nil { + return err + } + + respCh := make(chan command.SpawnStartStatus, 1) + errCh := make(chan error, 1) + + go func() { + var resp command.SpawnStartStatus + dec := json.NewDecoder(spawnStdout) + if err := dec.Decode(&resp); err != nil { + errCh <- fmt.Errorf("Failed to parse spawn-daemon start response: %v", err) + } + respCh <- resp + }() + + select { + case err := <-errCh: + return err + case resp := <-respCh: + if resp.ErrorMsg != "" { + return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg) + } + case <-time.After(5 * time.Second): + return fmt.Errorf("timed out waiting for response") + } + + // Store the spawn process. + s.spawn = spawn.Process + s.SpawnPid = s.spawn.Pid + s.SpawnPpid = os.Getpid() + return nil +} + +// spawnConfig returns a serialized config to pass to the Nomad spawn-daemon +// command. +func (s *Spawner) spawnConfig() (string, error) { + if s.UserCmd == nil { + return "", fmt.Errorf("Must specify user command") + } + + config := command.DaemonConfig{ + Cmd: *s.UserCmd, + Chroot: s.Chroot, + ExitStatusFile: s.StateFile, + } + + if s.Logs != nil { + config.StdoutFile = s.Logs.Stdout + config.StdinFile = s.Logs.Stdin + config.StderrFile = s.Logs.Stderr + } + + var buffer bytes.Buffer + enc := json.NewEncoder(&buffer) + if err := enc.Encode(config); err != nil { + return "", fmt.Errorf("Failed to serialize configuration: %v", err) + } + + return strconv.Quote(buffer.String()), nil +} + +// sendStartCommand sends the necessary command to the spawn-daemon to have it +// start the user process. +func (s *Spawner) sendStartCommand(w io.Writer) error { + enc := json.NewEncoder(w) + if err := enc.Encode(true); err != nil { + return fmt.Errorf("Failed to serialize start command: %v", err) + } + + return nil +} + +// sendAbortCommand sends the necessary command to the spawn-daemon to have it +// abort starting the user process. This should be invoked if the spawn-daemon +// could not be isolated into a cgroup. 
+func (s *Spawner) sendAbortCommand(w io.Writer) error { + enc := json.NewEncoder(w) + if err := enc.Encode(false); err != nil { + return fmt.Errorf("Failed to serialize abort command: %v", err) + } + + return nil +} + +// Wait returns the exit code of the user process or an error if the wait +// failed. +func (s *Spawner) Wait() (int, error) { + if os.Getpid() == s.SpawnPpid { + return s.waitAsParent() + } + + return s.pollWait() +} + +// waitAsParent waits on the process if the current process was the spawner. +func (s *Spawner) waitAsParent() (int, error) { + if s.SpawnPpid != os.Getpid() { + return -1, fmt.Errorf("not the parent. Spawner parent is %v; current pid is %v", s.SpawnPpid, os.Getpid()) + } + + // Try to reattach to the spawn. + if s.spawn == nil { + // If it can't be reattached, it means the spawn process has exited so + // we should just read its exit file. + var err error + if s.spawn, err = os.FindProcess(s.SpawnPid); err != nil { + return s.pollWait() + } + } + + if _, err := s.spawn.Wait(); err != nil { + return -1, err + } + + return s.pollWait() +} + +// pollWait polls on the spawn daemon to determine when it exits. After it +// exits, it reads the state file and returns the exit code and possibly an +// error. +func (s *Spawner) pollWait() (int, error) { + // Stat to check if it is there to avoid a race condition. + stat, err := os.Stat(s.StateFile) + if err != nil { + return -1, fmt.Errorf("Failed to Stat exit status file %v: %v", s.StateFile, err) + } + + // If there is data it means that the file has already been written. + if stat.Size() > 0 { + return s.readExitCode() + } + + // Read after the process exits. + for _ = range time.Tick(5 * time.Second) { + if !s.Alive() { + break + } + } + + return s.readExitCode() +} + +// readExitCode parses the state file and returns the exit code of the task. It +// returns an error if the file can't be read. 
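Pulling the Spawner API together, a hedged end-to-end sketch of how a driver could use it; the temp directory, file names, and echo command are placeholders, and the callback simply logs where LinuxExecutor.Start would join the spawn-daemon to the task's cgroup:

package main

import (
	"log"
	"os"
	"os/exec"
	"path/filepath"

	"github.com/hashicorp/nomad/client/driver/spawn"
)

func main() {
	taskDir := os.TempDir() // placeholder; normally the task's chroot dir

	// The state file is where the spawn-daemon records the exit status, so a
	// restarted agent can still learn how the task ended.
	s := spawn.NewSpawner(filepath.Join(taskDir, "example_exit_status"))
	s.SetCommand(exec.Command("echo", "hello"))
	s.SetLogs(&spawn.Logs{
		Stdout: filepath.Join(taskDir, "example.stdout"),
		Stderr: filepath.Join(taskDir, "example.stderr"),
		Stdin:  os.DevNull,
	})

	// The callback runs with the spawn-daemon's pid before the user command
	// starts; returning an error aborts the launch. LinuxExecutor uses this
	// hook to place the daemon into the task's cgroup.
	err := s.Spawn(func(pid int) error {
		log.Printf("spawn-daemon running as pid %d", pid)
		return nil
	})
	if err != nil {
		log.Fatalf("Spawn failed: %v", err)
	}

	code, err := s.Wait()
	log.Printf("task exited with code %d (err: %v)", code, err)
}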
+func (s *Spawner) readExitCode() (int, error) { + f, err := os.Open(s.StateFile) + defer f.Close() + if err != nil { + return -1, fmt.Errorf("Failed to open %v to read exit code: %v", s.StateFile, err) + } + + stat, err := f.Stat() + if err != nil { + return -1, fmt.Errorf("Failed to stat file %v: %v", s.StateFile, err) + } + + if stat.Size() == 0 { + return -1, fmt.Errorf("Empty state file: %v", s.StateFile) + } + + var exitStatus command.SpawnExitStatus + dec := json.NewDecoder(f) + if err := dec.Decode(&exitStatus); err != nil { + return -1, fmt.Errorf("Failed to parse exit status from %v: %v", s.StateFile, err) + } + + return exitStatus.ExitCode, nil +} diff --git a/client/driver/spawn/spawn_posix.go b/client/driver/spawn/spawn_posix.go new file mode 100644 index 000000000..7df381064 --- /dev/null +++ b/client/driver/spawn/spawn_posix.go @@ -0,0 +1,14 @@ +// +build !windows + +package spawn + +import "syscall" + +func (s *Spawner) Alive() bool { + if s.spawn == nil { + return false + } + + err := s.spawn.Signal(syscall.Signal(0)) + return err == nil +} diff --git a/client/driver/spawn/spawn_test.go b/client/driver/spawn/spawn_test.go new file mode 100644 index 000000000..bbb8c8dca --- /dev/null +++ b/client/driver/spawn/spawn_test.go @@ -0,0 +1,300 @@ +package spawn + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "runtime" + "strings" + "testing" + "time" +) + +func TestSpawn_NoCmd(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + if err := spawn.Spawn(nil); err == nil { + t.Fatalf("Spawn() with no user command should fail") + } +} + +func TestSpawn_InvalidCmd(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("foo")) + if err := spawn.Spawn(nil); err == nil { + t.Fatalf("Spawn() with no invalid command should fail") + } +} + +func TestSpawn_SetsLogs(t *testing.T) { + // TODO: Figure out why this test fails. If the spawn-daemon directly writes + // to the opened stdout file it works but not the user command. Maybe a + // flush issue? + if runtime.GOOS == "windows" { + t.Skip("Test fails on windows; unknown reason. Skipping") + } + + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + exp := "foo" + spawn.SetCommand(exec.Command("echo", exp)) + + // Create file for stdout. 
+ stdout, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(stdout.Name()) + spawn.SetLogs(&Logs{Stdout: stdout.Name()}) + + if err := spawn.Spawn(nil); err != nil { + t.Fatalf("Spawn() failed: %v", err) + } + + if code, err := spawn.Wait(); code != 0 && err != nil { + t.Fatalf("Wait() returned %v, %v; want 0, nil", code, err) + } + + stdout2, err := os.Open(stdout.Name()) + if err != nil { + t.Fatalf("Open() failed: %v", err) + } + + data, err := ioutil.ReadAll(stdout2) + if err != nil { + t.Fatalf("ReadAll() failed: %v", err) + } + + act := strings.TrimSpace(string(data)) + if act != exp { + t.Fatalf("Unexpected data written to stdout; got %v; want %v", act, exp) + } +} + +func TestSpawn_Callback(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("sleep", "1")) + + called := false + cbErr := fmt.Errorf("ERROR CB") + cb := func(_ int) error { + called = true + return cbErr + } + + if err := spawn.Spawn(cb); err == nil { + t.Fatalf("Spawn(%#v) should have errored; want %v", cb, cbErr) + } + + if !called { + t.Fatalf("Spawn(%#v) didn't call callback", cb) + } +} + +func TestSpawn_ParentWaitExited(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("echo", "foo")) + if err := spawn.Spawn(nil); err != nil { + t.Fatalf("Spawn() failed %v", err) + } + + time.Sleep(1 * time.Second) + + code, err := spawn.Wait() + if err != nil { + t.Fatalf("Wait() failed %v", err) + } + + if code != 0 { + t.Fatalf("Wait() returned %v; want 0", code) + } +} + +func TestSpawn_ParentWait(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("sleep", "2")) + if err := spawn.Spawn(nil); err != nil { + t.Fatalf("Spawn() failed %v", err) + } + + code, err := spawn.Wait() + if err != nil { + t.Fatalf("Wait() failed %v", err) + } + + if code != 0 { + t.Fatalf("Wait() returned %v; want 0", code) + } +} + +func TestSpawn_NonParentWaitExited(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("echo", "foo")) + if err := spawn.Spawn(nil); err != nil { + t.Fatalf("Spawn() failed %v", err) + } + + time.Sleep(1 * time.Second) + + // Force the wait to assume non-parent. + spawn.SpawnPpid = 0 + code, err := spawn.Wait() + if err != nil { + t.Fatalf("Wait() failed %v", err) + } + + if code != 0 { + t.Fatalf("Wait() returned %v; want 0", code) + } +} + +func TestSpawn_NonParentWait(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("sleep", "2")) + if err := spawn.Spawn(nil); err != nil { + t.Fatalf("Spawn() failed %v", err) + } + + // Need to wait on the spawner, otherwise it becomes a zombie and the test + // only finishes after the init process cleans it. This speeds that up. + go func() { + time.Sleep(3 * time.Second) + if _, err := spawn.spawn.Wait(); err != nil { + t.FailNow() + } + }() + + // Force the wait to assume non-parent. 
+ spawn.SpawnPpid = 0 + code, err := spawn.Wait() + if err != nil { + t.Fatalf("Wait() failed %v", err) + } + + if code != 0 { + t.Fatalf("Wait() returned %v; want 0", code) + } +} + +func TestSpawn_DeadSpawnDaemon_Parent(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + var spawnPid int + cb := func(pid int) error { + spawnPid = pid + return nil + } + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("sleep", "5")) + if err := spawn.Spawn(cb); err != nil { + t.Fatalf("Spawn() errored: %v", err) + } + + proc, err := os.FindProcess(spawnPid) + if err != nil { + t.FailNow() + } + + if err := proc.Kill(); err != nil { + t.FailNow() + } + + if _, err := proc.Wait(); err != nil { + t.FailNow() + } + + if _, err := spawn.Wait(); err == nil { + t.Fatalf("Wait() should have failed: %v", err) + } +} + +func TestSpawn_DeadSpawnDaemon_NonParent(t *testing.T) { + f, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("TempFile() failed") + } + defer os.Remove(f.Name()) + + var spawnPid int + cb := func(pid int) error { + spawnPid = pid + return nil + } + + spawn := NewSpawner(f.Name()) + spawn.SetCommand(exec.Command("sleep", "2")) + if err := spawn.Spawn(cb); err != nil { + t.Fatalf("Spawn() errored: %v", err) + } + + proc, err := os.FindProcess(spawnPid) + if err != nil { + t.FailNow() + } + + if err := proc.Kill(); err != nil { + t.FailNow() + } + + if _, err := proc.Wait(); err != nil { + t.FailNow() + } + + // Force the wait to assume non-parent. + spawn.SpawnPpid = 0 + if _, err := spawn.Wait(); err == nil { + t.Fatalf("Wait() should have failed: %v", err) + } +} diff --git a/client/driver/spawn/spawn_windows.go b/client/driver/spawn/spawn_windows.go new file mode 100644 index 000000000..9683dce97 --- /dev/null +++ b/client/driver/spawn/spawn_windows.go @@ -0,0 +1,21 @@ +package spawn + +import "syscall" + +const STILL_ACTIVE = 259 + +func (s *Spawner) Alive() bool { + const da = syscall.STANDARD_RIGHTS_READ | syscall.PROCESS_QUERY_INFORMATION | syscall.SYNCHRONIZE + h, e := syscall.OpenProcess(da, false, uint32(s.SpawnPid)) + if e != nil { + return false + } + + var ec uint32 + e = syscall.GetExitCodeProcess(h, &ec) + if e != nil { + return false + } + + return ec == STILL_ACTIVE +} diff --git a/client/executor/exec_linux.go b/client/executor/exec_linux.go deleted file mode 100644 index ceb178063..000000000 --- a/client/executor/exec_linux.go +++ /dev/null @@ -1,579 +0,0 @@ -package executor - -import ( - "bytes" - "encoding/json" - "errors" - "fmt" - "io" - "os" - "os/exec" - "os/user" - "path/filepath" - "strconv" - "strings" - "syscall" - - "github.com/hashicorp/go-multierror" - "github.com/hashicorp/nomad/client/allocdir" - "github.com/hashicorp/nomad/client/driver/args" - "github.com/hashicorp/nomad/client/driver/environment" - "github.com/hashicorp/nomad/command" - "github.com/hashicorp/nomad/helper/discover" - "github.com/hashicorp/nomad/nomad/structs" - - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - cgroupConfig "github.com/opencontainers/runc/libcontainer/configs" -) - -const ( - cgroupMount = "/sys/fs/cgroup" -) - -var ( - // A mapping of directories on the host OS to attempt to embed inside each - // task's chroot. 
- chrootEnv = map[string]string{ - "/bin": "/bin", - "/etc": "/etc", - "/lib": "/lib", - "/lib32": "/lib32", - "/lib64": "/lib64", - "/usr/bin": "/usr/bin", - "/usr/lib": "/usr/lib", - } -) - -func NewExecutor() Executor { - e := LinuxExecutor{} - - // TODO: In a follow-up PR make it so this only happens once per client. - // Fingerprinting shouldn't happen per task. - - // Check that cgroups are available. - if _, err := os.Stat(cgroupMount); err == nil { - e.cgroupEnabled = true - } - - return &e -} - -// Linux executor is designed to run on linux kernel 2.8+. -type LinuxExecutor struct { - cmd - user *user.User - - // Finger print capabilities. - cgroupEnabled bool - - // Isolation configurations. - groups *cgroupConfig.Cgroup - alloc *allocdir.AllocDir - taskName string - taskDir string - - // Tracking of child process. - spawnChild exec.Cmd - spawnOutputWriter *os.File - spawnOutputReader *os.File - - // Track whether there are filesystems mounted in the task dir. - mounts bool -} - -func (e *LinuxExecutor) Limit(resources *structs.Resources) error { - if resources == nil { - return errNoResources - } - - if e.cgroupEnabled { - return e.configureCgroups(resources) - } - - return nil -} - -func (e *LinuxExecutor) ConfigureTaskDir(taskName string, alloc *allocdir.AllocDir) error { - e.taskName = taskName - taskDir, ok := alloc.TaskDirs[taskName] - if !ok { - fmt.Errorf("Couldn't find task directory for task %v", taskName) - } - e.taskDir = taskDir - - if err := alloc.MountSharedDir(taskName); err != nil { - return err - } - - if err := alloc.Embed(taskName, chrootEnv); err != nil { - return err - } - - // Mount dev - dev := filepath.Join(taskDir, "dev") - if err := os.Mkdir(dev, 0777); err != nil { - return fmt.Errorf("Mkdir(%v) failed: %v", dev, err) - } - - if err := syscall.Mount("", dev, "devtmpfs", syscall.MS_RDONLY, ""); err != nil { - return fmt.Errorf("Couldn't mount /dev to %v: %v", dev, err) - } - - // Mount proc - proc := filepath.Join(taskDir, "proc") - if err := os.Mkdir(proc, 0777); err != nil { - return fmt.Errorf("Mkdir(%v) failed: %v", proc, err) - } - - if err := syscall.Mount("", proc, "proc", syscall.MS_RDONLY, ""); err != nil { - return fmt.Errorf("Couldn't mount /proc to %v: %v", proc, err) - } - - // Set the tasks AllocDir environment variable. - env, err := environment.ParseFromList(e.Cmd.Env) - if err != nil { - return err - } - env.SetAllocDir(filepath.Join("/", allocdir.SharedAllocName)) - env.SetTaskLocalDir(filepath.Join("/", allocdir.TaskLocal)) - e.Cmd.Env = env.List() - - e.alloc = alloc - e.mounts = true - return nil -} - -func (e *LinuxExecutor) cleanTaskDir() error { - if e.alloc == nil { - return errors.New("ConfigureTaskDir() must be called before Start()") - } - - if !e.mounts { - return nil - } - - // Unmount dev. - errs := new(multierror.Error) - dev := filepath.Join(e.taskDir, "dev") - if err := syscall.Unmount(dev, 0); err != nil { - errs = multierror.Append(errs, fmt.Errorf("Failed to unmount dev (%v): %v", dev, err)) - } - - // Unmount proc. 
- proc := filepath.Join(e.taskDir, "proc") - if err := syscall.Unmount(proc, 0); err != nil { - errs = multierror.Append(errs, fmt.Errorf("Failed to unmount proc (%v): %v", proc, err)) - } - - e.mounts = false - return errs.ErrorOrNil() -} - -func (e *LinuxExecutor) configureCgroups(resources *structs.Resources) error { - if !e.cgroupEnabled { - return nil - } - - e.groups = &cgroupConfig.Cgroup{} - - // Groups will be created in a heiarchy according to the resource being - // constrained, current session, and then this unique name. Restraints are - // then placed in the corresponding files. - // Ex: restricting a process to 2048Mhz CPU and 2MB of memory: - // $ cat /sys/fs/cgroup/cpu/user/1000.user/4.session//cpu.shares - // 2028 - // $ cat /sys/fs/cgroup/memory/user/1000.user/4.session//memory.limit_in_bytes - // 2097152 - e.groups.Name = structs.GenerateUUID() - - // TODO: verify this is needed for things like network access - e.groups.AllowAllDevices = true - - if resources.MemoryMB > 0 { - // Total amount of memory allowed to consume - e.groups.Memory = int64(resources.MemoryMB * 1024 * 1024) - // Disable swap to avoid issues on the machine - e.groups.MemorySwap = int64(-1) - } - - if resources.CPU != 0 { - if resources.CPU < 2 { - return fmt.Errorf("resources.CPU must be equal to or greater than 2: %v", resources.CPU) - } - - // Set the relative CPU shares for this cgroup. - // The simplest scale is 1 share to 1 MHz so 1024 = 1GHz. This means any - // given process will have at least that amount of resources, but likely - // more since it is (probably) rare that the machine will run at 100% - // CPU. This scale will cease to work if a node is overprovisioned. - e.groups.CpuShares = int64(resources.CPU) - } - - if resources.IOPS != 0 { - // Validate it is in an acceptable range. - if resources.IOPS < 10 || resources.IOPS > 1000 { - return fmt.Errorf("resources.IOPS must be between 10 and 1000: %d", resources.IOPS) - } - - e.groups.BlkioWeight = uint16(resources.IOPS) - } - - return nil -} - -func (e *LinuxExecutor) runAs(userid string) error { - errs := new(multierror.Error) - - // First, try to lookup the user by uid - u, err := user.LookupId(userid) - if err == nil { - e.user = u - return nil - } else { - errs = multierror.Append(errs, err) - } - - // Lookup failed, so try by username instead - u, err = user.Lookup(userid) - if err == nil { - e.user = u - return nil - } else { - errs = multierror.Append(errs, err) - } - - // If we got here we failed to lookup based on id and username, so we'll - // return those errors. - return fmt.Errorf("Failed to identify user to run as: %s", errs) -} - -func (e *LinuxExecutor) Start() error { - // Run as "nobody" user so we don't leak root privilege to the - // spawned process. - if err := e.runAs("nobody"); err == nil && e.user != nil { - e.cmd.SetUID(e.user.Uid) - e.cmd.SetGID(e.user.Gid) - } - - if e.alloc == nil { - return errors.New("ConfigureTaskDir() must be called before Start()") - } - - // Parse the commands arguments and replace instances of Nomad environment - // variables. 
- envVars, err := environment.ParseFromList(e.Cmd.Env) - if err != nil { - return err - } - - parsedPath, err := args.ParseAndReplace(e.cmd.Path, envVars.Map()) - if err != nil { - return err - } else if len(parsedPath) != 1 { - return fmt.Errorf("couldn't properly parse command path: %v", e.cmd.Path) - } - e.cmd.Path = parsedPath[0] - - combined := strings.Join(e.Cmd.Args, " ") - parsed, err := args.ParseAndReplace(combined, envVars.Map()) - if err != nil { - return err - } - e.Cmd.Args = parsed - - return e.spawnDaemon() -} - -// spawnDaemon executes a double fork to start the user command with proper -// isolation. Stores the child process for use in Wait. -func (e *LinuxExecutor) spawnDaemon() error { - bin, err := discover.NomadExecutable() - if err != nil { - return fmt.Errorf("Failed to determine the nomad executable: %v", err) - } - - // Serialize the cmd and the cgroup configuration so it can be passed to the - // sub-process. - var buffer bytes.Buffer - enc := json.NewEncoder(&buffer) - - c := command.DaemonConfig{ - Cmd: e.cmd.Cmd, - Chroot: e.taskDir, - StdoutFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stdout", e.taskName)), - StderrFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stderr", e.taskName)), - StdinFile: "/dev/null", - } - if err := enc.Encode(c); err != nil { - return fmt.Errorf("Failed to serialize daemon configuration: %v", err) - } - - // Create a pipe to capture Stdout. - pr, pw, err := os.Pipe() - if err != nil { - return err - } - e.spawnOutputWriter = pw - e.spawnOutputReader = pr - - // Call ourselves using a hidden flag. The new instance of nomad will join - // the passed cgroup, forkExec the cmd, and output status codes through - // Stdout. - escaped := strconv.Quote(buffer.String()) - spawn := exec.Command(bin, "spawn-daemon", escaped) - spawn.Stdout = e.spawnOutputWriter - - // Capture its Stdin. - spawnStdIn, err := spawn.StdinPipe() - if err != nil { - return err - } - - if err := spawn.Start(); err != nil { - fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err) - } - - // Join the spawn-daemon to the cgroup. - if e.groups != nil { - manager := cgroupFs.Manager{} - manager.Cgroups = e.groups - - // Apply will place the current pid into the tasks file for each of the - // created cgroups: - // /sys/fs/cgroup/memory/user/1000.user/4.session//tasks - // - // Apply requires superuser permissions, and may fail if Nomad is not run with - // the required permissions - if err := manager.Apply(spawn.Process.Pid); err != nil { - errs := new(multierror.Error) - errs = multierror.Append(errs, fmt.Errorf("Failed to join spawn-daemon to the cgroup (config => %+v): %v", manager.Cgroups, err)) - - if err := sendAbortCommand(spawnStdIn); err != nil { - errs = multierror.Append(errs, err) - } - - return errs - } - } - - // Tell it to start. - if err := sendStartCommand(spawnStdIn); err != nil { - return err - } - - // Parse the response. 
- dec := json.NewDecoder(e.spawnOutputReader) - var resp command.SpawnStartStatus - if err := dec.Decode(&resp); err != nil { - return fmt.Errorf("Failed to parse spawn-daemon start response: %v", err) - } - - if resp.ErrorMsg != "" { - return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg) - } - - e.spawnChild = *spawn - return nil -} - -func sendStartCommand(w io.Writer) error { - enc := json.NewEncoder(w) - if err := enc.Encode(true); err != nil { - return fmt.Errorf("Failed to serialize start command: %v", err) - } - - return nil -} - -func sendAbortCommand(w io.Writer) error { - enc := json.NewEncoder(w) - if err := enc.Encode(false); err != nil { - return fmt.Errorf("Failed to serialize abort command: %v", err) - } - - return nil -} - -// Open's behavior is to kill all processes associated with the id and return an -// error. This is done because it is not possible to re-attach to the -// spawn-daemon's stdout to retrieve status messages. -func (e *LinuxExecutor) Open(id string) error { - parts := strings.SplitN(id, ":", 2) - if len(parts) != 2 { - return fmt.Errorf("Invalid id: %v", id) - } - - switch parts[0] { - case "PID": - pid, err := strconv.Atoi(parts[1]) - if err != nil { - return fmt.Errorf("Invalid id: failed to parse pid %v", parts[1]) - } - - process, err := os.FindProcess(pid) - if err != nil { - return fmt.Errorf("Failed to find Pid %v: %v", pid, err) - } - - if err := process.Kill(); err != nil { - return fmt.Errorf("Failed to kill Pid %v: %v", pid, err) - } - case "CGROUP": - if !e.cgroupEnabled { - return errors.New("Passed a a cgroup identifier, but cgroups are disabled") - } - - // De-serialize the cgroup configuration. - dec := json.NewDecoder(strings.NewReader(parts[1])) - var groups cgroupConfig.Cgroup - if err := dec.Decode(&groups); err != nil { - return fmt.Errorf("Failed to parse cgroup configuration: %v", err) - } - - e.groups = &groups - if err := e.destroyCgroup(); err != nil { - return err - } - // TODO: cleanTaskDir is a little more complicated here because the OS - // may have already unmounted in the case of a restart. Need to scan. - default: - return fmt.Errorf("Invalid id type: %v", parts[0]) - } - - return errors.New("Could not re-open to id (intended).") -} - -func (e *LinuxExecutor) Wait() error { - if e.spawnChild.Process == nil { - return errors.New("Can not find child to wait on") - } - - defer e.spawnOutputWriter.Close() - defer e.spawnOutputReader.Close() - - errs := new(multierror.Error) - if err := e.spawnChild.Wait(); err != nil { - errs = multierror.Append(errs, fmt.Errorf("Wait failed on pid %v: %v", e.spawnChild.Process.Pid, err)) - } - - // If they fork/exec and then exit, wait will return but they will be still - // running processes so we need to kill the full cgroup. - if e.groups != nil { - if err := e.destroyCgroup(); err != nil { - errs = multierror.Append(errs, err) - } - } - - if err := e.cleanTaskDir(); err != nil { - errs = multierror.Append(errs, err) - } - - return errs.ErrorOrNil() -} - -// If cgroups are used, the ID is the cgroup structurue. Otherwise, it is the -// PID of the spawn-daemon process. An error is returned if the process was -// never started. -func (e *LinuxExecutor) ID() (string, error) { - if e.spawnChild.Process != nil { - if e.cgroupEnabled && e.groups != nil { - // Serialize the cgroup structure so it can be undone on suabsequent - // opens. 
- var buffer bytes.Buffer - enc := json.NewEncoder(&buffer) - if err := enc.Encode(e.groups); err != nil { - return "", fmt.Errorf("Failed to serialize daemon configuration: %v", err) - } - - return fmt.Sprintf("CGROUP:%v", buffer.String()), nil - } - - return fmt.Sprintf("PID:%d", e.spawnChild.Process.Pid), nil - } - - return "", fmt.Errorf("Process has finished or was never started") -} - -func (e *LinuxExecutor) Shutdown() error { - return e.ForceStop() -} - -func (e *LinuxExecutor) ForceStop() error { - if e.spawnOutputReader != nil { - e.spawnOutputReader.Close() - } - - if e.spawnOutputWriter != nil { - e.spawnOutputWriter.Close() - } - - // If the task is not running inside a cgroup then just the spawn-daemon child is killed. - // TODO: Find a good way to kill the children of the spawn-daemon. - if e.groups == nil { - if err := e.spawnChild.Process.Kill(); err != nil { - return fmt.Errorf("Failed to kill child (%v): %v", e.spawnChild.Process.Pid, err) - } - - return nil - } - - errs := new(multierror.Error) - if e.groups != nil { - if err := e.destroyCgroup(); err != nil { - errs = multierror.Append(errs, err) - } - } - - if err := e.cleanTaskDir(); err != nil { - errs = multierror.Append(errs, err) - } - - return errs.ErrorOrNil() -} - -func (e *LinuxExecutor) destroyCgroup() error { - if e.groups == nil { - return errors.New("Can't destroy: cgroup configuration empty") - } - - manager := cgroupFs.Manager{} - manager.Cgroups = e.groups - pids, err := manager.GetPids() - if err != nil { - return fmt.Errorf("Failed to get pids in the cgroup %v: %v", e.groups.Name, err) - } - - errs := new(multierror.Error) - for _, pid := range pids { - process, err := os.FindProcess(pid) - if err != nil { - multierror.Append(errs, fmt.Errorf("Failed to find Pid %v: %v", pid, err)) - continue - } - - if err := process.Kill(); err != nil { - multierror.Append(errs, fmt.Errorf("Failed to kill Pid %v: %v", pid, err)) - continue - } - - if _, err := process.Wait(); err != nil { - multierror.Append(errs, fmt.Errorf("Failed to wait Pid %v: %v", pid, err)) - continue - } - } - - // Remove the cgroup. - if err := manager.Destroy(); err != nil { - multierror.Append(errs, fmt.Errorf("Failed to delete the cgroup directories: %v", err)) - } - - if len(errs.Errors) != 0 { - return fmt.Errorf("Failed to destroy cgroup: %v", errs) - } - - return nil -} - -func (e *LinuxExecutor) Command() *cmd { - return &e.cmd -} diff --git a/client/fingerprint/env_aws.go b/client/fingerprint/env_aws.go index 839285a1d..575409bf8 100644 --- a/client/fingerprint/env_aws.go +++ b/client/fingerprint/env_aws.go @@ -15,6 +15,10 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) +// This is where the AWS metadata server normally resides. We hardcode the +// "instance" path as well since it's the only one we access here. 
+const DEFAULT_AWS_URL = "http//169.254.169.254/latest/meta-data/" + // map of instance type to approximate speed, in Mbits/s // http://serverfault.com/questions/324883/aws-bandwidth-and-content-delivery/326797#326797 // which itself cites these sources: @@ -89,7 +93,7 @@ func (f *EnvAWSFingerprint) Fingerprint(cfg *config.Config, node *structs.Node) } metadataURL := os.Getenv("AWS_ENV_URL") if metadataURL == "" { - metadataURL = "http://169.254.169.254/latest/meta-data/" + metadataURL = DEFAULT_AWS_URL } // assume 2 seconds is enough time for inside AWS network @@ -161,7 +165,7 @@ func isAWS() bool { // provide their own metadataURL := os.Getenv("AWS_ENV_URL") if metadataURL == "" { - metadataURL = "http://169.254.169.254/latest/meta-data/" + metadataURL = DEFAULT_AWS_URL } // assume 2 seconds is enough time for inside AWS network @@ -205,7 +209,7 @@ func (f *EnvAWSFingerprint) linkSpeed() int { // the network speed metadataURL := os.Getenv("AWS_ENV_URL") if metadataURL == "" { - metadataURL = "http://169.254.169.254/latest/meta-data/" + metadataURL = DEFAULT_AWS_URL } // assume 2 seconds is enough time for inside AWS network diff --git a/client/testutil/driver_compatible.go b/client/testutil/driver_compatible.go index d73d62f33..996fca131 100644 --- a/client/testutil/driver_compatible.go +++ b/client/testutil/driver_compatible.go @@ -8,8 +8,14 @@ import ( ) func ExecCompatible(t *testing.T) { - if runtime.GOOS != "windows" && syscall.Geteuid() != 0 { - t.Skip("Must be root on non-windows environments to run test") + if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { + t.Skip("Test only available running as root on linux") + } +} + +func JavaCompatible(t *testing.T) { + if runtime.GOOS == "linux" && syscall.Geteuid() != 0 { + t.Skip("Test only available when running as root on linux") } } diff --git a/command/init.go b/command/init.go index 0b9be934b..356337ae8 100644 --- a/command/init.go +++ b/command/init.go @@ -104,6 +104,17 @@ job "example" { # Defaults to 1 # count = 1 + # Restart Policy - This block defines the restart policy for TaskGroups, + # the attempts value defines the number of restarts Nomad will do if Tasks + # in this TaskGroup fails in a rolling window of interval duration + # The delay value makes Nomad wait for that duration to restart after a Task + # fails or crashes. + restart { + interval = "5m" + attempts = 10 + delay = "25s" + } + # Define a task to run task "redis" { # Use Docker to run the task. diff --git a/command/spawn_daemon.go b/command/spawn_daemon.go index 3ca825d41..52ffd8e6c 100644 --- a/command/spawn_daemon.go +++ b/command/spawn_daemon.go @@ -2,19 +2,19 @@ package command import ( "encoding/json" + "fmt" + "io" "os" + "os/exec" + "strconv" "strings" + "syscall" ) type SpawnDaemonCommand struct { Meta -} - -// Status of executing the user's command. -type SpawnStartStatus struct { - // ErrorMsg will be empty if the user command was started successfully. - // Otherwise it will have an error message. - ErrorMsg string + config *DaemonConfig + exitFile io.WriteCloser } func (c *SpawnDaemonCommand) Help() string { @@ -23,15 +23,15 @@ Usage: nomad spawn-daemon [options] INTERNAL ONLY - Spawns a daemon process optionally inside a cgroup. The required daemon_config is a json - encoding of the DaemonConfig struct containing the isolation configuration and command to run. - SpawnStartStatus is json serialized to Stdout upon running the user command or if any error - prevents its execution. 
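Because the fingerprinter only falls back to DEFAULT_AWS_URL when AWS_ENV_URL is unset, tests (or non-EC2 environments) can point it at a local server instead. A sketch of that override using httptest; the handler payload is illustrative:

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
)

func main() {
	// Serve a fake metadata tree the way a test would.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, "m4.large") // e.g. a canned instance-type response
	}))
	defer srv.Close()

	// The AWS fingerprinter reads this before falling back to the default URL.
	os.Setenv("AWS_ENV_URL", srv.URL+"/latest/meta-data/")
}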
If there is no error, the process waits on the users - command and then json serializes SpawnExitStatus to Stdout after its termination. - -General Options: - - ` + generalOptionsUsage() + Spawns a daemon process by double forking. The required daemon_config is a + json encoding of the DaemonConfig struct containing the isolation + configuration and command to run. SpawnStartStatus is json serialized to + stdout upon running the user command or if any error prevents its execution. + If there is no error, the process waits on the users command. Once the user + command exits, the exit code is written to a file specified in the + daemon_config and this process exits with the same exit status as the user + command. + ` return strings.TrimSpace(helpText) } @@ -40,6 +40,154 @@ func (c *SpawnDaemonCommand) Synopsis() string { return "Spawn a daemon command with configurable isolation." } +// Status of executing the user's command. +type SpawnStartStatus struct { + // The PID of the user's command. + UserPID int + + // ErrorMsg will be empty if the user command was started successfully. + // Otherwise it will have an error message. + ErrorMsg string +} + +// Exit status of the user's command. +type SpawnExitStatus struct { + // The exit code of the user's command. + ExitCode int +} + +// Configuration for the command to start as a daemon. +type DaemonConfig struct { + exec.Cmd + + // The filepath to write the exit status to. + ExitStatusFile string + + // The paths, if not /dev/null, must be either in the tasks root directory + // or in the shared alloc directory. + StdoutFile string + StdinFile string + StderrFile string + + // An optional path specifying the directory to chroot the process in. + Chroot string +} + +// Whether to start the user command or abort. +type TaskStart bool + +// parseConfig reads the DaemonConfig from the passed arguments. If not +// successful, an error is returned. +func (c *SpawnDaemonCommand) parseConfig(args []string) (*DaemonConfig, error) { + flags := c.Meta.FlagSet("spawn-daemon", FlagSetClient) + flags.Usage = func() { c.Ui.Output(c.Help()) } + if err := flags.Parse(args); err != nil { + return nil, fmt.Errorf("failed to parse args: %v", err) + } + + // Check that we got json input. + args = flags.Args() + if len(args) != 1 { + return nil, fmt.Errorf("incorrect number of args; got %v; want 1", len(args)) + } + jsonInput, err := strconv.Unquote(args[0]) + if err != nil { + return nil, fmt.Errorf("Failed to unquote json input: %v", err) + } + + // De-serialize the passed command. + var config DaemonConfig + dec := json.NewDecoder(strings.NewReader(jsonInput)) + if err := dec.Decode(&config); err != nil { + return nil, err + } + + return &config, nil +} + +// configureLogs creates the log files and redirects the process +// stdin/stderr/stdout to them. If unsuccessful, an error is returned. 
+func (c *SpawnDaemonCommand) configureLogs() error { + if len(c.config.StdoutFile) != 0 { + stdo, err := os.OpenFile(c.config.StdoutFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666) + if err != nil { + return fmt.Errorf("Error opening file to redirect stdout: %v", err) + } + + c.config.Cmd.Stdout = stdo + } + + if len(c.config.StderrFile) != 0 { + stde, err := os.OpenFile(c.config.StderrFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666) + if err != nil { + return fmt.Errorf("Error opening file to redirect stderr: %v", err) + } + c.config.Cmd.Stderr = stde + } + + if len(c.config.StdinFile) != 0 { + stdi, err := os.OpenFile(c.config.StdinFile, os.O_CREATE|os.O_RDONLY, 0666) + if err != nil { + return fmt.Errorf("Error opening file to redirect stdin: %v", err) + } + c.config.Cmd.Stdin = stdi + } + + return nil +} + +func (c *SpawnDaemonCommand) Run(args []string) int { + var err error + c.config, err = c.parseConfig(args) + if err != nil { + return c.outputStartStatus(err, 1) + } + + // Open the file we will be using to write exit codes to. We do this early + // to ensure that we don't start the user process when we can't capture its + // exit status. + c.exitFile, err = os.OpenFile(c.config.ExitStatusFile, os.O_WRONLY, 0666) + if err != nil { + return c.outputStartStatus(fmt.Errorf("Error opening file to store exit status: %v", err), 1) + } + + // Isolate the user process. + if err := c.isolateCmd(); err != nil { + return c.outputStartStatus(err, 1) + } + + // Redirect logs. + if err := c.configureLogs(); err != nil { + return c.outputStartStatus(err, 1) + } + + // Chroot jail the process and set its working directory. + c.configureChroot() + + // Wait to get the start command. + var start TaskStart + dec := json.NewDecoder(os.Stdin) + if err := dec.Decode(&start); err != nil { + return c.outputStartStatus(err, 1) + } + + // Aborted by Nomad process. + if !start { + return 0 + } + + // Spawn the user process. + if err := c.config.Cmd.Start(); err != nil { + return c.outputStartStatus(fmt.Errorf("Error starting user command: %v", err), 1) + } + + // Indicate that the command was started successfully. + c.outputStartStatus(nil, 0) + + // Wait and then output the exit status. + return c.writeExitStatus(c.config.Cmd.Wait()) +} + // outputStartStatus is a helper function that outputs a SpawnStartStatus to // Stdout with the passed error, which may be nil to indicate no error. It // returns the passed status. @@ -51,6 +199,36 @@ func (c *SpawnDaemonCommand) outputStartStatus(err error, status int) int { startStatus.ErrorMsg = err.Error() } + if c.config != nil && c.config.Cmd.Process != nil { + startStatus.UserPID = c.config.Process.Pid + } + enc.Encode(startStatus) return status } + +// writeExitStatus takes in the error result from calling wait and writes out +// the exit status to a file. It returns the same exit status as the user +// command. +func (c *SpawnDaemonCommand) writeExitStatus(exit error) int { + // Parse the exit code. + exitStatus := &SpawnExitStatus{} + if exit != nil { + // Default to exit code 1 if we can not get the actual exit code. 
+ exitStatus.ExitCode = 1 + + if exiterr, ok := exit.(*exec.ExitError); ok { + if status, ok := exiterr.Sys().(syscall.WaitStatus); ok { + exitStatus.ExitCode = status.ExitStatus() + } + } + } + + if c.exitFile != nil { + enc := json.NewEncoder(c.exitFile) + enc.Encode(exitStatus) + c.exitFile.Close() + } + + return exitStatus.ExitCode +} diff --git a/command/spawn_daemon_darwin.go b/command/spawn_daemon_darwin.go new file mode 100644 index 000000000..f3fe8484a --- /dev/null +++ b/command/spawn_daemon_darwin.go @@ -0,0 +1,4 @@ +package command + +// No chroot on darwin. +func (c *SpawnDaemonCommand) configureChroot() {} diff --git a/command/spawn_daemon_linux.go b/command/spawn_daemon_linux.go index 3e9ceaa3e..512ec645f 100644 --- a/command/spawn_daemon_linux.go +++ b/command/spawn_daemon_linux.go @@ -1,115 +1,16 @@ package command -import ( - "encoding/json" - "fmt" - "os" - "os/exec" - "strconv" - "strings" - "syscall" -) +import "syscall" -// Configuration for the command to start as a daemon. -type DaemonConfig struct { - exec.Cmd +// configureChroot enters the user command into a chroot if specified in the +// config and on an OS that supports Chroots. +func (c *SpawnDaemonCommand) configureChroot() { + if len(c.config.Chroot) != 0 { + if c.config.Cmd.SysProcAttr == nil { + c.config.Cmd.SysProcAttr = &syscall.SysProcAttr{} + } - // The paths, if not /dev/null, must be either in the tasks root directory - // or in the shared alloc directory. - StdoutFile string - StdinFile string - StderrFile string - - Chroot string -} - -// Whether to start the user command or abort. -type TaskStart bool - -func (c *SpawnDaemonCommand) Run(args []string) int { - flags := c.Meta.FlagSet("spawn-daemon", FlagSetClient) - flags.Usage = func() { c.Ui.Output(c.Help()) } - - if err := flags.Parse(args); err != nil { - return 1 - } - - // Check that we got json input. - args = flags.Args() - if len(args) != 1 { - c.Ui.Error(c.Help()) - return 1 - } - jsonInput, err := strconv.Unquote(args[0]) - if err != nil { - return c.outputStartStatus(fmt.Errorf("Failed to unquote json input: %v", err), 1) - } - - // De-serialize the passed command. - var cmd DaemonConfig - dec := json.NewDecoder(strings.NewReader(jsonInput)) - if err := dec.Decode(&cmd); err != nil { - return c.outputStartStatus(err, 1) - } - - // Isolate the user process. - if _, err := syscall.Setsid(); err != nil { - return c.outputStartStatus(fmt.Errorf("Failed setting sid: %v", err), 1) - } - - syscall.Umask(0) - - // Redirect logs. - stdo, err := os.OpenFile(cmd.StdoutFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666) - if err != nil { - return c.outputStartStatus(fmt.Errorf("Error opening file to redirect Stdout: %v", err), 1) - } - - stde, err := os.OpenFile(cmd.StderrFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666) - if err != nil { - return c.outputStartStatus(fmt.Errorf("Error opening file to redirect Stderr: %v", err), 1) - } - - stdi, err := os.OpenFile(cmd.StdinFile, os.O_CREATE|os.O_RDONLY, 0666) - if err != nil { - return c.outputStartStatus(fmt.Errorf("Error opening file to redirect Stdin: %v", err), 1) - } - - cmd.Cmd.Stdout = stdo - cmd.Cmd.Stderr = stde - cmd.Cmd.Stdin = stdi - - // Chroot jail the process and set its working directory. - if cmd.Cmd.SysProcAttr == nil { - cmd.Cmd.SysProcAttr = &syscall.SysProcAttr{} - } - - cmd.Cmd.SysProcAttr.Chroot = cmd.Chroot - cmd.Cmd.Dir = "/" - - // Wait to get the start command. 
- var start TaskStart - dec = json.NewDecoder(os.Stdin) - if err := dec.Decode(&start); err != nil { - return c.outputStartStatus(err, 1) - } - - if !start { - return 0 - } - - // Spawn the user process. - if err := cmd.Cmd.Start(); err != nil { - return c.outputStartStatus(fmt.Errorf("Error starting user command: %v", err), 1) - } - - // Indicate that the command was started successfully. - c.outputStartStatus(nil, 0) - - // Wait and then output the exit status. - if err := cmd.Wait(); err != nil { - return 1 - } - - return 0 + c.config.Cmd.SysProcAttr.Chroot = c.config.Chroot + c.config.Cmd.Dir = "/" + } } diff --git a/command/spawn_daemon_test.go b/command/spawn_daemon_test.go new file mode 100644 index 000000000..5bfd6ad5a --- /dev/null +++ b/command/spawn_daemon_test.go @@ -0,0 +1,48 @@ +package command + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "os/exec" + "testing" +) + +type nopCloser struct { + io.ReadWriter +} + +func (n *nopCloser) Close() error { + return nil +} + +func TestSpawnDaemon_WriteExitStatus(t *testing.T) { + // Check if there is python. + path, err := exec.LookPath("python") + if err != nil { + t.Skip("python not detected") + } + + var b bytes.Buffer + daemon := &SpawnDaemonCommand{exitFile: &nopCloser{&b}} + + code := 3 + cmd := exec.Command(path, "./test-resources/exiter.py", fmt.Sprintf("%d", code)) + err = cmd.Run() + actual := daemon.writeExitStatus(err) + if actual != code { + t.Fatalf("writeExitStatus(%v) returned %v; want %v", err, actual, code) + } + + // De-serialize the passed command. + var exitStatus SpawnExitStatus + dec := json.NewDecoder(&b) + if err := dec.Decode(&exitStatus); err != nil { + t.Fatalf("failed to decode exit status: %v", err) + } + + if exitStatus.ExitCode != code { + t.Fatalf("writeExitStatus(%v) wrote exit status %v; want %v", err, exitStatus.ExitCode, code) + } +} diff --git a/command/spawn_daemon_universal.go b/command/spawn_daemon_universal.go deleted file mode 100644 index 5083af5f3..000000000 --- a/command/spawn_daemon_universal.go +++ /dev/null @@ -1,9 +0,0 @@ -// +build !linux - -package command - -import "errors" - -func (c *SpawnDaemonCommand) Run(args []string) int { - return c.outputStartStatus(errors.New("spawn-daemon not supported"), 1) -} diff --git a/command/spawn_daemon_unix.go b/command/spawn_daemon_unix.go new file mode 100644 index 000000000..981e52596 --- /dev/null +++ b/command/spawn_daemon_unix.go @@ -0,0 +1,16 @@ +// +build !windows + +package command + +import "syscall" + +// isolateCmd sets the session id for the process and the umask. +func (c *SpawnDaemonCommand) isolateCmd() error { + if c.config.Cmd.SysProcAttr == nil { + c.config.Cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + c.config.Cmd.SysProcAttr.Setsid = true + syscall.Umask(0) + return nil +} diff --git a/command/spawn_daemon_windows.go b/command/spawn_daemon_windows.go new file mode 100644 index 000000000..bb2d63ed8 --- /dev/null +++ b/command/spawn_daemon_windows.go @@ -0,0 +1,7 @@ +// build !linux !darwin + +package command + +// No isolation on Windows. 
+func (c *SpawnDaemonCommand) isolateCmd() error { return nil } +func (c *SpawnDaemonCommand) configureChroot() {} diff --git a/command/test-resources/exiter.py b/command/test-resources/exiter.py new file mode 100644 index 000000000..90e66b98c --- /dev/null +++ b/command/test-resources/exiter.py @@ -0,0 +1,3 @@ +import sys + +sys.exit(int(sys.argv[1])) diff --git a/helper/discover/discover.go b/helper/discover/discover.go index d90ddb4cc..8582a0133 100644 --- a/helper/discover/discover.go +++ b/helper/discover/discover.go @@ -3,18 +3,21 @@ package discover import ( "fmt" "os" + "os/exec" "path/filepath" + "runtime" "github.com/kardianos/osext" ) -const ( - nomadExe = "nomad" -) - // Checks the current executable, then $GOPATH/bin, and finally the CWD, in that // order. If it can't be found, an error is returned. func NomadExecutable() (string, error) { + nomadExe := "nomad" + if runtime.GOOS == "windows" { + nomadExe = "nomad.exe" + } + // Check the current executable. bin, err := osext.Executable() if err != nil { @@ -25,6 +28,11 @@ func NomadExecutable() (string, error) { return bin, nil } + // Check the $PATH + if bin, err := exec.LookPath(nomadExe); err == nil { + return bin, nil + } + // Check the $GOPATH. bin = filepath.Join(os.Getenv("GOPATH"), "bin", nomadExe) if _, err := os.Stat(bin); err == nil { diff --git a/jobspec/parse.go b/jobspec/parse.go index f63ac5294..77f9b819f 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -124,7 +124,7 @@ func parseJob(result *structs.Job, obj *hclobj.Object) error { } } - // If we have tasks outside, do those + // If we have tasks outside, create TaskGroups for them if o := obj.Get("task", false); o != nil { var tasks []*structs.Task if err := parseTasks(&tasks, o); err != nil { @@ -134,9 +134,10 @@ func parseJob(result *structs.Job, obj *hclobj.Object) error { result.TaskGroups = make([]*structs.TaskGroup, len(tasks), len(tasks)*2) for i, t := range tasks { result.TaskGroups[i] = &structs.TaskGroup{ - Name: t.Name, - Count: 1, - Tasks: []*structs.Task{t}, + Name: t.Name, + Count: 1, + Tasks: []*structs.Task{t}, + RestartPolicy: structs.NewRestartPolicy(result.Type), } } } @@ -180,6 +181,7 @@ func parseGroups(result *structs.Job, obj *hclobj.Object) error { delete(m, "constraint") delete(m, "meta") delete(m, "task") + delete(m, "restart") // Default count to 1 if not specified if _, ok := m["count"]; !ok { @@ -199,6 +201,11 @@ func parseGroups(result *structs.Job, obj *hclobj.Object) error { return err } } + g.RestartPolicy = structs.NewRestartPolicy(result.Type) + + if err := parseRestartPolicy(g.RestartPolicy, o); err != nil { + return err + } // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. 
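The group-level restart block wired up above is decoded by parseRestartPolicy in the next hunk, and toDuration accepts either a duration string ("5m") or a bare integer treated as seconds. As a rough sketch only, with field names taken from the structs.RestartPolicy values used elsewhere in this diff, the stanza added to command/init.go decodes to approximately:

    package main

    import (
        "fmt"
        "time"

        "github.com/hashicorp/nomad/nomad/structs"
    )

    func main() {
        // restart { interval = "5m"; attempts = 10; delay = "25s" } from the
        // generated example job should yield roughly this policy.
        policy := &structs.RestartPolicy{
            Interval: 5 * time.Minute,
            Attempts: 10,
            Delay:    25 * time.Second,
        }
        fmt.Printf("%+v\n", policy)
    }

A group that omits the block simply keeps the defaults from structs.NewRestartPolicy(result.Type).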
@@ -228,6 +235,42 @@ func parseGroups(result *structs.Job, obj *hclobj.Object) error { return nil } +func parseRestartPolicy(result *structs.RestartPolicy, obj *hclobj.Object) error { + var restartHclObj *hclobj.Object + var m map[string]interface{} + if restartHclObj = obj.Get("restart", false); restartHclObj == nil { + return nil + } + if err := hcl.DecodeObject(&m, restartHclObj); err != nil { + return err + } + + if delay, ok := m["delay"]; ok { + d, err := toDuration(delay) + if err != nil { + return fmt.Errorf("Invalid Delay time in restart policy: %v", err) + } + result.Delay = d + } + + if interval, ok := m["interval"]; ok { + i, err := toDuration(interval) + if err != nil { + return fmt.Errorf("Invalid Interval time in restart policy: %v", err) + } + result.Interval = i + } + + if attempts, ok := m["attempts"]; ok { + a, err := toInteger(attempts) + if err != nil { + return fmt.Errorf("Invalid value in attempts: %v", err) + } + result.Attempts = a + } + return nil +} + func parseConstraints(result *[]*structs.Constraint, obj *hclobj.Object) error { for _, o := range obj.Elem(false) { var m map[string]interface{} @@ -455,19 +498,11 @@ func parseUpdate(result *structs.UpdateStrategy, obj *hclobj.Object) error { } for _, key := range []string{"stagger", "Stagger"} { if raw, ok := m[key]; ok { - switch v := raw.(type) { - case string: - dur, err := time.ParseDuration(v) - if err != nil { - return fmt.Errorf("invalid stagger time '%s'", raw) - } - m[key] = dur - case int: - m[key] = time.Duration(v) * time.Second - default: - return fmt.Errorf("invalid type for stagger time '%s'", - raw) + staggerTime, err := toDuration(raw) + if err != nil { + return fmt.Errorf("Invalid stagger time: %v", err) } + m[key] = staggerTime } } @@ -477,3 +512,35 @@ func parseUpdate(result *structs.UpdateStrategy, obj *hclobj.Object) error { } return nil } + +func toDuration(value interface{}) (time.Duration, error) { + var dur time.Duration + var err error + switch v := value.(type) { + case string: + dur, err = time.ParseDuration(v) + case int: + dur = time.Duration(v) * time.Second + default: + err = fmt.Errorf("Invalid time %s", value) + } + + return dur, err +} + +func toInteger(value interface{}) (int, error) { + var integer int + var err error + switch v := value.(type) { + case string: + var i int64 + i, err = strconv.ParseInt(v, 10, 32) + integer = int(i) + case int: + integer = v + default: + err = fmt.Errorf("Value: %v can't be parsed into int", value) + } + + return integer, err +} diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index f91789ddb..e785443b7 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -48,6 +48,11 @@ func TestParse(t *testing.T) { &structs.TaskGroup{ Name: "outside", Count: 1, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 2, + Interval: 1 * time.Minute, + Delay: 15 * time.Second, + }, Tasks: []*structs.Task{ &structs.Task{ Name: "outside", @@ -77,6 +82,11 @@ func TestParse(t *testing.T) { "elb_interval": "10", "elb_checks": "3", }, + RestartPolicy: &structs.RestartPolicy{ + Interval: 10 * time.Minute, + Attempts: 5, + Delay: 15 * time.Second, + }, Tasks: []*structs.Task{ &structs.Task{ Name: "binstore", diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 941272b2d..bf81a6ae7 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -31,6 +31,11 @@ job "binstore-storagelocker" { group "binsl" { count = 5 + restart { + attempts = 5 + interval = "10m" + delay = "15s" + } task "binstore" 
{ driver = "docker" config { diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go index 53b630480..c07d5549d 100644 --- a/nomad/alloc_endpoint.go +++ b/nomad/alloc_endpoint.go @@ -5,6 +5,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" ) // Alloc endpoint is used for manipulating allocations @@ -19,35 +20,45 @@ func (a *Alloc) List(args *structs.AllocListRequest, reply *structs.AllocListRes } defer metrics.MeasureSince([]string{"nomad", "alloc", "list"}, time.Now()) - // Capture all the allocations - snap, err := a.srv.fsm.State().Snapshot() - if err != nil { - return err - } - iter, err := snap.Allocs() - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Table: "allocs"}), + run: func() error { + // Capture all the allocations + snap, err := a.srv.fsm.State().Snapshot() + if err != nil { + return err + } + iter, err := snap.Allocs() + if err != nil { + return err + } - for { - raw := iter.Next() - if raw == nil { - break - } - alloc := raw.(*structs.Allocation) - reply.Allocations = append(reply.Allocations, alloc.Stub()) - } + var allocs []*structs.AllocListStub + for { + raw := iter.Next() + if raw == nil { + break + } + alloc := raw.(*structs.Allocation) + allocs = append(allocs, alloc.Stub()) + } + reply.Allocations = allocs - // Use the last index that affected the jobs table - index, err := snap.Index("allocs") - if err != nil { - return err - } - reply.Index = index + // Use the last index that affected the jobs table + index, err := snap.Index("allocs") + if err != nil { + return err + } + reply.Index = index - // Set the query response - a.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + a.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return a.srv.blockingRPC(&opts) } // GetAlloc is used to lookup a particular allocation @@ -58,30 +69,38 @@ func (a *Alloc) GetAlloc(args *structs.AllocSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "alloc", "get_alloc"}, time.Now()) - // Lookup the allocation - snap, err := a.srv.fsm.State().Snapshot() - if err != nil { - return err - } - out, err := snap.AllocByID(args.AllocID) - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Alloc: args.AllocID}), + run: func() error { + // Lookup the allocation + snap, err := a.srv.fsm.State().Snapshot() + if err != nil { + return err + } + out, err := snap.AllocByID(args.AllocID) + if err != nil { + return err + } - // Setup the output - if out != nil { - reply.Alloc = out - reply.Index = out.ModifyIndex - } else { - // Use the last index that affected the nodes table - index, err := snap.Index("allocs") - if err != nil { - return err - } - reply.Index = index - } + // Setup the output + reply.Alloc = out + if out != nil { + reply.Index = out.ModifyIndex + } else { + // Use the last index that affected the nodes table + index, err := snap.Index("allocs") + if err != nil { + return err + } + reply.Index = index + } - // Set the query response - a.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + a.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return a.srv.blockingRPC(&opts) } diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index 
8076b64d6..bcab0a387 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -3,6 +3,7 @@ package nomad import ( "reflect" "testing" + "time" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/nomad/mock" @@ -44,6 +45,74 @@ func TestAllocEndpoint_List(t *testing.T) { } } +func TestAllocEndpoint_List_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the alloc + alloc := mock.Alloc() + + // Upsert alloc triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpsertAllocs(2, []*structs.Allocation{alloc}); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req := &structs.AllocListRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 1, + }, + } + start := time.Now() + var resp structs.AllocListResponse + if err := msgpackrpc.CallWithCodec(codec, "Alloc.List", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 2 { + t.Fatalf("Bad index: %d %d", resp.Index, 2) + } + if len(resp.Allocations) != 1 || resp.Allocations[0].ID != alloc.ID { + t.Fatalf("bad: %#v", resp.Allocations) + } + + // Client updates trigger watches + alloc2 := mock.Alloc() + alloc2.ID = alloc.ID + alloc2.ClientStatus = structs.AllocClientStatusRunning + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpdateAllocFromClient(3, alloc2); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.MinQueryIndex = 2 + start = time.Now() + var resp2 structs.AllocListResponse + if err := msgpackrpc.CallWithCodec(codec, "Alloc.List", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) + } + if resp2.Index != 3 { + t.Fatalf("Bad index: %d %d", resp2.Index, 3) + } + if len(resp2.Allocations) != 1 || resp.Allocations[0].ID != alloc.ID || + resp2.Allocations[0].ClientStatus != structs.AllocClientStatusRunning { + t.Fatalf("bad: %#v", resp2.Allocations) + } +} + func TestAllocEndpoint_GetAlloc(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() @@ -75,3 +144,55 @@ func TestAllocEndpoint_GetAlloc(t *testing.T) { t.Fatalf("bad: %#v", resp.Alloc) } } + +func TestAllocEndpoint_GetAlloc_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the allocs + alloc1 := mock.Alloc() + alloc2 := mock.Alloc() + + // First create an unrelated alloc + time.AfterFunc(100*time.Millisecond, func() { + err := state.UpsertAllocs(100, []*structs.Allocation{alloc1}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Create the alloc we are watching later + time.AfterFunc(200*time.Millisecond, func() { + err := state.UpsertAllocs(200, []*structs.Allocation{alloc2}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Lookup the jobs + get := &structs.AllocSpecificRequest{ + AllocID: alloc2.ID, + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + var resp structs.SingleAllocResponse + start := time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Alloc.GetAlloc", get, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := 
time.Since(start); elapsed < 200*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 200 { + t.Fatalf("Bad index: %d %d", resp.Index, 200) + } + if resp.Alloc == nil || resp.Alloc.ID != alloc2.ID { + t.Fatalf("bad: %#v", resp.Alloc) + } +} diff --git a/nomad/eval_endpoint.go b/nomad/eval_endpoint.go index 0dce98a52..bc74e85f3 100644 --- a/nomad/eval_endpoint.go +++ b/nomad/eval_endpoint.go @@ -6,6 +6,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" ) const ( @@ -26,32 +27,40 @@ func (e *Eval) GetEval(args *structs.EvalSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "eval", "get_eval"}, time.Now()) - // Look for the job - snap, err := e.srv.fsm.State().Snapshot() - if err != nil { - return err - } - out, err := snap.EvalByID(args.EvalID) - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Eval: args.EvalID}), + run: func() error { + // Look for the job + snap, err := e.srv.fsm.State().Snapshot() + if err != nil { + return err + } + out, err := snap.EvalByID(args.EvalID) + if err != nil { + return err + } - // Setup the output - if out != nil { - reply.Eval = out - reply.Index = out.ModifyIndex - } else { - // Use the last index that affected the nodes table - index, err := snap.Index("evals") - if err != nil { - return err - } - reply.Index = index - } + // Setup the output + reply.Eval = out + if out != nil { + reply.Index = out.ModifyIndex + } else { + // Use the last index that affected the nodes table + index, err := snap.Index("evals") + if err != nil { + return err + } + reply.Index = index + } - // Set the query response - e.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + e.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return e.srv.blockingRPC(&opts) } // Dequeue is used to dequeue a pending evaluation @@ -219,35 +228,45 @@ func (e *Eval) List(args *structs.EvalListRequest, } defer metrics.MeasureSince([]string{"nomad", "eval", "list"}, time.Now()) - // Scan all the evaluations - snap, err := e.srv.fsm.State().Snapshot() - if err != nil { - return err - } - iter, err := snap.Evals() - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Table: "evals"}), + run: func() error { + // Scan all the evaluations + snap, err := e.srv.fsm.State().Snapshot() + if err != nil { + return err + } + iter, err := snap.Evals() + if err != nil { + return err + } - for { - raw := iter.Next() - if raw == nil { - break - } - eval := raw.(*structs.Evaluation) - reply.Evaluations = append(reply.Evaluations, eval) - } + var evals []*structs.Evaluation + for { + raw := iter.Next() + if raw == nil { + break + } + eval := raw.(*structs.Evaluation) + evals = append(evals, eval) + } + reply.Evaluations = evals - // Use the last index that affected the jobs table - index, err := snap.Index("evals") - if err != nil { - return err - } - reply.Index = index + // Use the last index that affected the jobs table + index, err := snap.Index("evals") + if err != nil { + return err + } + reply.Index = index - // Set the query response - e.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + e.srv.setQueryMeta(&reply.QueryMeta) + return nil + 
}} + return e.srv.blockingRPC(&opts) } // Allocations is used to list the allocations for an evaluation @@ -258,32 +277,40 @@ func (e *Eval) Allocations(args *structs.EvalSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "eval", "allocations"}, time.Now()) - // Capture the allocations - snap, err := e.srv.fsm.State().Snapshot() - if err != nil { - return err - } - allocs, err := snap.AllocsByEval(args.EvalID) - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{AllocEval: args.EvalID}), + run: func() error { + // Capture the allocations + snap, err := e.srv.fsm.State().Snapshot() + if err != nil { + return err + } + allocs, err := snap.AllocsByEval(args.EvalID) + if err != nil { + return err + } - // Convert to a stub - if len(allocs) > 0 { - reply.Allocations = make([]*structs.AllocListStub, 0, len(allocs)) - for _, alloc := range allocs { - reply.Allocations = append(reply.Allocations, alloc.Stub()) - } - } + // Convert to a stub + if len(allocs) > 0 { + reply.Allocations = make([]*structs.AllocListStub, 0, len(allocs)) + for _, alloc := range allocs { + reply.Allocations = append(reply.Allocations, alloc.Stub()) + } + } - // Use the last index that affected the allocs table - index, err := snap.Index("allocs") - if err != nil { - return err - } - reply.Index = index + // Use the last index that affected the allocs table + index, err := snap.Index("allocs") + if err != nil { + return err + } + reply.Index = index - // Set the query response - e.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + e.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return e.srv.blockingRPC(&opts) } diff --git a/nomad/eval_endpoint_test.go b/nomad/eval_endpoint_test.go index eb61ea3d0..55782a031 100644 --- a/nomad/eval_endpoint_test.go +++ b/nomad/eval_endpoint_test.go @@ -51,6 +51,83 @@ func TestEvalEndpoint_GetEval(t *testing.T) { } } +func TestEvalEndpoint_GetEval_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the evals + eval1 := mock.Eval() + eval2 := mock.Eval() + + // First create an unrelated eval + time.AfterFunc(100*time.Millisecond, func() { + err := state.UpsertEvals(100, []*structs.Evaluation{eval1}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Upsert the eval we are watching later + time.AfterFunc(200*time.Millisecond, func() { + err := state.UpsertEvals(200, []*structs.Evaluation{eval2}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Lookup the eval + req := &structs.EvalSpecificRequest{ + EvalID: eval2.ID, + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + var resp structs.SingleEvalResponse + start := time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Eval.GetEval", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 200*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 200 { + t.Fatalf("Bad index: %d %d", resp.Index, 200) + } + if resp.Eval == nil || resp.Eval.ID != eval2.ID { + t.Fatalf("bad: %#v", resp.Eval) + } + + // Eval delete triggers watches + time.AfterFunc(100*time.Millisecond, func() { + err := state.DeleteEval(300, []string{eval2.ID}, []string{}) + if err != nil { + t.Fatalf("err: %v", err) + 
} + }) + + req.QueryOptions.MinQueryIndex = 250 + var resp2 structs.SingleEvalResponse + start = time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Eval.GetEval", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) + } + if resp2.Index != 300 { + t.Fatalf("Bad index: %d %d", resp2.Index, 300) + } + if resp2.Eval != nil { + t.Fatalf("bad: %#v", resp2.Eval) + } +} + func TestEvalEndpoint_Dequeue(t *testing.T) { s1 := testServer(t, func(c *Config) { c.NumSchedulers = 0 // Prevent automatic dequeue @@ -334,6 +411,70 @@ func TestEvalEndpoint_List(t *testing.T) { } } +func TestEvalEndpoint_List_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the ieval + eval := mock.Eval() + + // Upsert eval triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpsertEvals(2, []*structs.Evaluation{eval}); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req := &structs.EvalListRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 1, + }, + } + start := time.Now() + var resp structs.EvalListResponse + if err := msgpackrpc.CallWithCodec(codec, "Eval.List", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 2 { + t.Fatalf("Bad index: %d %d", resp.Index, 2) + } + if len(resp.Evaluations) != 1 || resp.Evaluations[0].ID != eval.ID { + t.Fatalf("bad: %#v", resp.Evaluations) + } + + // Eval deletion triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.DeleteEval(3, []string{eval.ID}, nil); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.MinQueryIndex = 2 + start = time.Now() + var resp2 structs.EvalListResponse + if err := msgpackrpc.CallWithCodec(codec, "Eval.List", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) + } + if resp2.Index != 3 { + t.Fatalf("Bad index: %d %d", resp2.Index, 3) + } + if len(resp2.Evaluations) != 0 { + t.Fatalf("bad: %#v", resp2.Evaluations) + } +} + func TestEvalEndpoint_Allocations(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() @@ -368,3 +509,55 @@ func TestEvalEndpoint_Allocations(t *testing.T) { t.Fatalf("bad: %#v", resp.Allocations) } } + +func TestEvalEndpoint_Allocations_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the allocs + alloc1 := mock.Alloc() + alloc2 := mock.Alloc() + + // Upsert an unrelated alloc first + time.AfterFunc(100*time.Millisecond, func() { + err := state.UpsertAllocs(100, []*structs.Allocation{alloc1}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Upsert an alloc which will trigger the watch later + time.AfterFunc(200*time.Millisecond, func() { + err := state.UpsertAllocs(200, []*structs.Allocation{alloc2}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Lookup the eval + get := &structs.EvalSpecificRequest{ + EvalID: alloc2.EvalID, + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + var 
resp structs.EvalAllocationsResponse + start := time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Eval.Allocations", get, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 200*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 200 { + t.Fatalf("Bad index: %d %d", resp.Index, 200) + } + if len(resp.Allocations) != 1 || resp.Allocations[0].ID != alloc2.ID { + t.Fatalf("bad: %#v", resp.Allocations) + } +} diff --git a/nomad/job_endpoint.go b/nomad/job_endpoint.go index 63d31eb3c..e961428e4 100644 --- a/nomad/job_endpoint.go +++ b/nomad/job_endpoint.go @@ -6,6 +6,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" ) // Job endpoint is used for job interactions @@ -180,32 +181,41 @@ func (j *Job) GetJob(args *structs.JobSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "job", "get_job"}, time.Now()) - // Look for the job - snap, err := j.srv.fsm.State().Snapshot() - if err != nil { - return err - } - out, err := snap.JobByID(args.JobID) - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Job: args.JobID}), + run: func() error { - // Setup the output - if out != nil { - reply.Job = out - reply.Index = out.ModifyIndex - } else { - // Use the last index that affected the nodes table - index, err := snap.Index("jobs") - if err != nil { - return err - } - reply.Index = index - } + // Look for the job + snap, err := j.srv.fsm.State().Snapshot() + if err != nil { + return err + } + out, err := snap.JobByID(args.JobID) + if err != nil { + return err + } - // Set the query response - j.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Setup the output + reply.Job = out + if out != nil { + reply.Index = out.ModifyIndex + } else { + // Use the last index that affected the nodes table + index, err := snap.Index("jobs") + if err != nil { + return err + } + reply.Index = index + } + + // Set the query response + j.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return j.srv.blockingRPC(&opts) } // List is used to list the jobs registered in the system @@ -216,35 +226,45 @@ func (j *Job) List(args *structs.JobListRequest, } defer metrics.MeasureSince([]string{"nomad", "job", "list"}, time.Now()) - // Capture all the jobs - snap, err := j.srv.fsm.State().Snapshot() - if err != nil { - return err - } - iter, err := snap.Jobs() - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Table: "jobs"}), + run: func() error { + // Capture all the jobs + snap, err := j.srv.fsm.State().Snapshot() + if err != nil { + return err + } + iter, err := snap.Jobs() + if err != nil { + return err + } - for { - raw := iter.Next() - if raw == nil { - break - } - job := raw.(*structs.Job) - reply.Jobs = append(reply.Jobs, job.Stub()) - } + var jobs []*structs.JobListStub + for { + raw := iter.Next() + if raw == nil { + break + } + job := raw.(*structs.Job) + jobs = append(jobs, job.Stub()) + } + reply.Jobs = jobs - // Use the last index that affected the jobs table - index, err := snap.Index("jobs") - if err != nil { - return err - } - reply.Index = index + // Use the last index that affected the jobs table + index, err := snap.Index("jobs") + if err != 
nil { + return err + } + reply.Index = index - // Set the query response - j.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + j.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return j.srv.blockingRPC(&opts) } // Allocations is used to list the allocations for a job @@ -255,34 +275,43 @@ func (j *Job) Allocations(args *structs.JobSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "job", "allocations"}, time.Now()) - // Capture the allocations - snap, err := j.srv.fsm.State().Snapshot() - if err != nil { - return err - } - allocs, err := snap.AllocsByJob(args.JobID) - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{AllocJob: args.JobID}), + run: func() error { + // Capture the allocations + snap, err := j.srv.fsm.State().Snapshot() + if err != nil { + return err + } + allocs, err := snap.AllocsByJob(args.JobID) + if err != nil { + return err + } - // Convert to stubs - if len(allocs) > 0 { - reply.Allocations = make([]*structs.AllocListStub, 0, len(allocs)) - for _, alloc := range allocs { - reply.Allocations = append(reply.Allocations, alloc.Stub()) - } - } + // Convert to stubs + if len(allocs) > 0 { + reply.Allocations = make([]*structs.AllocListStub, 0, len(allocs)) + for _, alloc := range allocs { + reply.Allocations = append(reply.Allocations, alloc.Stub()) + } + } - // Use the last index that affected the allocs table - index, err := snap.Index("allocs") - if err != nil { - return err - } - reply.Index = index + // Use the last index that affected the allocs table + index, err := snap.Index("allocs") + if err != nil { + return err + } + reply.Index = index - // Set the query response - j.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + j.srv.setQueryMeta(&reply.QueryMeta) + return nil + + }} + return j.srv.blockingRPC(&opts) } // Evaluations is used to list the evaluations for a job diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index e43ed3ba2..c12e5b463 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -3,6 +3,7 @@ package nomad import ( "reflect" "testing" + "time" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/nomad/mock" @@ -363,6 +364,80 @@ func TestJobEndpoint_GetJob(t *testing.T) { } } +func TestJobEndpoint_GetJob_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the jobs + job1 := mock.Job() + job2 := mock.Job() + + // Upsert a job we are not interested in first. + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpsertJob(100, job1); err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Upsert another job later which should trigger the watch. 
+ time.AfterFunc(200*time.Millisecond, func() { + if err := state.UpsertJob(200, job2); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req := &structs.JobSpecificRequest{ + JobID: job2.ID, + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + start := time.Now() + var resp structs.SingleJobResponse + if err := msgpackrpc.CallWithCodec(codec, "Job.GetJob", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 200*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 200 { + t.Fatalf("Bad index: %d %d", resp.Index, 200) + } + if resp.Job == nil || resp.Job.ID != job2.ID { + t.Fatalf("bad: %#v", resp.Job) + } + + // Job delete fires watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.DeleteJob(300, job2.ID); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.QueryOptions.MinQueryIndex = 250 + start = time.Now() + + var resp2 structs.SingleJobResponse + if err := msgpackrpc.CallWithCodec(codec, "Job.GetJob", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) + } + if resp2.Index != 300 { + t.Fatalf("Bad index: %d %d", resp2.Index, 300) + } + if resp2.Job != nil { + t.Fatalf("bad: %#v", resp2.Job) + } +} + func TestJobEndpoint_ListJobs(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() @@ -397,6 +472,70 @@ func TestJobEndpoint_ListJobs(t *testing.T) { } } +func TestJobEndpoint_ListJobs_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the job + job := mock.Job() + + // Upsert job triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpsertJob(100, job); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req := &structs.JobListRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + start := time.Now() + var resp structs.JobListResponse + if err := msgpackrpc.CallWithCodec(codec, "Job.List", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 100 { + t.Fatalf("Bad index: %d %d", resp.Index, 100) + } + if len(resp.Jobs) != 1 || resp.Jobs[0].ID != job.ID { + t.Fatalf("bad: %#v", resp.Jobs) + } + + // Job deletion triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.DeleteJob(200, job.ID); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.MinQueryIndex = 150 + start = time.Now() + var resp2 structs.JobListResponse + if err := msgpackrpc.CallWithCodec(codec, "Job.List", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) + } + if resp2.Index != 200 { + t.Fatalf("Bad index: %d %d", resp2.Index, 200) + } + if len(resp2.Jobs) != 0 { + t.Fatalf("bad: %#v", resp2.Jobs) + } +} + func TestJobEndpoint_Allocations(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() @@ -432,6 +571,59 @@ func TestJobEndpoint_Allocations(t *testing.T) { } } +func TestJobEndpoint_Allocations_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + codec := 
rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the register request + alloc1 := mock.Alloc() + alloc2 := mock.Alloc() + alloc2.JobID = "job1" + state := s1.fsm.State() + + // First upsert an unrelated alloc + time.AfterFunc(100*time.Millisecond, func() { + err := state.UpsertAllocs(100, []*structs.Allocation{alloc1}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Upsert an alloc for the job we are interested in later + time.AfterFunc(200*time.Millisecond, func() { + err := state.UpsertAllocs(200, []*structs.Allocation{alloc2}) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Lookup the jobs + get := &structs.JobSpecificRequest{ + JobID: "job1", + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + var resp structs.JobAllocationsResponse + start := time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Job.Allocations", get, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 200*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 200 { + t.Fatalf("Bad index: %d %d", resp.Index, 200) + } + if len(resp.Allocations) != 1 || resp.Allocations[0].JobID != "job1" { + t.Fatalf("bad: %#v", resp.Allocations) + } +} + func TestJobEndpoint_Evaluations(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 87c426dce..329ecd872 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -1,6 +1,9 @@ package mock -import "github.com/hashicorp/nomad/nomad/structs" +import ( + "github.com/hashicorp/nomad/nomad/structs" + "time" +) func Node() *structs.Node { node := &structs.Node{ @@ -71,6 +74,11 @@ func Job() *structs.Job { &structs.TaskGroup{ Name: "web", Count: 10, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + }, Tasks: []*structs.Task{ &structs.Task{ Name: "web", @@ -131,6 +139,11 @@ func SystemJob() *structs.Job { &structs.TaskGroup{ Name: "web", Count: 1, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + }, Tasks: []*structs.Task{ &structs.Task{ Name: "web", diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 9ce14aadd..5bd600380 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -6,6 +6,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" ) // Node endpoint is used for client interactions @@ -282,37 +283,45 @@ func (n *Node) GetNode(args *structs.NodeSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) - // Verify the arguments - if args.NodeID == "" { - return fmt.Errorf("missing node ID") - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Node: args.NodeID}), + run: func() error { + // Verify the arguments + if args.NodeID == "" { + return fmt.Errorf("missing node ID") + } - // Look for the node - snap, err := n.srv.fsm.State().Snapshot() - if err != nil { - return err - } - out, err := snap.NodeByID(args.NodeID) - if err != nil { - return err - } + // Look for the node + snap, err := n.srv.fsm.State().Snapshot() + if err != nil { + return err + } + out, err := snap.NodeByID(args.NodeID) + if err != nil { + return err + } - // Setup the output - if out != nil { - 
reply.Node = out - reply.Index = out.ModifyIndex - } else { - // Use the last index that affected the nodes table - index, err := snap.Index("nodes") - if err != nil { - return err - } - reply.Index = index - } + // Setup the output + reply.Node = out + if out != nil { + reply.Index = out.ModifyIndex + } else { + // Use the last index that affected the nodes table + index, err := snap.Index("nodes") + if err != nil { + return err + } + reply.Index = index + } - // Set the query response - n.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + n.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return n.srv.blockingRPC(&opts) } // GetAllocs is used to request allocations for a specific node @@ -330,9 +339,9 @@ func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, // Setup the blocking query opts := blockingOptions{ - queryOpts: &args.QueryOptions, - queryMeta: &reply.QueryMeta, - allocWatch: args.NodeID, + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{AllocNode: args.NodeID}), run: func() error { // Look for the node snap, err := n.srv.fsm.State().Snapshot() @@ -404,35 +413,45 @@ func (n *Node) List(args *structs.NodeListRequest, } defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now()) - // Capture all the nodes - snap, err := n.srv.fsm.State().Snapshot() - if err != nil { - return err - } - iter, err := snap.Nodes() - if err != nil { - return err - } + // Setup the blocking query + opts := blockingOptions{ + queryOpts: &args.QueryOptions, + queryMeta: &reply.QueryMeta, + watch: watch.NewItems(watch.Item{Table: "nodes"}), + run: func() error { + // Capture all the nodes + snap, err := n.srv.fsm.State().Snapshot() + if err != nil { + return err + } + iter, err := snap.Nodes() + if err != nil { + return err + } - for { - raw := iter.Next() - if raw == nil { - break - } - node := raw.(*structs.Node) - reply.Nodes = append(reply.Nodes, node.Stub()) - } + var nodes []*structs.NodeListStub + for { + raw := iter.Next() + if raw == nil { + break + } + node := raw.(*structs.Node) + nodes = append(nodes, node.Stub()) + } + reply.Nodes = nodes - // Use the last index that affected the jobs table - index, err := snap.Index("nodes") - if err != nil { - return err - } - reply.Index = index + // Use the last index that affected the jobs table + index, err := snap.Index("nodes") + if err != nil { + return err + } + reply.Index = index - // Set the query response - n.srv.setQueryMeta(&reply.QueryMeta) - return nil + // Set the query response + n.srv.setQueryMeta(&reply.QueryMeta) + return nil + }} + return n.srv.blockingRPC(&opts) } // createNodeEvals is used to create evaluations for each alloc on a node. diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 62f4a4959..74b154655 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -371,6 +371,107 @@ func TestClientEndpoint_GetNode(t *testing.T) { } } +func TestClientEndpoint_GetNode_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the node + node1 := mock.Node() + node2 := mock.Node() + + // First create an unrelated node. 
+ time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpsertNode(100, node1); err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Upsert the node we are watching later + time.AfterFunc(200*time.Millisecond, func() { + if err := state.UpsertNode(200, node2); err != nil { + t.Fatalf("err: %v", err) + } + }) + + // Lookup the node + req := &structs.NodeSpecificRequest{ + NodeID: node2.ID, + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 50, + }, + } + var resp structs.SingleNodeResponse + start := time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Node.GetNode", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 200*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 200 { + t.Fatalf("Bad index: %d %d", resp.Index, 200) + } + if resp.Node == nil || resp.Node.ID != node2.ID { + t.Fatalf("bad: %#v", resp.Node) + } + + // Node update triggers watches + time.AfterFunc(100*time.Millisecond, func() { + nodeUpdate := mock.Node() + nodeUpdate.ID = node2.ID + nodeUpdate.Status = structs.NodeStatusDown + if err := state.UpsertNode(300, nodeUpdate); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.QueryOptions.MinQueryIndex = 250 + var resp2 structs.SingleNodeResponse + start = time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Node.GetNode", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp2.Index != 300 { + t.Fatalf("Bad index: %d %d", resp2.Index, 300) + } + if resp2.Node == nil || resp2.Node.Status != structs.NodeStatusDown { + t.Fatalf("bad: %#v", resp2.Node) + } + + // Node delete triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.DeleteNode(400, node2.ID); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.QueryOptions.MinQueryIndex = 350 + var resp3 structs.SingleNodeResponse + start = time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Node.GetNode", req, &resp3); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp3.Index != 400 { + t.Fatalf("Bad index: %d %d", resp2.Index, 400) + } + if resp3.Node != nil { + t.Fatalf("bad: %#v", resp3.Node) + } +} + func TestClientEndpoint_GetAllocs(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() @@ -457,16 +558,15 @@ func TestClientEndpoint_GetAllocs_Blocking(t *testing.T) { alloc.NodeID = node.ID state := s1.fsm.State() start := time.Now() - go func() { - time.Sleep(100 * time.Millisecond) + time.AfterFunc(100*time.Millisecond, func() { err := state.UpsertAllocs(100, []*structs.Allocation{alloc}) if err != nil { t.Fatalf("err: %v", err) } - }() + }) // Lookup the allocs in a blocking query - get := &structs.NodeSpecificRequest{ + req := &structs.NodeSpecificRequest{ NodeID: node.ID, QueryOptions: structs.QueryOptions{ Region: "global", @@ -475,7 +575,7 @@ func TestClientEndpoint_GetAllocs_Blocking(t *testing.T) { }, } var resp2 structs.NodeAllocsResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", get, &resp2); err != nil { + if err := msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", req, &resp2); err != nil { t.Fatalf("err: %v", err) } @@ -491,6 +591,34 @@ func TestClientEndpoint_GetAllocs_Blocking(t *testing.T) { if 
len(resp2.Allocs) != 1 || resp2.Allocs[0].ID != alloc.ID { t.Fatalf("bad: %#v", resp2.Allocs) } + + // Alloc updates fire watches + time.AfterFunc(100*time.Millisecond, func() { + allocUpdate := mock.Alloc() + allocUpdate.NodeID = alloc.NodeID + allocUpdate.ID = alloc.ID + allocUpdate.ClientStatus = structs.AllocClientStatusRunning + err := state.UpdateAllocFromClient(200, allocUpdate) + if err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.QueryOptions.MinQueryIndex = 150 + var resp3 structs.NodeAllocsResponse + if err := msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", req, &resp3); err != nil { + t.Fatalf("err: %v", err) + } + + if time.Since(start) < 100*time.Millisecond { + t.Fatalf("too fast") + } + if resp3.Index != 200 { + t.Fatalf("Bad index: %d %d", resp3.Index, 200) + } + if len(resp3.Allocs) != 1 || resp3.Allocs[0].ClientStatus != structs.AllocClientStatusRunning { + t.Fatalf("bad: %#v", resp3.Allocs[0]) + } } func TestClientEndpoint_UpdateAlloc(t *testing.T) { @@ -752,3 +880,115 @@ func TestClientEndpoint_ListNodes(t *testing.T) { t.Fatalf("bad: %#v", resp2.Nodes[0]) } } + +func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { + s1 := testServer(t, nil) + defer s1.Shutdown() + state := s1.fsm.State() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the node + node := mock.Node() + + // Node upsert triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpsertNode(2, node); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req := &structs.NodeListRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + MinQueryIndex: 1, + }, + } + start := time.Now() + var resp structs.NodeListResponse + if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp) + } + if resp.Index != 2 { + t.Fatalf("Bad index: %d %d", resp.Index, 2) + } + if len(resp.Nodes) != 1 || resp.Nodes[0].ID != node.ID { + t.Fatalf("bad: %#v", resp.Nodes) + } + + // Node drain updates trigger watches. 
+ time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpdateNodeDrain(3, node.ID, true); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.MinQueryIndex = 2 + var resp2 structs.NodeListResponse + start = time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp2); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) + } + if resp2.Index != 3 { + t.Fatalf("Bad index: %d %d", resp2.Index, 3) + } + if len(resp2.Nodes) != 1 || !resp2.Nodes[0].Drain { + t.Fatalf("bad: %#v", resp2.Nodes) + } + + // Node status update triggers watches + time.AfterFunc(100*time.Millisecond, func() { + if err := state.UpdateNodeStatus(4, node.ID, structs.NodeStatusDown); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.MinQueryIndex = 3 + var resp3 structs.NodeListResponse + start = time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp3); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp3) + } + if resp3.Index != 4 { + t.Fatalf("Bad index: %d %d", resp3.Index, 4) + } + if len(resp3.Nodes) != 1 || resp3.Nodes[0].Status != structs.NodeStatusDown { + t.Fatalf("bad: %#v", resp3.Nodes) + } + + // Node delete triggers watches. + time.AfterFunc(100*time.Millisecond, func() { + if err := state.DeleteNode(5, node.ID); err != nil { + t.Fatalf("err: %v", err) + } + }) + + req.MinQueryIndex = 4 + var resp4 structs.NodeListResponse + start = time.Now() + if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp4); err != nil { + t.Fatalf("err: %v", err) + } + + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { + t.Fatalf("should block (returned in %s) %#v", elapsed, resp4) + } + if resp4.Index != 5 { + t.Fatalf("Bad index: %d %d", resp4.Index, 5) + } + if len(resp4.Nodes) != 0 { + t.Fatalf("bad: %#v", resp4.Nodes) + } +} diff --git a/nomad/rpc.go b/nomad/rpc.go index 074dec0d6..21f9c9dc6 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -13,6 +13,7 @@ import ( "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" "github.com/hashicorp/raft" "github.com/hashicorp/yamux" ) @@ -268,10 +269,10 @@ func (s *Server) setQueryMeta(m *structs.QueryMeta) { // blockingOptions is used to parameterize blockingRPC type blockingOptions struct { - queryOpts *structs.QueryOptions - queryMeta *structs.QueryMeta - allocWatch string - run func() error + queryOpts *structs.QueryOptions + queryMeta *structs.QueryMeta + watch watch.Items + run func() error } // blockingRPC is used for queries that need to wait for a @@ -306,17 +307,13 @@ func (s *Server) blockingRPC(opts *blockingOptions) error { state = s.fsm.State() defer func() { timeout.Stop() - if opts.allocWatch != "" { - state.StopWatchAllocs(opts.allocWatch, notifyCh) - } + state.StopWatch(opts.watch, notifyCh) }() REGISTER_NOTIFY: // Register the notification channel. This may be done // multiple times if we have not reached the target wait index. 
- if opts.allocWatch != "" { - state.WatchAllocs(opts.allocWatch, notifyCh) - } + state.Watch(opts.watch, notifyCh) RUN_QUERY: // Update the query meta data @@ -327,7 +324,7 @@ RUN_QUERY: err := opts.run() // Check for minimum query time - if err == nil && opts.queryMeta.Index > 0 && opts.queryMeta.Index <= opts.queryOpts.MinQueryIndex { + if err == nil && opts.queryOpts.MinQueryIndex > 0 && opts.queryMeta.Index <= opts.queryOpts.MinQueryIndex { select { case <-notifyCh: goto REGISTER_NOTIFY diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 22487234b..30ee87259 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -8,8 +8,16 @@ import ( "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" ) +// IndexEntry is used with the "index" table +// for managing the latest Raft index affecting a table. +type IndexEntry struct { + Key string + Value uint64 +} + // The StateStore is responsible for maintaining all the Nomad // state. It is manipulated by the FSM which maintains consistency // through the use of Raft. The goals of the StateStore are to provide @@ -23,45 +31,6 @@ type StateStore struct { watch *stateWatch } -// StateSnapshot is used to provide a point-in-time snapshot -type StateSnapshot struct { - StateStore -} - -// StateRestore is used to optimize the performance when -// restoring state by only using a single large transaction -// instead of thousands of sub transactions -type StateRestore struct { - txn *memdb.Txn - watch *stateWatch - allocNodes map[string]struct{} -} - -// Abort is used to abort the restore operation -func (s *StateRestore) Abort() { - s.txn.Abort() -} - -// Commit is used to commit the restore operation -func (s *StateRestore) Commit() { - s.txn.Defer(func() { s.watch.notifyAllocs(s.allocNodes) }) - s.txn.Commit() -} - -// IndexEntry is used with the "index" table -// for managing the latest Raft index affecting a table. -type IndexEntry struct { - Key string - Value uint64 -} - -// stateWatch holds shared state for watching updates. This is -// outside of StateStore so it can be shared with snapshots. 
-type stateWatch struct { - allocs map[string]*NotifyGroup - allocLock sync.Mutex -} - // NewStateStore is used to create a new state store func NewStateStore(logOutput io.Writer) (*StateStore, error) { // Create the MemDB @@ -70,16 +39,11 @@ func NewStateStore(logOutput io.Writer) (*StateStore, error) { return nil, fmt.Errorf("state store setup failed: %v", err) } - // Create the watch entry - watch := &stateWatch{ - allocs: make(map[string]*NotifyGroup), - } - // Create the state store s := &StateStore{ logger: log.New(logOutput, "", log.LstdFlags), db: db, - watch: watch, + watch: newStateWatch(), } return s, nil } @@ -104,55 +68,21 @@ func (s *StateStore) Snapshot() (*StateSnapshot, error) { func (s *StateStore) Restore() (*StateRestore, error) { txn := s.db.Txn(true) r := &StateRestore{ - txn: txn, - watch: s.watch, - allocNodes: make(map[string]struct{}), + txn: txn, + watch: s.watch, + items: watch.NewItems(), } return r, nil } -// WatchAllocs is used to subscribe a channel to changes in allocations for a node -func (s *StateStore) WatchAllocs(node string, notify chan struct{}) { - s.watch.allocLock.Lock() - defer s.watch.allocLock.Unlock() - - // Check for an existing notify group - if grp, ok := s.watch.allocs[node]; ok { - grp.Wait(notify) - return - } - - // Create new notify group - grp := &NotifyGroup{} - grp.Wait(notify) - s.watch.allocs[node] = grp +// Watch subscribes a channel to a set of watch items. +func (s *StateStore) Watch(items watch.Items, notify chan struct{}) { + s.watch.watch(items, notify) } -// StopWatchAllocs is used to unsubscribe a channel from changes in allocations -func (s *StateStore) StopWatchAllocs(node string, notify chan struct{}) { - s.watch.allocLock.Lock() - defer s.watch.allocLock.Unlock() - - // Check for an existing notify group - if grp, ok := s.watch.allocs[node]; ok { - grp.Clear(notify) - if grp.Empty() { - delete(s.watch.allocs, node) - } - } -} - -// notifyAllocs is used to notify any node alloc listeners of a change -func (w *stateWatch) notifyAllocs(nodes map[string]struct{}) { - w.allocLock.Lock() - defer w.allocLock.Unlock() - - for node := range nodes { - if grp, ok := w.allocs[node]; ok { - grp.Notify() - delete(w.allocs, node) - } - } +// StopWatch unsubscribes a channel from a set of watch items. 
+func (s *StateStore) StopWatch(items watch.Items, notify chan struct{}) { + s.watch.stopWatch(items, notify) } // UpsertNode is used to register a node or update a node definition @@ -162,6 +92,10 @@ func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "nodes"}) + watcher.Add(watch.Item{Node: node.ID}) + // Check if the node already exists existing, err := txn.First("nodes", "id", node.ID) if err != nil { @@ -187,6 +121,7 @@ func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { return fmt.Errorf("index update failed: %v", err) } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -196,6 +131,10 @@ func (s *StateStore) DeleteNode(index uint64, nodeID string) error { txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "nodes"}) + watcher.Add(watch.Item{Node: nodeID}) + // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { @@ -213,6 +152,7 @@ func (s *StateStore) DeleteNode(index uint64, nodeID string) error { return fmt.Errorf("index update failed: %v", err) } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -222,6 +162,10 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "nodes"}) + watcher.Add(watch.Item{Node: nodeID}) + // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { @@ -248,6 +192,7 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error return fmt.Errorf("index update failed: %v", err) } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -257,6 +202,10 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) er txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "nodes"}) + watcher.Add(watch.Item{Node: nodeID}) + // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { @@ -283,6 +232,7 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) er return fmt.Errorf("index update failed: %v", err) } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -319,6 +269,10 @@ func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error { txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "jobs"}) + watcher.Add(watch.Item{Job: job.ID}) + // Check if the job already exists existing, err := txn.First("jobs", "id", job.ID) if err != nil { @@ -342,6 +296,7 @@ func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error { return fmt.Errorf("index update failed: %v", err) } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -351,6 +306,10 @@ func (s *StateStore) DeleteJob(index uint64, jobID string) error { txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "jobs"}) + watcher.Add(watch.Item{Job: jobID}) + // Lookup the node existing, err := txn.First("jobs", "id", jobID) if err != nil { @@ -368,6 +327,7 @@ func (s *StateStore) DeleteJob(index uint64, jobID string) error { return fmt.Errorf("index update failed: %v", err) } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -417,13 +377,18 @@ 
func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) erro txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "evals"}) + // Do a nested upsert for _, eval := range evals { + watcher.Add(watch.Item{Eval: eval.ID}) if err := s.nestedUpsertEval(txn, index, eval); err != nil { return err } } + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -459,7 +424,9 @@ func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *struct func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error { txn := s.db.Txn(true) defer txn.Abort() - nodes := make(map[string]struct{}) + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "evals"}) + watcher.Add(watch.Item{Table: "allocs"}) for _, eval := range evals { existing, err := txn.First("evals", "id", eval) @@ -472,6 +439,7 @@ func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) e if err := txn.Delete("evals", existing); err != nil { return fmt.Errorf("eval delete failed: %v", err) } + watcher.Add(watch.Item{Eval: eval}) } for _, alloc := range allocs { @@ -482,10 +450,14 @@ func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) e if existing == nil { continue } - nodes[existing.(*structs.Allocation).NodeID] = struct{}{} if err := txn.Delete("allocs", existing); err != nil { return fmt.Errorf("alloc delete failed: %v", err) } + realAlloc := existing.(*structs.Allocation) + watcher.Add(watch.Item{Alloc: realAlloc.ID}) + watcher.Add(watch.Item{AllocEval: realAlloc.EvalID}) + watcher.Add(watch.Item{AllocJob: realAlloc.JobID}) + watcher.Add(watch.Item{AllocNode: realAlloc.NodeID}) } // Update the indexes @@ -495,7 +467,8 @@ func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) e if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } - txn.Defer(func() { s.watch.notifyAllocs(nodes) }) + + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -557,6 +530,13 @@ func (s *StateStore) UpdateAllocFromClient(index uint64, alloc *structs.Allocati txn := s.db.Txn(true) defer txn.Abort() + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "allocs"}) + watcher.Add(watch.Item{Alloc: alloc.ID}) + watcher.Add(watch.Item{AllocEval: alloc.EvalID}) + watcher.Add(watch.Item{AllocJob: alloc.JobID}) + watcher.Add(watch.Item{AllocNode: alloc.NodeID}) + // Look for existing alloc existing, err := txn.First("allocs", "id", alloc.ID) if err != nil { @@ -590,8 +570,7 @@ func (s *StateStore) UpdateAllocFromClient(index uint64, alloc *structs.Allocati return fmt.Errorf("index update failed: %v", err) } - nodes := map[string]struct{}{alloc.NodeID: struct{}{}} - txn.Defer(func() { s.watch.notifyAllocs(nodes) }) + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -601,7 +580,9 @@ func (s *StateStore) UpdateAllocFromClient(index uint64, alloc *structs.Allocati func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error { txn := s.db.Txn(true) defer txn.Abort() - nodes := make(map[string]struct{}) + + watcher := watch.NewItems() + watcher.Add(watch.Item{Table: "allocs"}) // Handle the allocations for _, alloc := range allocs { @@ -620,10 +601,14 @@ func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) er alloc.ClientStatus = exist.ClientStatus alloc.ClientDescription = exist.ClientDescription } - 
nodes[alloc.NodeID] = struct{}{} if err := txn.Insert("allocs", alloc); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } + + watcher.Add(watch.Item{Alloc: alloc.ID}) + watcher.Add(watch.Item{AllocEval: alloc.EvalID}) + watcher.Add(watch.Item{AllocJob: alloc.JobID}) + watcher.Add(watch.Item{AllocNode: alloc.NodeID}) } // Update the indexes @@ -631,7 +616,7 @@ func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) er return fmt.Errorf("index update failed: %v", err) } - txn.Defer(func() { s.watch.notifyAllocs(nodes) }) + txn.Defer(func() { s.watch.notify(watcher) }) txn.Commit() return nil } @@ -753,8 +738,35 @@ func (s *StateStore) Indexes() (memdb.ResultIterator, error) { return iter, nil } +// StateSnapshot is used to provide a point-in-time snapshot +type StateSnapshot struct { + StateStore +} + +// StateRestore is used to optimize the performance when +// restoring state by only using a single large transaction +// instead of thousands of sub transactions +type StateRestore struct { + txn *memdb.Txn + watch *stateWatch + items watch.Items +} + +// Abort is used to abort the restore operation +func (s *StateRestore) Abort() { + s.txn.Abort() +} + +// Commit is used to commit the restore operation +func (s *StateRestore) Commit() { + s.txn.Defer(func() { s.watch.notify(s.items) }) + s.txn.Commit() +} + // NodeRestore is used to restore a node func (r *StateRestore) NodeRestore(node *structs.Node) error { + r.items.Add(watch.Item{Table: "nodes"}) + r.items.Add(watch.Item{Node: node.ID}) if err := r.txn.Insert("nodes", node); err != nil { return fmt.Errorf("node insert failed: %v", err) } @@ -763,6 +775,8 @@ func (r *StateRestore) NodeRestore(node *structs.Node) error { // JobRestore is used to restore a job func (r *StateRestore) JobRestore(job *structs.Job) error { + r.items.Add(watch.Item{Table: "jobs"}) + r.items.Add(watch.Item{Job: job.ID}) if err := r.txn.Insert("jobs", job); err != nil { return fmt.Errorf("job insert failed: %v", err) } @@ -771,6 +785,8 @@ func (r *StateRestore) JobRestore(job *structs.Job) error { // EvalRestore is used to restore an evaluation func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error { + r.items.Add(watch.Item{Table: "evals"}) + r.items.Add(watch.Item{Eval: eval.ID}) if err := r.txn.Insert("evals", eval); err != nil { return fmt.Errorf("eval insert failed: %v", err) } @@ -779,7 +795,11 @@ func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error { // AllocRestore is used to restore an allocation func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error { - r.allocNodes[alloc.NodeID] = struct{}{} + r.items.Add(watch.Item{Table: "allocs"}) + r.items.Add(watch.Item{Alloc: alloc.ID}) + r.items.Add(watch.Item{AllocEval: alloc.EvalID}) + r.items.Add(watch.Item{AllocJob: alloc.JobID}) + r.items.Add(watch.Item{AllocNode: alloc.NodeID}) if err := r.txn.Insert("allocs", alloc); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } @@ -793,3 +813,59 @@ func (r *StateRestore) IndexRestore(idx *IndexEntry) error { } return nil } + +// stateWatch holds shared state for watching updates. This is +// outside of StateStore so it can be shared with snapshots. +type stateWatch struct { + items map[watch.Item]*NotifyGroup + l sync.Mutex +} + +// newStateWatch creates a new stateWatch for change notification. +func newStateWatch() *stateWatch { + return &stateWatch{ + items: make(map[watch.Item]*NotifyGroup), + } +} + +// watch subscribes a channel to the given watch items. 
+func (w *stateWatch) watch(items watch.Items, ch chan struct{}) { + w.l.Lock() + defer w.l.Unlock() + + for item, _ := range items { + grp, ok := w.items[item] + if !ok { + grp = new(NotifyGroup) + w.items[item] = grp + } + grp.Wait(ch) + } +} + +// stopWatch unsubscribes a channel from the given watch items. +func (w *stateWatch) stopWatch(items watch.Items, ch chan struct{}) { + w.l.Lock() + defer w.l.Unlock() + + for item, _ := range items { + if grp, ok := w.items[item]; ok { + grp.Clear(ch) + if grp.Empty() { + delete(w.items, item) + } + } + } +} + +// notify is used to fire notifications on the given watch items. +func (w *stateWatch) notify(items watch.Items) { + w.l.Lock() + defer w.l.Unlock() + + for wi, _ := range items { + if grp, ok := w.items[wi]; ok { + grp.Notify() + } + } +} diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 1c4b60238..5e1021e55 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/watch" ) func testStateStore(t *testing.T) *StateStore { @@ -25,6 +26,11 @@ func TestStateStore_UpsertNode_Node(t *testing.T) { state := testStateStore(t) node := mock.Node() + notify := setupNotifyTest( + state, + watch.Item{Table: "nodes"}, + watch.Item{Node: node.ID}) + err := state.UpsertNode(1000, node) if err != nil { t.Fatalf("err: %v", err) @@ -46,12 +52,19 @@ func TestStateStore_UpsertNode_Node(t *testing.T) { if index != 1000 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_DeleteNode_Node(t *testing.T) { state := testStateStore(t) node := mock.Node() + notify := setupNotifyTest( + state, + watch.Item{Table: "nodes"}, + watch.Item{Node: node.ID}) + err := state.UpsertNode(1000, node) if err != nil { t.Fatalf("err: %v", err) @@ -78,12 +91,19 @@ func TestStateStore_DeleteNode_Node(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_UpdateNodeStatus_Node(t *testing.T) { state := testStateStore(t) node := mock.Node() + notify := setupNotifyTest( + state, + watch.Item{Table: "nodes"}, + watch.Item{Node: node.ID}) + err := state.UpsertNode(1000, node) if err != nil { t.Fatalf("err: %v", err) @@ -113,12 +133,19 @@ func TestStateStore_UpdateNodeStatus_Node(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { state := testStateStore(t) node := mock.Node() + notify := setupNotifyTest( + state, + watch.Item{Table: "nodes"}, + watch.Item{Node: node.ID}) + err := state.UpsertNode(1000, node) if err != nil { t.Fatalf("err: %v", err) @@ -148,6 +175,8 @@ func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_Nodes(t *testing.T) { @@ -188,18 +217,22 @@ func TestStateStore_Nodes(t *testing.T) { func TestStateStore_RestoreNode(t *testing.T) { state := testStateStore(t) + node := mock.Node() + + notify := setupNotifyTest( + state, + watch.Item{Table: "nodes"}, + watch.Item{Node: node.ID}) restore, err := state.Restore() if err != nil { t.Fatalf("err: %v", err) } - node := mock.Node() err = restore.NodeRestore(node) if err != nil { t.Fatalf("err: %v", err) } - restore.Commit() out, err := state.NodeByID(node.ID) @@ -210,12 +243,19 @@ func TestStateStore_RestoreNode(t *testing.T) { if !reflect.DeepEqual(out, node) { 
t.Fatalf("Bad: %#v %#v", out, node) } + + notify.verify(t) } func TestStateStore_UpsertJob_Job(t *testing.T) { state := testStateStore(t) job := mock.Job() + notify := setupNotifyTest( + state, + watch.Item{Table: "jobs"}, + watch.Item{Job: job.ID}) + err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -237,12 +277,19 @@ func TestStateStore_UpsertJob_Job(t *testing.T) { if index != 1000 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_UpdateUpsertJob_Job(t *testing.T) { state := testStateStore(t) job := mock.Job() + notify := setupNotifyTest( + state, + watch.Item{Table: "jobs"}, + watch.Item{Job: job.ID}) + err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -278,12 +325,19 @@ func TestStateStore_UpdateUpsertJob_Job(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_DeleteJob_Job(t *testing.T) { state := testStateStore(t) job := mock.Job() + notify := setupNotifyTest( + state, + watch.Item{Table: "jobs"}, + watch.Item{Job: job.ID}) + err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -310,6 +364,8 @@ func TestStateStore_DeleteJob_Job(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_Jobs(t *testing.T) { @@ -417,18 +473,22 @@ func TestStateStore_JobsByScheduler(t *testing.T) { func TestStateStore_RestoreJob(t *testing.T) { state := testStateStore(t) + job := mock.Job() + + notify := setupNotifyTest( + state, + watch.Item{Table: "jobs"}, + watch.Item{Job: job.ID}) restore, err := state.Restore() if err != nil { t.Fatalf("err: %v", err) } - job := mock.Job() err = restore.JobRestore(job) if err != nil { t.Fatalf("err: %v", err) } - restore.Commit() out, err := state.JobByID(job.ID) @@ -439,6 +499,8 @@ func TestStateStore_RestoreJob(t *testing.T) { if !reflect.DeepEqual(out, job) { t.Fatalf("Bad: %#v %#v", out, job) } + + notify.verify(t) } func TestStateStore_Indexes(t *testing.T) { @@ -503,6 +565,11 @@ func TestStateStore_UpsertEvals_Eval(t *testing.T) { state := testStateStore(t) eval := mock.Eval() + notify := setupNotifyTest( + state, + watch.Item{Table: "evals"}, + watch.Item{Eval: eval.ID}) + err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) if err != nil { t.Fatalf("err: %v", err) @@ -524,6 +591,8 @@ func TestStateStore_UpsertEvals_Eval(t *testing.T) { if index != 1000 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_Update_UpsertEvals_Eval(t *testing.T) { @@ -535,6 +604,11 @@ func TestStateStore_Update_UpsertEvals_Eval(t *testing.T) { t.Fatalf("err: %v", err) } + notify := setupNotifyTest( + state, + watch.Item{Table: "evals"}, + watch.Item{Eval: eval.ID}) + eval2 := mock.Eval() eval2.ID = eval.ID err = state.UpsertEvals(1001, []*structs.Evaluation{eval2}) @@ -565,40 +639,54 @@ func TestStateStore_Update_UpsertEvals_Eval(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_DeleteEval_Eval(t *testing.T) { state := testStateStore(t) - eval := mock.Eval() + eval1 := mock.Eval() eval2 := mock.Eval() - alloc := mock.Alloc() + alloc1 := mock.Alloc() alloc2 := mock.Alloc() - err := state.UpsertEvals(1000, []*structs.Evaluation{eval, eval2}) + notify := setupNotifyTest( + state, + watch.Item{Table: "evals"}, + watch.Item{Table: "allocs"}, + watch.Item{Eval: eval1.ID}, + watch.Item{Eval: eval2.ID}, + watch.Item{Alloc: alloc1.ID}, + watch.Item{Alloc: alloc2.ID}, + watch.Item{AllocEval: 
alloc1.EvalID}, + watch.Item{AllocEval: alloc2.EvalID}, + watch.Item{AllocJob: alloc1.JobID}, + watch.Item{AllocJob: alloc2.JobID}, + watch.Item{AllocNode: alloc1.NodeID}, + watch.Item{AllocNode: alloc2.NodeID}) + + err := state.UpsertEvals(1000, []*structs.Evaluation{eval1, eval2}) if err != nil { t.Fatalf("err: %v", err) } - err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) + err = state.UpsertAllocs(1001, []*structs.Allocation{alloc1, alloc2}) if err != nil { t.Fatalf("err: %v", err) } - notify1 := make(chan struct{}, 1) - state.WatchAllocs(alloc.NodeID, notify1) - - err = state.DeleteEval(1002, []string{eval.ID, eval2.ID}, []string{alloc.ID, alloc2.ID}) + err = state.DeleteEval(1002, []string{eval1.ID, eval2.ID}, []string{alloc1.ID, alloc2.ID}) if err != nil { t.Fatalf("err: %v", err) } - out, err := state.EvalByID(eval.ID) + out, err := state.EvalByID(eval1.ID) if err != nil { t.Fatalf("err: %v", err) } if out != nil { - t.Fatalf("bad: %#v %#v", eval, out) + t.Fatalf("bad: %#v %#v", eval1, out) } out, err = state.EvalByID(eval2.ID) @@ -607,16 +695,16 @@ func TestStateStore_DeleteEval_Eval(t *testing.T) { } if out != nil { - t.Fatalf("bad: %#v %#v", eval, out) + t.Fatalf("bad: %#v %#v", eval1, out) } - outA, err := state.AllocByID(alloc.ID) + outA, err := state.AllocByID(alloc1.ID) if err != nil { t.Fatalf("err: %v", err) } if out != nil { - t.Fatalf("bad: %#v %#v", alloc, outA) + t.Fatalf("bad: %#v %#v", alloc1, outA) } outA, err = state.AllocByID(alloc2.ID) @@ -625,7 +713,7 @@ func TestStateStore_DeleteEval_Eval(t *testing.T) { } if out != nil { - t.Fatalf("bad: %#v %#v", alloc, outA) + t.Fatalf("bad: %#v %#v", alloc1, outA) } index, err := state.Index("evals") @@ -644,11 +732,7 @@ func TestStateStore_DeleteEval_Eval(t *testing.T) { t.Fatalf("bad: %d", index) } - select { - case <-notify1: - default: - t.Fatalf("should be notified") - } + notify.verify(t) } func TestStateStore_EvalsByJob(t *testing.T) { @@ -720,34 +804,48 @@ func TestStateStore_Evals(t *testing.T) { func TestStateStore_RestoreEval(t *testing.T) { state := testStateStore(t) + eval := mock.Eval() + + notify := setupNotifyTest( + state, + watch.Item{Table: "evals"}, + watch.Item{Eval: eval.ID}) restore, err := state.Restore() if err != nil { t.Fatalf("err: %v", err) } - job := mock.Eval() - err = restore.EvalRestore(job) + err = restore.EvalRestore(eval) if err != nil { t.Fatalf("err: %v", err) } - restore.Commit() - out, err := state.EvalByID(job.ID) + out, err := state.EvalByID(eval.ID) if err != nil { t.Fatalf("err: %v", err) } - if !reflect.DeepEqual(out, job) { - t.Fatalf("Bad: %#v %#v", out, job) + if !reflect.DeepEqual(out, eval) { + t.Fatalf("Bad: %#v %#v", out, eval) } + + notify.verify(t) } func TestStateStore_UpdateAllocFromClient(t *testing.T) { state := testStateStore(t) - alloc := mock.Alloc() + + notify := setupNotifyTest( + state, + watch.Item{Table: "allocs"}, + watch.Item{Alloc: alloc.ID}, + watch.Item{AllocEval: alloc.EvalID}, + watch.Item{AllocJob: alloc.JobID}, + watch.Item{AllocNode: alloc.NodeID}) + err := state.UpsertAllocs(1000, []*structs.Allocation{alloc}) if err != nil { t.Fatalf("err: %v", err) @@ -779,12 +877,22 @@ func TestStateStore_UpdateAllocFromClient(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_UpsertAlloc_Alloc(t *testing.T) { state := testStateStore(t) - alloc := mock.Alloc() + + notify := setupNotifyTest( + state, + watch.Item{Table: "allocs"}, + watch.Item{Alloc: alloc.ID}, + watch.Item{AllocEval: 
alloc.EvalID}, + watch.Item{AllocJob: alloc.JobID}, + watch.Item{AllocNode: alloc.NodeID}) + err := state.UpsertAllocs(1000, []*structs.Allocation{alloc}) if err != nil { t.Fatalf("err: %v", err) @@ -806,35 +914,8 @@ func TestStateStore_UpsertAlloc_Alloc(t *testing.T) { if index != 1000 { t.Fatalf("bad: %d", index) } -} -func TestStateStore_WatchAllocs(t *testing.T) { - state := testStateStore(t) - - notify1 := make(chan struct{}, 1) - notify2 := make(chan struct{}, 1) - state.WatchAllocs("foo", notify1) - state.WatchAllocs("foo", notify2) - state.StopWatchAllocs("foo", notify2) - - alloc := mock.Alloc() - alloc.NodeID = "foo" - err := state.UpsertAllocs(1000, []*structs.Allocation{alloc}) - if err != nil { - t.Fatalf("err: %v", err) - } - - select { - case <-notify1: - default: - t.Fatalf("should be notified") - } - - select { - case <-notify2: - t.Fatalf("should not be notified") - default: - } + notify.verify(t) } func TestStateStore_UpdateAlloc_Alloc(t *testing.T) { @@ -849,6 +930,15 @@ func TestStateStore_UpdateAlloc_Alloc(t *testing.T) { alloc2 := mock.Alloc() alloc2.ID = alloc.ID alloc2.NodeID = alloc.NodeID + ".new" + + notify := setupNotifyTest( + state, + watch.Item{Table: "allocs"}, + watch.Item{Alloc: alloc2.ID}, + watch.Item{AllocEval: alloc2.EvalID}, + watch.Item{AllocJob: alloc2.JobID}, + watch.Item{AllocNode: alloc2.NodeID}) + err = state.UpsertAllocs(1001, []*structs.Allocation{alloc2}) if err != nil { t.Fatalf("err: %v", err) @@ -877,6 +967,8 @@ func TestStateStore_UpdateAlloc_Alloc(t *testing.T) { if index != 1001 { t.Fatalf("bad: %d", index) } + + notify.verify(t) } func TestStateStore_EvictAlloc_Alloc(t *testing.T) { @@ -1008,13 +1100,21 @@ func TestStateStore_Allocs(t *testing.T) { func TestStateStore_RestoreAlloc(t *testing.T) { state := testStateStore(t) + alloc := mock.Alloc() + + notify := setupNotifyTest( + state, + watch.Item{Table: "allocs"}, + watch.Item{Alloc: alloc.ID}, + watch.Item{AllocEval: alloc.EvalID}, + watch.Item{AllocJob: alloc.JobID}, + watch.Item{AllocNode: alloc.NodeID}) restore, err := state.Restore() if err != nil { t.Fatalf("err: %v", err) } - alloc := mock.Alloc() err = restore.AllocRestore(alloc) if err != nil { t.Fatalf("err: %v", err) @@ -1030,6 +1130,87 @@ func TestStateStore_RestoreAlloc(t *testing.T) { if !reflect.DeepEqual(out, alloc) { t.Fatalf("Bad: %#v %#v", out, alloc) } + + notify.verify(t) +} + +func TestStateWatch_watch(t *testing.T) { + sw := newStateWatch() + notify1 := make(chan struct{}, 1) + notify2 := make(chan struct{}, 1) + notify3 := make(chan struct{}, 1) + + // Notifications trigger subscribed channels + sw.watch(watch.NewItems(watch.Item{Table: "foo"}), notify1) + sw.watch(watch.NewItems(watch.Item{Table: "bar"}), notify2) + sw.watch(watch.NewItems(watch.Item{Table: "baz"}), notify3) + + items := watch.NewItems() + items.Add(watch.Item{Table: "foo"}) + items.Add(watch.Item{Table: "bar"}) + + sw.notify(items) + if len(notify1) != 1 { + t.Fatalf("should notify") + } + if len(notify2) != 1 { + t.Fatalf("should notify") + } + if len(notify3) != 0 { + t.Fatalf("should not notify") + } +} + +func TestStateWatch_stopWatch(t *testing.T) { + sw := newStateWatch() + notify := make(chan struct{}) + + // First subscribe + sw.watch(watch.NewItems(watch.Item{Table: "foo"}), notify) + + // Unsubscribe stop notifications + sw.stopWatch(watch.NewItems(watch.Item{Table: "foo"}), notify) + + // Check that the group was removed + if _, ok := sw.items[watch.Item{Table: "foo"}]; ok { + t.Fatalf("should remove group") + } + + // Check that 
we are not notified + sw.notify(watch.NewItems(watch.Item{Table: "foo"})) + if len(notify) != 0 { + t.Fatalf("should not notify") + } +} + +// setupNotifyTest takes a state store and a set of watch items, then creates +// and subscribes a notification channel for each item. +func setupNotifyTest(state *StateStore, items ...watch.Item) notifyTest { + var n notifyTest + for _, item := range items { + ch := make(chan struct{}, 1) + state.Watch(watch.NewItems(item), ch) + n = append(n, ¬ifyTestCase{item, ch}) + } + return n +} + +// notifyTestCase is used to set up and verify watch triggers. +type notifyTestCase struct { + item watch.Item + ch chan struct{} +} + +// notifyTest is a suite of notifyTestCases. +type notifyTest []*notifyTestCase + +// verify ensures that each channel received a notification. +func (n notifyTest) verify(t *testing.T) { + for _, tcase := range n { + if len(tcase.ch) != 1 { + t.Fatalf("should notify %#v", tcase.item) + } + } } // NodeIDSort is used to sort nodes by ID diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index f5d20552a..bfec26fce 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -14,8 +14,17 @@ import ( ) var ( - ErrNoLeader = fmt.Errorf("No cluster leader") - ErrNoRegionPath = fmt.Errorf("No path to region") + ErrNoLeader = fmt.Errorf("No cluster leader") + ErrNoRegionPath = fmt.Errorf("No path to region") + defaultServiceJobRestartPolicy = RestartPolicy{ + Delay: 15 * time.Second, + Attempts: 2, + Interval: 1 * time.Minute, + } + defaultBatchJobRestartPolicy = RestartPolicy{ + Delay: 15 * time.Second, + Attempts: 15, + } ) type MessageType uint8 @@ -898,6 +907,33 @@ func (u *UpdateStrategy) Rolling() bool { return u.Stagger > 0 && u.MaxParallel > 0 } +// RestartPolicy influences how Nomad restarts Tasks when they +// crash or fail. +type RestartPolicy struct { + Attempts int + Interval time.Duration + Delay time.Duration +} + +func (r *RestartPolicy) Validate() error { + if time.Duration(r.Attempts)*r.Delay > r.Interval { + return fmt.Errorf("Nomad can't restart the TaskGroup %v times in an interval of %v with a delay of %v", r.Attempts, r.Interval, r.Delay) + } + return nil +} + +func NewRestartPolicy(jobType string) *RestartPolicy { + switch jobType { + case JobTypeService: + rp := defaultServiceJobRestartPolicy + return &rp + case JobTypeBatch: + rp := defaultBatchJobRestartPolicy + return &rp + } + return nil +} + // TaskGroup is an atomic unit of placement. Each task group belongs to // a job and may contain any number of tasks. A task group support running // in many replicas using the same configuration.. @@ -913,6 +949,9 @@ type TaskGroup struct { // all the tasks contained. 
Constraints []*Constraint + //RestartPolicy of a TaskGroup + RestartPolicy *RestartPolicy + // Tasks are the collection of tasks that this task group needs to run Tasks []*Task @@ -940,6 +979,10 @@ func (tg *TaskGroup) Validate() error { } } + if err := tg.RestartPolicy.Validate(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + // Check for duplicate tasks tasks := make(map[string]int) for idx, task := range tg.Tasks { diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index cabf83dfa..1f107b095 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -1,11 +1,11 @@ package structs import ( + "github.com/hashicorp/go-multierror" "reflect" "strings" "testing" - - "github.com/hashicorp/go-multierror" + "time" ) func TestJob_Validate(t *testing.T) { @@ -44,11 +44,27 @@ func TestJob_Validate(t *testing.T) { TaskGroups: []*TaskGroup{ &TaskGroup{ Name: "web", + RestartPolicy: &RestartPolicy{ + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + }, }, &TaskGroup{ Name: "web", + RestartPolicy: &RestartPolicy{ + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + }, + }, + &TaskGroup{ + RestartPolicy: &RestartPolicy{ + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + }, }, - &TaskGroup{}, }, } err = j.Validate() @@ -65,7 +81,13 @@ func TestJob_Validate(t *testing.T) { } func TestTaskGroup_Validate(t *testing.T) { - tg := &TaskGroup{} + tg := &TaskGroup{ + RestartPolicy: &RestartPolicy{ + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + }, + } err := tg.Validate() mErr := err.(*multierror.Error) if !strings.Contains(mErr.Errors[0].Error(), "group name") { @@ -86,6 +108,11 @@ func TestTaskGroup_Validate(t *testing.T) { &Task{Name: "web"}, &Task{}, }, + RestartPolicy: &RestartPolicy{ + Interval: 5 * time.Minute, + Delay: 10 * time.Second, + Attempts: 10, + }, } err = tg.Validate() mErr = err.(*multierror.Error) diff --git a/nomad/watch/watch.go b/nomad/watch/watch.go new file mode 100644 index 000000000..4e9bafbc9 --- /dev/null +++ b/nomad/watch/watch.go @@ -0,0 +1,38 @@ +package watch + +// The watch package provides a means of describing a watch for a blocking +// query. It is exported so it may be shared between Nomad's RPC layer and +// the underlying state store. + +// Item describes the scope of a watch. It is used to provide a uniform +// input for subscribe/unsubscribe and notification firing. Specifying +// multiple fields does not place a watch on multiple items. Each Item +// describes exactly one scoped watch. +type Item struct { + Alloc string + AllocEval string + AllocJob string + AllocNode string + Eval string + Job string + Node string + Table string +} + +// Items is a helper used to construct a set of watchItems. It deduplicates +// the items as they are added using map keys. +type Items map[Item]struct{} + +// NewItems creates a new Items set and adds the given items. +func NewItems(items ...Item) Items { + wi := make(Items) + for _, item := range items { + wi.Add(item) + } + return wi +} + +// Add adds an item to the watch set. 
+func (wi Items) Add(i Item) { + wi[i] = struct{}{} +} diff --git a/nomad/watch/watch_test.go b/nomad/watch/watch_test.go new file mode 100644 index 000000000..9a8901aa8 --- /dev/null +++ b/nomad/watch/watch_test.go @@ -0,0 +1,31 @@ +package watch + +import ( + "testing" +) + +func TestWatchItems(t *testing.T) { + // Creates an empty set of items + wi := NewItems() + if len(wi) != 0 { + t.Fatalf("expect 0 items, got: %#v", wi) + } + + // Creates a new set of supplied items + wi = NewItems(Item{Table: "foo"}) + if len(wi) != 1 { + t.Fatalf("expected 1 item, got: %#v", wi) + } + + // Adding items works + wi.Add(Item{Node: "bar"}) + if len(wi) != 2 { + t.Fatalf("expected 2 items, got: %#v", wi) + } + + // Adding duplicates auto-dedupes + wi.Add(Item{Table: "foo"}) + if len(wi) != 2 { + t.Fatalf("expected 2 items, got: %#v", wi) + } +} diff --git a/website/source/docs/agent/config.html.md b/website/source/docs/agent/config.html.md index 6f0b9b0dc..d548b7e67 100644 --- a/website/source/docs/agent/config.html.md +++ b/website/source/docs/agent/config.html.md @@ -42,7 +42,7 @@ nodes, unless otherwise specified: as `us-west` and `us-east`. Defaults to `global`. * `datacenter`: Datacenter of the local agent. All members of a datacenter - should all share a local LAN connection. Defaults to `dc1`. + should share a local LAN connection. Defaults to `dc1`. * `name`: The name of the local node. This value is used to identify individual nodes in a given datacenter and must be unique @@ -103,7 +103,7 @@ nodes, unless otherwise specified: This can be used to advertise a different address to the peers of a server node to support more complex network configurations such as NAT. This configuration is optional, and defaults to the bind address of the specific - network service if it is not provided. This configuration is only appicable + network service if it is not provided. This configuration is only applicable on server nodes. The value is a map of IP addresses and supports the following keys:
@@ -125,13 +125,13 @@ nodes, unless otherwise specified: * `disable_hostname`: A boolean indicating if gauge values should not be prefixed with the local hostname. -* `leave_on_interrupt`: Enables gracefully leave when receiving the +* `leave_on_interrupt`: Enables gracefully leaving when receiving the interrupt signal. By default, the agent will exit forcefully on any signal. -* `leave_on_terminate`: Enables gracefully leave when receiving the +* `leave_on_terminate`: Enables gracefully leaving when receiving the terminate signal. By default, the agent will exit forcefully on any signal. -* `enable_syslog`: Enables logging to syslog. This option only work on +* `enable_syslog`: Enables logging to syslog. This option only works on Unix based systems. * `syslog_facility`: Controls the syslog facility that is used. By default, diff --git a/website/source/docs/drivers/exec.html.md b/website/source/docs/drivers/exec.html.md index dadf28549..e82aa1505 100644 --- a/website/source/docs/drivers/exec.html.md +++ b/website/source/docs/drivers/exec.html.md @@ -11,7 +11,7 @@ description: |- Name: `exec` The `exec` driver is used to simply execute a particular command for a task. -However unlike [`raw_exec`](raw_exec.html) it uses the underlying isolation +However, unlike [`raw_exec`](raw_exec.html) it uses the underlying isolation primitives of the operating system to limit the tasks access to resources. While simple, since the `exec` driver can invoke any command, it can be used to call scripts or other wrappers which provide higher level features. @@ -28,9 +28,10 @@ must reference it in the `command` as show in the examples below ## Client Requirements -The `exec` driver can run on all supported operating systems but to provide -proper isolation the client must be run as root on non-Windows operating systems. -Further, to support cgroups, `/sys/fs/cgroups/` must be mounted. +The `exec` driver can only be run when on Linux and running Nomad as root. +`exec` is limited to this configuration because currently isolation of resources +is only guaranteed on Linux. Further the host must have cgroups mounted properly +in order for the driver to work. You must specify a `command` to be executed. Optionally you can specify an `artifact_source` to be downloaded as well. Any `command` is assumed to be present on the @@ -68,8 +69,5 @@ The `exec` driver will set the following client attributes: The resource isolation provided varies by the operating system of the client and the configuration. -On Linux, Nomad will use cgroups, namespaces, and chroot to isolate the +On Linux, Nomad will use cgroups, and a chroot to isolate the resources of a process and as such the Nomad agent must be run as root. - -On Windows, the task driver will just execute the command with no additional -resource isolation. diff --git a/website/source/docs/http/alloc.html.md b/website/source/docs/http/alloc.html.md index 3c224fd54..822858a8c 100644 --- a/website/source/docs/http/alloc.html.md +++ b/website/source/docs/http/alloc.html.md @@ -31,6 +31,11 @@ be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -179,4 +184,3 @@ be specified using the `?region=` query parameter.
- diff --git a/website/source/docs/http/allocs.html.md b/website/source/docs/http/allocs.html.md index 44ad8aa7e..b59a4f204 100644 --- a/website/source/docs/http/allocs.html.md +++ b/website/source/docs/http/allocs.html.md @@ -31,6 +31,11 @@ be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -56,4 +61,3 @@ be specified using the `?region=` query parameter.
- diff --git a/website/source/docs/http/eval.html.md b/website/source/docs/http/eval.html.md index cba43900c..87e048209 100644 --- a/website/source/docs/http/eval.html.md +++ b/website/source/docs/http/eval.html.md @@ -3,7 +3,7 @@ layout: "http" page_title: "HTTP API: /v1/evaluation" sidebar_current: "docs-http-eval-" description: |- - The '/1/evaluation' endpoint is used to query a specific evaluation. + The '/v1/evaluation' endpoint is used to query a specific evaluation. --- # /v1/evaluation @@ -17,7 +17,7 @@ be specified using the `?region=` query parameter.
Description
- Lists all the evaluations. + Query a specific evaluation.
Method
@@ -31,6 +31,11 @@ be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -57,9 +62,6 @@ be specified using the `?region=` query parameter.
-# /v1/evaluation/\/allocations -## GET -
Description
@@ -77,6 +79,11 @@ be specified using the `?region=` query parameter. None
+
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -102,4 +109,3 @@ be specified using the `?region=` query parameter.
- diff --git a/website/source/docs/http/evals.html.md b/website/source/docs/http/evals.html.md index 3bc22da8f..23d98cc95 100644 --- a/website/source/docs/http/evals.html.md +++ b/website/source/docs/http/evals.html.md @@ -31,6 +31,11 @@ be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -59,4 +64,3 @@ be specified using the `?region=` query parameter.
- diff --git a/website/source/docs/http/index.html.md b/website/source/docs/http/index.html.md index 671d19fa5..7ed3f0dbd 100644 --- a/website/source/docs/http/index.html.md +++ b/website/source/docs/http/index.html.md @@ -31,6 +31,7 @@ The API is modeled closely on the underlying data model. Use the links to the le documentation about specific endpoints. There are also "Agent" APIs which interact with a specific agent and not the broader cluster used for administration. + ## Blocking Queries Certain endpoints support a feature called a "blocking query." A blocking query diff --git a/website/source/docs/http/job.html.md b/website/source/docs/http/job.html.md index 211963e6a..cbf0f5097 100644 --- a/website/source/docs/http/job.html.md +++ b/website/source/docs/http/job.html.md @@ -6,7 +6,7 @@ description: |- The '/1/job' endpoint is used for CRUD on a single job. --- -# /v1/job/\ +# /v1/job The `job` endpoint is used for CRUD on a single job. By default, the agent's local region is used; another region can be specified using the `?region=` query parameter. @@ -30,6 +30,11 @@ region is used; another region can be specified using the `?region=` query param None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -136,6 +141,105 @@ region is used; another region can be specified using the `?region=` query param
+
+
Description
+
+ Query the allocations belonging to a single job. +
+ +
Method
+
GET
+ +
URL
+
`/v1/job//allocations`
+ +
Parameters
+
+ None +
+ +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+ +
Returns
+
+ + ```javascript + [ + { + "ID": "3575ba9d-7a12-0c96-7b28-add168c67984", + "EvalID": "151accaa-1ac6-90fe-d427-313e70ccbb88", + "Name": "binstore-storagelocker.binsl[0]", + "NodeID": "a703c3ca-5ff8-11e5-9213-970ee8879d1b", + "JobID": "binstore-storagelocker", + "TaskGroup": "binsl", + "DesiredStatus": "run", + "DesiredDescription": "", + "ClientStatus": "running", + "ClientDescription": "", + "CreateIndex": 16, + "ModifyIndex": 16 + }, + ... + ] + ``` + +
+
+ +
+
Description
+
+ Query the evaluations belonging to a single job. +
+ +
Method
+
GET
+ +
URL
+
`/v1/job//evaluations`
+ +
Parameters
+
+ None +
+ +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+ +
Returns
+
+ + ```javascript + [ + { + "ID": "151accaa-1ac6-90fe-d427-313e70ccbb88", + "Priority": 50, + "Type": "service", + "TriggeredBy": "job-register", + "JobID": "binstore-storagelocker", + "JobModifyIndex": 14, + "NodeID": "", + "NodeModifyIndex": 0, + "Status": "complete", + "StatusDescription": "", + "Wait": 0, + "NextEval": "", + "PreviousEval": "", + "CreateIndex": 15, + "ModifyIndex": 17 + }, + ... + ] + ``` + +
+
+ ## PUT / POST
@@ -177,6 +281,38 @@ region is used; another region can be specified using the `?region=` query param
+
+
Description
+
+ Creates a new evaluation for the given job. This can be used to force + run the scheduling logic if necessary. +
+ +
Method
+
PUT or POST
+ +
URL
+
`/v1/job//evaluate`
+ +
Parameters
+
+ None +
+ +
Returns
+
+ ```javascript + { + "EvalID": "d092fdc0-e1fd-2536-67d8-43af8ca798ac", + "EvalCreateIndex": 35, + "JobModifyIndex": 34 + } + ``` + 
+
+ ## DELETE
@@ -209,134 +345,3 @@ region is used; another region can be specified using the `?region=` query param
- -# /v1/job/\/allocations -## GET - -
-
Description
-
- Query the allocations belonging to a single job. -
- -
Method
-
GET
- -
URL
-
`/v1/job//allocations`
- -
Parameters
-
- None -
- -
Returns
-
- - ```javascript - [ - { - "ID": "3575ba9d-7a12-0c96-7b28-add168c67984", - "EvalID": "151accaa-1ac6-90fe-d427-313e70ccbb88", - "Name": "binstore-storagelocker.binsl[0]", - "NodeID": "a703c3ca-5ff8-11e5-9213-970ee8879d1b", - "JobID": "binstore-storagelocker", - "TaskGroup": "binsl", - "DesiredStatus": "run", - "DesiredDescription": "", - "ClientStatus": "running", - "ClientDescription": "", - "CreateIndex": 16, - "ModifyIndex": 16 - }, - ... - ] - ``` - -
-
- -# /v1/job/\/evaluate -## PUT / POST - -
-
Description
-
- Creates a new evaluation for the given job. This can be used to force - run the scheduling logic if necessary. -
- -
Method
-
PUT or POST
- -
URL
-
`/v1/job//evaluate`
- -
Parameters
-
- None -
- -
Returns
-
- - ```javascript - { - "EvalID": "d092fdc0-e1fd-2536-67d8-43af8ca798ac", - "EvalCreateIndex": 35, - "JobModifyIndex": 34, - } - ``` - -
-
- -# /v1/job/\/evaluations -## GET - -
-
Description
-
- Query the evaluations belonging to a single job. -
- -
Method
-
GET
- -
URL
-
`/v1/job//evaluations`
- -
Parameters
-
- None -
- -
Returns
-
- - ```javascript - [ - { - "ID": "151accaa-1ac6-90fe-d427-313e70ccbb88", - "Priority": 50, - "Type": "service", - "TriggeredBy": "job-register", - "JobID": "binstore-storagelocker", - "JobModifyIndex": 14, - "NodeID": "", - "NodeModifyIndex": 0, - "Status": "complete", - "StatusDescription": "", - "Wait": 0, - "NextEval": "", - "PreviousEval": "", - "CreateIndex": 15, - "ModifyIndex": 17 - }, - ... - ] - ``` - -
-
- diff --git a/website/source/docs/http/jobs.html.md b/website/source/docs/http/jobs.html.md index f724ce0ac..8f098b1ca 100644 --- a/website/source/docs/http/jobs.html.md +++ b/website/source/docs/http/jobs.html.md @@ -31,6 +31,11 @@ another region can be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -93,4 +98,3 @@ another region can be specified using the `?region=` query parameter.
- diff --git a/website/source/docs/http/node.html.md b/website/source/docs/http/node.html.md index f16131f97..df09426d6 100644 --- a/website/source/docs/http/node.html.md +++ b/website/source/docs/http/node.html.md @@ -31,6 +31,11 @@ be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -82,9 +87,6 @@ be specified using the `?region=` query parameter.
-# /v1/node/\/allocations -## GET -
Description
@@ -102,6 +104,11 @@ be specified using the `?region=` query parameter. None
+
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -128,7 +135,6 @@ be specified using the `?region=` query parameter.
-# /v1/node/\/evaluate ## PUT / POST
@@ -163,9 +169,6 @@ be specified using the `?region=` query parameter.
-# /v1/node/\/drain -## PUT / POST -
Description
@@ -175,7 +178,7 @@ be specified using the `?region=` query parameter.
Method
-
PUT or POSt
+
PUT or POST
URL
`/v1/node//drain`
@@ -205,4 +208,3 @@ be specified using the `?region=` query parameter.
- diff --git a/website/source/docs/http/nodes.html.md b/website/source/docs/http/nodes.html.md index 36fa96fcd..b8e2b91a9 100644 --- a/website/source/docs/http/nodes.html.md +++ b/website/source/docs/http/nodes.html.md @@ -31,6 +31,11 @@ be specified using the `?region=` query parameter. None +
Blocking Queries
+
+ [Supported](/docs/http/index.html#blocking-queries) +
+
Returns
@@ -53,5 +58,3 @@ be specified using the `?region=` query parameter.
- -
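
For reference, a minimal sketch of how the watch plumbing introduced above can be exercised directly against the state store. It assumes only packages touched or used in this change (`nomad/state`, `nomad/watch`, and the `nomad/mock` helpers the state store tests rely on); it illustrates the subscribe/notify flow rather than reproducing any code from the patch.

```go
// Minimal sketch: subscribe to watch items and observe a notification.
// Assumes the nomad/state and nomad/watch packages from this change, plus
// the nomad/mock helpers used by the state store tests above.
package main

import (
	"fmt"
	"os"

	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/watch"
)

func main() {
	store, err := state.NewStateStore(os.Stderr)
	if err != nil {
		panic(err)
	}

	node := mock.Node()

	// Watch both the nodes table and this specific node. Each Item is an
	// independent scoped watch; NewItems dedupes duplicates via map keys.
	items := watch.NewItems(
		watch.Item{Table: "nodes"},
		watch.Item{Node: node.ID},
	)
	notify := make(chan struct{}, 1)
	store.Watch(items, notify)
	defer store.StopWatch(items, notify)

	// Any write that declares one of these items (UpsertNode declares both)
	// notifies the channel once the transaction commits.
	if err := store.UpsertNode(1000, node); err != nil {
		panic(err)
	}

	select {
	case <-notify:
		fmt.Printf("watch fired for node %s\n", node.ID)
	default:
		fmt.Println("no notification")
	}
}
```

This is the same pattern `blockingRPC` follows: it registers `opts.watch` with `Watch`, runs the query, re-registers and waits while the reply index has not passed `MinQueryIndex`, and unsubscribes with `StopWatch` on the way out.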