drain: block cli until all allocs stop

Before, the drain CLI would block only until the node was marked as having
completed its drain operation. While technically correct, that could lead
operators (or, more likely, scripts) to shut down drained nodes before all of
their allocations had *actually* terminated.

This change makes the CLI block until all allocations on the node have
terminated (system job allocations are excluded when -ignore-system is set).
Michael Schurter
2018-03-28 14:01:54 -07:00
parent 00b358553d
commit 4cefb6f06a
4 changed files with 474 additions and 158 deletions
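
For readers unfamiliar with the channel-based monitor used in the diff below, here is a minimal consumer sketch. It is illustrative only: it assumes the MonitorDrain signature as called in this commit (context, node ID, wait index, ignore-system flag) and that it yields human-readable strings, as the CLI loop suggests; the node ID and index are placeholders.

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Client against a local agent; adjust the config/address as needed.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatalf("error creating client: %v", err)
	}

	nodeID := "example-node-id" // hypothetical node ID placeholder
	var lastIndex uint64        // normally the LastIndex from the UpdateDrain response meta
	ignoreSystem := true        // skip system job allocations, mirroring -ignore-system

	// MonitorDrain streams progress messages and closes the channel when
	// monitoring ends; with this change that is only after the node's
	// (non-system) allocations have stopped.
	outCh := client.Nodes().MonitorDrain(context.Background(), nodeID, lastIndex, ignoreSystem)
	for msg := range outCh {
		fmt.Println(msg)
	}
}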


@@ -1,13 +1,13 @@
package command
import (
"context"
"fmt"
"strings"
"time"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/api/contexts"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/posener/complete"
)
@@ -271,160 +271,18 @@ func (c *NodeDrainCommand) Run(args []string) int {
return 1
}
c.Ui.Output(fmt.Sprintf("Node %q drain strategy set", node.ID))
if enable {
c.Ui.Output(fmt.Sprintf("Node %q drain strategy set", node.ID))
} else {
c.Ui.Output(fmt.Sprintf("Node %q drain strategy unset", node.ID))
}
if enable && !detach {
if err := monitorDrain(c.Ui.Output, client.Nodes(), node.ID, meta.LastIndex); err != nil {
c.Ui.Error(fmt.Sprintf("Error monitoring drain: %v", err))
return 1
outCh := client.Nodes().MonitorDrain(context.Background(), node.ID, meta.LastIndex, ignoreSystem)
for msg := range outCh {
c.Ui.Output(msg)
}
c.Ui.Output(fmt.Sprintf("Node %q drain complete", nodeID))
}
return 0
}
// monitorDrain monitors the node being drained and exits when the node has
// finished draining.
func monitorDrain(output func(string), nodeClient *api.Nodes, nodeID string, index uint64) error {
doneCh := make(chan struct{})
defer close(doneCh)
// Errors from either goroutine are sent here
errCh := make(chan error, 1)
// Monitor node changes and close chan when drain is complete
nodeCh := make(chan struct{})
go func() {
for {
q := api.QueryOptions{
AllowStale: true,
WaitIndex: index,
}
node, meta, err := nodeClient.Info(nodeID, &q)
if err != nil {
select {
case errCh <- err:
case <-doneCh:
}
return
}
if node.DrainStrategy == nil {
close(nodeCh)
return
}
// Drain still ongoing
index = meta.LastIndex
}
}()
// Monitor alloc changes
allocCh := make(chan string, 1)
go func() {
allocs, meta, err := nodeClient.Allocations(nodeID, nil)
if err != nil {
select {
case errCh <- err:
case <-doneCh:
}
return
}
initial := make(map[string]*api.Allocation, len(allocs))
for _, a := range allocs {
initial[a.ID] = a
}
for {
q := api.QueryOptions{
AllowStale: true,
WaitIndex: meta.LastIndex,
}
allocs, meta, err = nodeClient.Allocations(nodeID, &q)
if err != nil {
select {
case errCh <- err:
case <-doneCh:
}
return
}
for _, a := range allocs {
// Get previous version of alloc
orig, ok := initial[a.ID]
// Update local alloc state
initial[a.ID] = a
migrating := a.DesiredTransition.ShouldMigrate()
msg := ""
switch {
case !ok:
// Should only be possible if response
// from initial Allocations call was
// stale. No need to output
case orig.ClientStatus != a.ClientStatus:
// Alloc status has changed; output
msg = fmt.Sprintf("status %s -> %s", orig.ClientStatus, a.ClientStatus)
case migrating && !orig.DesiredTransition.ShouldMigrate():
// Alloc was marked for migration
msg = "marked for migration"
case migrating && (orig.DesiredStatus != a.DesiredStatus) && a.DesiredStatus == structs.AllocDesiredStatusStop:
// Alloc has already been marked for migration and is now being stopped
msg = "draining"
case a.NextAllocation != "" && orig.NextAllocation == "":
// Alloc has been replaced by another allocation
msg = fmt.Sprintf("replaced by allocation %q", a.NextAllocation)
}
if msg != "" {
select {
case allocCh <- fmt.Sprintf("Alloc %q %s", a.ID, msg):
case <-doneCh:
return
}
}
}
}
}()
done := false
for !done {
select {
case err := <-errCh:
return err
case <-nodeCh:
done = true
case msg := <-allocCh:
output(msg)
}
}
// Loop on alloc messages for a bit longer as we may have gotten the
// "node done" first (since the watchers run concurrently the events
// may be received out of order)
deadline := 500 * time.Millisecond
timer := time.NewTimer(deadline)
for {
select {
case err := <-errCh:
return err
case msg := <-allocCh:
output(msg)
if !timer.Stop() {
<-timer.C
}
timer.Reset(deadline)
case <-timer.C:
// No events within deadline, exit
return nil
}
}
}


@@ -119,16 +119,17 @@ func TestNodeDrainCommand_Monitor(t *testing.T) {
t.Fatalf("err: %s", err)
})
// Register a job to create an alloc to drain
count := 3
// Register a service job to create allocs to drain
serviceCount := 3
job := &api.Job{
ID: helper.StringToPtr("mock_service"),
Name: helper.StringToPtr("mock_service"),
Datacenters: []string{"dc1"},
Type: helper.StringToPtr("service"),
TaskGroups: []*api.TaskGroup{
{
Name: helper.StringToPtr("mock_group"),
Count: &count,
Count: &serviceCount,
Migrate: &api.MigrateStrategy{
MaxParallel: helper.IntToPtr(1),
HealthCheck: helper.StringToPtr("task_states"),
@@ -142,6 +143,10 @@ func TestNodeDrainCommand_Monitor(t *testing.T) {
Config: map[string]interface{}{
"run_for": "10m",
},
Resources: &api.Resources{
CPU: helper.IntToPtr(50),
MemoryMB: helper.IntToPtr(50),
},
},
},
},
@@ -151,14 +156,44 @@ func TestNodeDrainCommand_Monitor(t *testing.T) {
_, _, err := client.Jobs().Register(job, nil)
require.Nil(err)
// Register a system job to ensure it is ignored during draining
sysjob := &api.Job{
ID: helper.StringToPtr("mock_system"),
Name: helper.StringToPtr("mock_system"),
Datacenters: []string{"dc1"},
Type: helper.StringToPtr("system"),
TaskGroups: []*api.TaskGroup{
{
Name: helper.StringToPtr("mock_sysgroup"),
Count: helper.IntToPtr(1),
Tasks: []*api.Task{
{
Name: "mock_systask",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "10m",
},
Resources: &api.Resources{
CPU: helper.IntToPtr(50),
MemoryMB: helper.IntToPtr(50),
},
},
},
},
},
}
_, _, err = client.Jobs().Register(sysjob, nil)
require.Nil(err)
var allocs []*api.Allocation
testutil.WaitForResult(func() (bool, error) {
allocs, _, err = client.Nodes().Allocations(nodeID, nil)
if err != nil {
return false, err
}
if len(allocs) != count {
return false, fmt.Errorf("number of allocs %d != count (%d)", len(allocs), count)
if len(allocs) != serviceCount+1 {
return false, fmt.Errorf("number of allocs %d != count (%d)", len(allocs), serviceCount+1)
}
for _, a := range allocs {
if a.ClientStatus != "running" {
@@ -172,10 +207,10 @@ func TestNodeDrainCommand_Monitor(t *testing.T) {
ui := new(cli.MockUi)
cmd := &NodeDrainCommand{Meta: Meta{Ui: ui}}
args := []string{"-address=" + url, "-self", "-enable", "-deadline", "1s"}
args := []string{"-address=" + url, "-self", "-enable", "-deadline", "1s", "-ignore-system"}
t.Logf("Running: %v", args)
if code := cmd.Run(args); code != 0 {
t.Fatalf("expected exit 0, got: %d", code)
t.Fatalf("expected exit 0, got: %d\n%s", code, ui.OutputWriter.String())
}
out := ui.OutputWriter.String()
@@ -183,9 +218,19 @@ func TestNodeDrainCommand_Monitor(t *testing.T) {
require.Contains(out, "drain complete")
for _, a := range allocs {
if *a.Job.Type == "system" {
if strings.Contains(out, a.ID) {
t.Fatalf("output should not contain system alloc %q", a.ID)
}
continue
}
require.Contains(out, fmt.Sprintf("Alloc %q marked for migration", a.ID))
require.Contains(out, fmt.Sprintf("Alloc %q draining", a.ID))
}
expected := fmt.Sprintf("All allocations on node %q have stopped.\n", nodeID)
if !strings.HasSuffix(out, expected) {
t.Fatalf("expected output to end with:\n%s", expected)
}
}
func TestNodeDrainCommand_Fails(t *testing.T) {