System test runs on mac

Alex Dadgar
2018-03-01 13:36:26 -08:00
committed by Michael Schurter
parent a027016b87
commit c00c02df62
3 changed files with 263 additions and 2 deletions

View File

@@ -297,6 +297,7 @@ func TestNodeDrainer_SystemDrain(t *testing.T) {
	serviceJob := mock.Job()
	serviceJob.Name = "service-job"
	serviceJob.Type = structs.JobTypeService
	serviceJob.Constraints = nil
	serviceJob.TaskGroups[0].Count = 2
	serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{
		MaxParallel: 1,
@@ -315,6 +316,7 @@ func TestNodeDrainer_SystemDrain(t *testing.T) {
	systemJob := mock.SystemJob()
	systemJob.Name = "system-job"
	systemJob.Type = structs.JobTypeSystem
	systemJob.Constraints = nil
	systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "10m",
@@ -486,6 +488,4 @@ func TestNodeDrainer_SystemDrain(t *testing.T) {
		t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s",
			alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation)
	}
	t.Logf("==> PASS")
}
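
The two Constraints = nil lines above are what make this system test runnable on macOS. As a minimal sketch of the assumption behind them: mock.Job() and mock.SystemJob() are understood to attach a kernel-name constraint roughly like the one below, which would keep mock allocations from ever being placed on a Darwin node; clearing it lets the drain test schedule anywhere.

package drainer

import "github.com/hashicorp/nomad/nomad/structs"

// linuxOnly sketches the constraint mock.Job() is assumed to attach to
// every mock job. It pins placement to Linux kernels, which is why the
// test clears Constraints before running on a mac. The field values
// here are an assumption, not copied from the mock package.
var linuxOnly = &structs.Constraint{
	LTarget: "${attr.kernel.name}",
	RTarget: "linux",
	Operand: "=",
}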

View File

@@ -0,0 +1,140 @@
package drainer

import (
	"context"
	"log"
	"sync"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// jobWatcher watches allocation changes for jobs with at least one allocation
// on a draining node.
type jobWatcher struct {
	// allocsIndex to start watching from
	allocsIndex uint64

	// job -> node.ID
	jobs   map[jobKey]string
	jobsMu sync.Mutex

	jobsCh chan map[jobKey]struct{}

	state *state.StateStore

	logger *log.Logger
}

func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher {
	return &jobWatcher{
		allocsIndex: allocsIndex,
		logger:      logger,
		jobs:        jobs,
		jobsCh:      make(chan map[jobKey]struct{}),
		state:       state,
	}
}

func (j *jobWatcher) watch(k jobKey, nodeID string) {
	j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6])
	j.jobsMu.Lock()
	j.jobs[k] = nodeID
	j.jobsMu.Unlock()
}

func (j *jobWatcher) nodeDone(nodeID string) {
	j.jobsMu.Lock()
	defer j.jobsMu.Unlock()
	for k, v := range j.jobs {
		if v == nodeID {
			j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6])
			delete(j.jobs, k)
		}
	}
}

func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} {
	return j.jobsCh
}

func (j *jobWatcher) run(ctx context.Context) {
	var resp interface{}
	var err error

	for {
		//FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking?
		//FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case?
		var newIndex uint64
		resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx)
		if err != nil {
			if err == context.Canceled {
				j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down")
				return
			}
			j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err)
			return
		}

		j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex)
		j.allocsIndex = newIndex

		changedJobs := resp.(map[jobKey]struct{})
		if len(changedJobs) > 0 {
			select {
			case j.jobsCh <- changedJobs:
			case <-ctx.Done():
				return
			}
		}
	}
}

func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Allocs(ws)
	if err != nil {
		return nil, 0, err
	}

	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	skipped := 0

	// jobs whose allocations changed
	resp := map[jobKey]struct{}{}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		alloc := raw.(*structs.Allocation)

		j.jobsMu.Lock()
		_, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}]
		j.jobsMu.Unlock()

		if !ok {
			// alloc is not part of a draining job
			skipped++
			continue
		}

		// don't wake drain loop if alloc hasn't updated its health
		if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
			j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy)
			resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{}
		} else {
			j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6])
		}
	}

	j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index)

	return resp, index, nil
}
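
For orientation, here is a hedged sketch of how the rest of the drainer might consume this watcher. runJobWatcher and its wiring are hypothetical and not part of this commit; only newJobWatcher, run, and WaitCh come from the file above.

package drainer

import (
	"context"
	"log"
	"os"

	"github.com/hashicorp/nomad/nomad/state"
)

// runJobWatcher is a hypothetical helper showing the intended
// lifecycle: seed the watcher with jobs already on draining nodes, run
// it in the background, and react to batched job-change notifications.
func runJobWatcher(ctx context.Context, store *state.StateStore, jobs map[jobKey]string, index uint64) {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	w := newJobWatcher(logger, jobs, index, store)

	go w.run(ctx) // runs until ctx is canceled or a query fails

	for {
		select {
		case changed := <-w.WaitCh():
			// changed holds the jobs whose allocs updated their health;
			// the drainer would re-evaluate migrations for each of them
			for k := range changed {
				logger.Printf("[TRACE] job %s changed", k.jobid)
			}
		case <-ctx.Done():
			return
		}
	}
}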

View File

@@ -0,0 +1,121 @@
package drainer

import (
	"context"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// nodeWatcher watches for nodes to start or stop draining
type nodeWatcher struct {
	index   uint64
	nodes   map[string]*structs.Node
	nodesCh chan map[string]*structs.Node
	state   *state.StateStore
	logger  *log.Logger
}

func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher {
	return &nodeWatcher{
		nodes:   nodes,
		nodesCh: make(chan map[string]*structs.Node),
		index:   index,
		state:   state,
		logger:  logger,
	}
}

func (n *nodeWatcher) run(ctx context.Context) {
	// Trigger an initial drain pass if there are already nodes draining
	//FIXME this is unnecessary if a node has reached a deadline
	n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes))
	if len(n.nodes) > 0 {
		// Guard the initial send with ctx so a canceled context can't
		// leave this goroutine blocked forever (mirrors the select at
		// the bottom of the loop)
		select {
		case n.nodesCh <- n.nodes:
		case <-ctx.Done():
			return
		}
	}

	for {
		//FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case?
		resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx)
		if err != nil {
			if err == context.Canceled {
				n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down")
				return
			}
			n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err)
			return
		}

		// update index for next run
		n.index = index

		changed := false
		newNodes := resp.([]*structs.Node)
		n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove
		for _, newNode := range newNodes {
			if existingNode, ok := n.nodes[newNode.ID]; ok {
				// Node was draining, see if it has changed
				if newNode.DrainStrategy == nil {
					// Node stopped draining
					delete(n.nodes, newNode.ID)
					changed = true
				} else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) {
					// Update deadline
					n.nodes[newNode.ID] = newNode
					changed = true
				}
			} else {
				// Node was not draining
				if newNode.DrainStrategy != nil {
					// Node started draining
					n.nodes[newNode.ID] = newNode
					changed = true
				}
			}
		}

		// Send a copy of the draining nodes if there were changes
		if !changed {
			continue
		}

		nodesCopy := make(map[string]*structs.Node, len(n.nodes))
		for k, v := range n.nodes {
			nodesCopy[k] = v
		}

		select {
		case n.nodesCh <- nodesCopy:
		case <-ctx.Done():
			return
		}
	}
}

func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Nodes(ws)
	if err != nil {
		return nil, 0, err
	}

	index, err := state.Index("nodes")
	if err != nil {
		return nil, 0, err
	}

	resp := make([]*structs.Node, 0, 8)

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		node := raw.(*structs.Node)
		resp = append(resp, node)
	}

	return resp, index, nil
}
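
A matching sketch for the node watcher. watchDrainingNodes is hypothetical and not part of this commit; reading nodesCh directly is only possible because the sketch lives in the same package. Each receive delivers a fresh copy of the draining-node set, so the consumer never races the watcher's own map.

package drainer

import (
	"context"
	"log"
	"os"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// watchDrainingNodes is a hypothetical consumer: it seeds the watcher
// with the currently draining nodes, then receives a new snapshot of
// the draining set whenever membership or a drain strategy changes.
func watchDrainingNodes(ctx context.Context, store *state.StateStore, draining map[string]*structs.Node, index uint64) {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	w := newNodeWatcher(logger, draining, index, store)

	go w.run(ctx)

	for {
		select {
		case nodes := <-w.nodesCh:
			// nodes is a copy, safe to read without holding any lock
			logger.Printf("[TRACE] %d nodes draining", len(nodes))
		case <-ctx.Done():
			return
		}
	}
}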