System test runs on mac

Alex Dadgar
2018-03-01 13:36:26 -08:00
committed by Michael Schurter
parent a027016b87
commit c00c02df62
3 changed files with 263 additions and 2 deletions

View File

@@ -297,6 +297,7 @@ func TestNodeDrainer_SystemDrain(t *testing.T) {
	serviceJob := mock.Job()
	serviceJob.Name = "service-job"
	serviceJob.Type = structs.JobTypeService
	serviceJob.Constraints = nil
	serviceJob.TaskGroups[0].Count = 2
	serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{
		MaxParallel: 1,
@@ -315,6 +316,7 @@ func TestNodeDrainer_SystemDrain(t *testing.T) {
	systemJob := mock.SystemJob()
	systemJob.Name = "system-job"
	systemJob.Type = structs.JobTypeSystem
	systemJob.Constraints = nil
	systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "10m",
@@ -486,6 +488,4 @@ func TestNodeDrainer_SystemDrain(t *testing.T) {
		t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s",
			alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation)
	}
	t.Logf("==> PASS")
}
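
The two Constraints = nil lines above are what make this system test runnable on macOS. As a minimal sketch of the assumption behind them: mock.Job() and mock.SystemJob() are understood to attach a kernel-name constraint roughly like the one below, which would keep mock allocations from ever being placed on a Darwin node; clearing it lets the drain test schedule anywhere.

package drainer

import "github.com/hashicorp/nomad/nomad/structs"

// linuxOnly sketches the constraint mock.Job() is assumed to attach to
// every mock job. It pins placement to Linux kernels, which is why the
// test clears Constraints before running on a mac. The field values
// here are an assumption, not copied from the mock package.
var linuxOnly = &structs.Constraint{
	LTarget: "${attr.kernel.name}",
	RTarget: "linux",
	Operand: "=",
}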

View File

@@ -0,0 +1,140 @@
package drainer

import (
	"context"
	"log"
	"sync"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// jobWatcher watches allocation changes for jobs with at least one allocation
// on a draining node.
type jobWatcher struct {
	// allocsIndex to start watching from
	allocsIndex uint64

	// job -> node.ID
	jobs   map[jobKey]string
	jobsMu sync.Mutex

	jobsCh chan map[jobKey]struct{}

	state *state.StateStore

	logger *log.Logger
}

func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher {
	return &jobWatcher{
		allocsIndex: allocsIndex,
		logger:      logger,
		jobs:        jobs,
		jobsCh:      make(chan map[jobKey]struct{}),
		state:       state,
	}
}

func (j *jobWatcher) watch(k jobKey, nodeID string) {
	j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6])
	j.jobsMu.Lock()
	j.jobs[k] = nodeID
	j.jobsMu.Unlock()
}

func (j *jobWatcher) nodeDone(nodeID string) {
	j.jobsMu.Lock()
	defer j.jobsMu.Unlock()
	for k, v := range j.jobs {
		if v == nodeID {
			j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6])
			delete(j.jobs, k)
		}
	}
}

func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} {
	return j.jobsCh
}

func (j *jobWatcher) run(ctx context.Context) {
	var resp interface{}
	var err error

	for {
		//FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking?
		//FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case?
		var newIndex uint64
		resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx)
		if err != nil {
			if err == context.Canceled {
				j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down")
				return
			}
			j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err)
			return
		}

		j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex)
		j.allocsIndex = newIndex

		changedJobs := resp.(map[jobKey]struct{})
		if len(changedJobs) > 0 {
			select {
			case j.jobsCh <- changedJobs:
			case <-ctx.Done():
				return
			}
		}
	}
}

func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Allocs(ws)
	if err != nil {
		return nil, 0, err
	}

	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	skipped := 0

	// jobs whose allocations changed
	resp := map[jobKey]struct{}{}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		alloc := raw.(*structs.Allocation)

		j.jobsMu.Lock()
		_, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}]
		j.jobsMu.Unlock()

		if !ok {
			// alloc is not part of a draining job
			skipped++
			continue
		}

		// don't wake drain loop if alloc hasn't updated its health
		if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
			j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy)
			resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{}
		} else {
			j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6])
		}
	}

	j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index)

	return resp, index, nil
}
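
For orientation, here is a hedged sketch of how the rest of the drainer might consume this watcher. runJobWatcher and its wiring are hypothetical and not part of this commit; only newJobWatcher, run, and WaitCh come from the file above.

package drainer

import (
	"context"
	"log"
	"os"

	"github.com/hashicorp/nomad/nomad/state"
)

// runJobWatcher is a hypothetical helper showing the intended
// lifecycle: seed the watcher with jobs already on draining nodes, run
// it in the background, and react to batched job-change notifications.
func runJobWatcher(ctx context.Context, store *state.StateStore, jobs map[jobKey]string, index uint64) {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	w := newJobWatcher(logger, jobs, index, store)

	go w.run(ctx) // runs until ctx is canceled or a query fails

	for {
		select {
		case changed := <-w.WaitCh():
			// changed holds the jobs whose allocs updated their health;
			// the drainer would re-evaluate migrations for each of them
			for k := range changed {
				logger.Printf("[TRACE] job %s changed", k.jobid)
			}
		case <-ctx.Done():
			return
		}
	}
}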

View File

@@ -0,0 +1,121 @@
package drainer

import (
	"context"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// nodeWatcher watches for nodes to start or stop draining
type nodeWatcher struct {
	index   uint64
	nodes   map[string]*structs.Node
	nodesCh chan map[string]*structs.Node
	state   *state.StateStore
	logger  *log.Logger
}

func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher {
	return &nodeWatcher{
		nodes:   nodes,
		nodesCh: make(chan map[string]*structs.Node),
		index:   index,
		state:   state,
		logger:  logger,
	}
}

func (n *nodeWatcher) run(ctx context.Context) {
	// Trigger an initial drain pass if there are already nodes draining
	//FIXME this is unnecessary if a node has reached a deadline
	n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes))
	if len(n.nodes) > 0 {
		// Guard the initial send with ctx so a canceled context can't
		// leave this goroutine blocked forever (mirrors the select at
		// the bottom of the loop)
		select {
		case n.nodesCh <- n.nodes:
		case <-ctx.Done():
			return
		}
	}

	for {
		//FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case?
		resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx)
		if err != nil {
			if err == context.Canceled {
				n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down")
				return
			}
			n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err)
			return
		}

		// update index for next run
		n.index = index

		changed := false
		newNodes := resp.([]*structs.Node)
		n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove
		for _, newNode := range newNodes {
			if existingNode, ok := n.nodes[newNode.ID]; ok {
				// Node was draining, see if it has changed
				if newNode.DrainStrategy == nil {
					// Node stopped draining
					delete(n.nodes, newNode.ID)
					changed = true
				} else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) {
					// Update deadline
					n.nodes[newNode.ID] = newNode
					changed = true
				}
			} else {
				// Node was not draining
				if newNode.DrainStrategy != nil {
					// Node started draining
					n.nodes[newNode.ID] = newNode
					changed = true
				}
			}
		}

		// Send a copy of the draining nodes if there were changes
		if !changed {
			continue
		}

		nodesCopy := make(map[string]*structs.Node, len(n.nodes))
		for k, v := range n.nodes {
			nodesCopy[k] = v
		}

		select {
		case n.nodesCh <- nodesCopy:
		case <-ctx.Done():
			return
		}
	}
}

func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Nodes(ws)
	if err != nil {
		return nil, 0, err
	}

	index, err := state.Index("nodes")
	if err != nil {
		return nil, 0, err
	}

	resp := make([]*structs.Node, 0, 8)

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		node := raw.(*structs.Node)
		resp = append(resp, node)
	}

	return resp, index, nil
}
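
A matching sketch for the node watcher. watchDrainingNodes is hypothetical and not part of this commit; reading nodesCh directly is only possible because the sketch lives in the same package. Each receive delivers a fresh copy of the draining-node set, so the consumer never races the watcher's own map.

package drainer

import (
	"context"
	"log"
	"os"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// watchDrainingNodes is a hypothetical consumer: it seeds the watcher
// with the currently draining nodes, then receives a new snapshot of
// the draining set whenever membership or a drain strategy changes.
func watchDrainingNodes(ctx context.Context, store *state.StateStore, draining map[string]*structs.Node, index uint64) {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	w := newNodeWatcher(logger, draining, index, store)

	go w.run(ctx)

	for {
		select {
		case nodes := <-w.nodesCh:
			// nodes is a copy, safe to read without holding any lock
			logger.Printf("[TRACE] %d nodes draining", len(nodes))
		case <-ctx.Done():
			return
		}
	}
}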