From 33b74334b71956d149292e85a50695fc3a804f00 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 19 Dec 2018 12:09:05 -0800 Subject: [PATCH 1/6] test: fix race around updateCh handling PeriodicDispatch.SetEnabled sets updateCh in one goroutine, and PeriodicDispatch.run accesses updateCh in another. The race can be prevented by having SetEnabled pass updateCh to run. Race detector output from `go test -race -run TestServer_RPC` in nomad/ ``` ================== WARNING: DATA RACE Write at 0x00c0001d3f48 by goroutine 75: github.com/hashicorp/nomad/nomad.(*PeriodicDispatch).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/periodic.go:468 +0x256 github.com/hashicorp/nomad/nomad.(*Server).revokeLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:724 +0x267 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:131 +0x3c github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:163 +0x4dd github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c Previous read at 0x00c0001d3f48 by goroutine 515: github.com/hashicorp/nomad/nomad.(*PeriodicDispatch).run() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/periodic.go:338 +0x177 Goroutine 75 (running) created at: github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:70 +0x269 Goroutine 515 (running) created at: github.com/hashicorp/nomad/nomad.(*PeriodicDispatch).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/periodic.go:176 +0x1bc github.com/hashicorp/nomad/nomad.(*Server).establishLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:231 +0x582 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:117 +0x82e github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c ================== ``` --- nomad/periodic.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nomad/periodic.go b/nomad/periodic.go index edd295461..26c2a6715 100644 --- a/nomad/periodic.go +++ b/nomad/periodic.go @@ -173,7 +173,7 @@ func (p *PeriodicDispatch) SetEnabled(enabled bool) { // If we are transitioning from disabled to enabled, run the daemon. ctx, cancel := context.WithCancel(context.Background()) p.stopFn = cancel - go p.run(ctx) + go p.run(ctx, p.updateCh) } } @@ -320,7 +320,7 @@ func (p *PeriodicDispatch) shouldRun() bool { // run is a long-lived function that waits till a job's periodic spec is met and // then creates an evaluation to run the job. -func (p *PeriodicDispatch) run(ctx context.Context) { +func (p *PeriodicDispatch) run(ctx context.Context, updateCh <-chan struct{}) { var launchCh <-chan time.Time for p.shouldRun() { job, launch := p.nextLaunch() @@ -335,7 +335,7 @@ func (p *PeriodicDispatch) run(ctx context.Context) { select { case <-ctx.Done(): return - case <-p.updateCh: + case <-updateCh: continue case <-launchCh: p.dispatch(job, launch) From b88dbdb516b603a3fa3122eb6753a84fd1bf9dc5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 19 Dec 2018 12:23:15 -0800 Subject: [PATCH 2/6] test: fix race around block eval chans Similar to previous commit, stop and change chans were being set and accessed from different goroutines. Passing the chans on the stack resolves the race. Output from `go test -race -run 'Server_RPC$' in nomad/ ``` ================== WARNING: DATA RACE Write at 0x00c0002b4e10 by goroutine 63: github.com/hashicorp/nomad/nomad.(*BlockedEvals).Flush() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:648 +0x32a github.com/hashicorp/nomad/nomad.(*BlockedEvals).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:149 +0x12b github.com/hashicorp/nomad/nomad.(*Server).revokeLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:721 +0x232 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:122 +0x95d github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c Previous read at 0x00c0002b4e10 by goroutine 75: github.com/hashicorp/nomad/nomad.(*BlockedEvals).watchCapacity() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:483 +0xfe Goroutine 63 (running) created at: github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:70 +0x269 Goroutine 75 (finished) created at: github.com/hashicorp/nomad/nomad.(*BlockedEvals).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:141 +0xba github.com/hashicorp/nomad/nomad.(*Server).establishLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:210 +0x392 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:117 +0x82e github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c ================== ================== WARNING: DATA RACE Write at 0x00c0002b4e50 by goroutine 63: github.com/hashicorp/nomad/nomad.(*BlockedEvals).Flush() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:649 +0x388 github.com/hashicorp/nomad/nomad.(*BlockedEvals).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:149 +0x12b github.com/hashicorp/nomad/nomad.(*Server).revokeLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:721 +0x232 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:122 +0x95d github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c Previous read at 0x00c0002b4e50 by goroutine 77: github.com/hashicorp/nomad/nomad.(*BlockedEvals).prune() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:690 +0xae Goroutine 63 (running) created at: github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:70 +0x269 Goroutine 77 (finished) created at: github.com/hashicorp/nomad/nomad.(*BlockedEvals).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/blocked_evals.go:142 +0xdc github.com/hashicorp/nomad/nomad.(*Server).establishLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:210 +0x392 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:117 +0x82e github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c ================== ``` --- nomad/blocked_evals.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/nomad/blocked_evals.go b/nomad/blocked_evals.go index 9f9ca013f..dcb3ef635 100644 --- a/nomad/blocked_evals.go +++ b/nomad/blocked_evals.go @@ -4,7 +4,7 @@ import ( "sync" "time" - "github.com/armon/go-metrics" + metrics "github.com/armon/go-metrics" "github.com/hashicorp/consul/lib" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/helper" @@ -138,8 +138,8 @@ func (b *BlockedEvals) SetEnabled(enabled bool) { b.l.Unlock() return } else if enabled { - go b.watchCapacity() - go b.prune() + go b.watchCapacity(b.stopCh, b.capacityChangeCh) + go b.prune(b.stopCh) } else { close(b.stopCh) } @@ -475,12 +475,12 @@ func (b *BlockedEvals) UnblockClassAndQuota(class, quota string, index uint64) { // watchCapacity is a long lived function that watches for capacity changes in // nodes and unblocks the correct set of evals. -func (b *BlockedEvals) watchCapacity() { +func (b *BlockedEvals) watchCapacity(stopCh <-chan struct{}, changeCh <-chan *capacityUpdate) { for { select { - case <-b.stopCh: + case <-stopCh: return - case update := <-b.capacityChangeCh: + case update := <-changeCh: b.unblock(update.computedClass, update.quotaChange, update.index) } } @@ -606,6 +606,11 @@ SCAN: b.l.Unlock() return dups } + + // Capture chans inside the lock to prevent a race with them getting + // reset in Flush + dupCh := b.duplicateCh + stopCh := b.stopCh b.l.Unlock() // Create the timer @@ -616,11 +621,11 @@ SCAN: } select { - case <-b.stopCh: + case <-stopCh: return nil case <-timeoutCh: return nil - case <-b.duplicateCh: + case <-dupCh: goto SCAN } } @@ -676,13 +681,13 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh chan struct{}) { } // prune is a long lived function that prunes unnecessary objects on a timer. -func (b *BlockedEvals) prune() { +func (b *BlockedEvals) prune(stopCh <-chan struct{}) { ticker := time.NewTicker(pruneInterval) defer ticker.Stop() for { select { - case <-b.stopCh: + case <-stopCh: return case <-ticker.C: b.pruneUnblockIndexes() From 4235b32a5c72154badebabe1f930be0e1f126007 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 19 Dec 2018 12:26:57 -0800 Subject: [PATCH 3/6] test: fix race in eval broker update chan Similar to previous commits the delayed eval update chan was set and access from different goroutines causing a race. Passing the chan on the stack resolves the race. Race output from `go test -race -run 'Server_RPC$'` in nomad/ ``` ================== WARNING: DATA RACE Write at 0x00c000339150 by goroutine 63: github.com/hashicorp/nomad/nomad.(*EvalBroker).flush() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/eval_broker.go:708 +0x3dc github.com/hashicorp/nomad/nomad.(*EvalBroker).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/eval_broker.go:174 +0xc4 github.com/hashicorp/nomad/nomad.(*Server).revokeLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:718 +0x1fd github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:122 +0x95d github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c Previous read at 0x00c000339150 by goroutine 73: github.com/hashicorp/nomad/nomad.(*EvalBroker).runDelayedEvalsWatcher() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/eval_broker.go:771 +0x176 Goroutine 63 (running) created at: github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:70 +0x269 Goroutine 73 (running) created at: github.com/hashicorp/nomad/nomad.(*EvalBroker).SetEnabled() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/eval_broker.go:170 +0x173 github.com/hashicorp/nomad/nomad.(*Server).establishLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:207 +0x355 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:117 +0x82e github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c ================== ``` --- nomad/eval_broker.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index e507a060c..d3dfdc61e 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -10,7 +10,7 @@ import ( "context" - "github.com/armon/go-metrics" + metrics "github.com/armon/go-metrics" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/lib/delayheap" "github.com/hashicorp/nomad/nomad/structs" @@ -167,7 +167,7 @@ func (b *EvalBroker) SetEnabled(enabled bool) { // start the go routine for delayed evals ctx, cancel := context.WithCancel(context.Background()) b.delayedEvalCancelFunc = cancel - go b.runDelayedEvalsWatcher(ctx) + go b.runDelayedEvalsWatcher(ctx, b.delayedEvalsUpdateCh) } b.l.Unlock() if !enabled { @@ -741,7 +741,7 @@ func (d *evalWrapper) Namespace() string { // runDelayedEvalsWatcher is a long-lived function that waits till a time deadline is met for // pending evaluations before enqueuing them -func (b *EvalBroker) runDelayedEvalsWatcher(ctx context.Context) { +func (b *EvalBroker) runDelayedEvalsWatcher(ctx context.Context, updateCh <-chan struct{}) { var timerChannel <-chan time.Time var delayTimer *time.Timer for { @@ -768,7 +768,7 @@ func (b *EvalBroker) runDelayedEvalsWatcher(ctx context.Context) { b.stats.TotalWaiting -= 1 b.enqueueLocked(eval, eval.Type) b.l.Unlock() - case <-b.delayedEvalsUpdateCh: + case <-updateCh: continue } } From 796542c07ec2c9fd4f52921b15748a9e860928c5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 19 Dec 2018 13:05:12 -0800 Subject: [PATCH 4/6] test: fix race and nil panic in nomad/ tests Race was test only and due to unlocked map access. Panic was test only and due to checking a field on a struct even when we knew the struct was nil. Race output that was fixed: ``` ================== WARNING: DATA RACE Read at 0x00c000697dd0 by goroutine 768: runtime.mapaccess2() /usr/local/go/src/runtime/map.go:439 +0x0 github.com/hashicorp/nomad/nomad.TestLeader_PeriodicDispatcher_Restore_Adds.func8() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader_test.go:402 +0xe6 github.com/hashicorp/nomad/testutil.WaitForResultRetries() /home/schmichael/go/src/github.com/hashicorp/nomad/testutil/wait.go:30 +0x5a github.com/hashicorp/nomad/testutil.WaitForResult() /home/schmichael/go/src/github.com/hashicorp/nomad/testutil/wait.go:22 +0x57 github.com/hashicorp/nomad/nomad.TestLeader_PeriodicDispatcher_Restore_Adds() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader_test.go:401 +0xb53 testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 Previous write at 0x00c000697dd0 by goroutine 569: runtime.mapassign() /usr/local/go/src/runtime/map.go:549 +0x0 github.com/hashicorp/nomad/nomad.(*PeriodicDispatch).Add() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/periodic.go:224 +0x2eb github.com/hashicorp/nomad/nomad.(*Server).restorePeriodicDispatcher() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:394 +0x29a github.com/hashicorp/nomad/nomad.(*Server).establishLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:234 +0x593 github.com/hashicorp/nomad/nomad.(*Server).leaderLoop() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:117 +0x82e github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership.func1() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:72 +0x6c Goroutine 768 (running) created at: testing.(*T).Run() /usr/local/go/src/testing/testing.go:878 +0x650 testing.runTests.func1() /usr/local/go/src/testing/testing.go:1119 +0xa8 testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 testing.runTests() /usr/local/go/src/testing/testing.go:1117 +0x4ee testing.(*M).Run() /usr/local/go/src/testing/testing.go:1034 +0x2ee main.main() _testmain.go:1150 +0x221 Goroutine 569 (running) created at: github.com/hashicorp/nomad/nomad.(*Server).monitorLeadership() /home/schmichael/go/src/github.com/hashicorp/nomad/nomad/leader.go:70 +0x269 ================== ``` --- nomad/leader_test.go | 2 ++ nomad/periodic_endpoint_test.go | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/nomad/leader_test.go b/nomad/leader_test.go index dd02761e5..5f8b81526 100644 --- a/nomad/leader_test.go +++ b/nomad/leader_test.go @@ -399,6 +399,8 @@ func TestLeader_PeriodicDispatcher_Restore_Adds(t *testing.T) { // Check that the new leader is tracking the periodic job only testutil.WaitForResult(func() (bool, error) { + leader.periodicDispatcher.l.Lock() + defer leader.periodicDispatcher.l.Unlock() if _, tracked := leader.periodicDispatcher.tracked[tuplePeriodic]; !tracked { return false, fmt.Errorf("periodic job not tracked") } diff --git a/nomad/periodic_endpoint_test.go b/nomad/periodic_endpoint_test.go index b16b4eb64..db50451b9 100644 --- a/nomad/periodic_endpoint_test.go +++ b/nomad/periodic_endpoint_test.go @@ -4,7 +4,7 @@ import ( "testing" memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/net-rpc-msgpackrpc" + msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -120,8 +120,9 @@ func TestPeriodicEndpoint_Force_ACL(t *testing.T) { ws := memdb.NewWatchSet() eval, err := state.EvalByID(ws, resp.EvalID) assert.Nil(err) - assert.NotNil(eval) - assert.Equal(eval.CreateIndex, resp.EvalCreateIndex) + if assert.NotNil(eval) { + assert.Equal(eval.CreateIndex, resp.EvalCreateIndex) + } } // Fetch the response with management token @@ -135,8 +136,9 @@ func TestPeriodicEndpoint_Force_ACL(t *testing.T) { ws := memdb.NewWatchSet() eval, err := state.EvalByID(ws, resp.EvalID) assert.Nil(err) - assert.NotNil(eval) - assert.Equal(eval.CreateIndex, resp.EvalCreateIndex) + if assert.NotNil(eval) { + assert.Equal(eval.CreateIndex, resp.EvalCreateIndex) + } } } From 6478a93aca950f1e9c35363b1f739256410a9b64 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 19 Dec 2018 15:39:59 -0800 Subject: [PATCH 5/6] drivermgr: fix race in building driver list --- client/pluginmanager/drivermanager/manager.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/client/pluginmanager/drivermanager/manager.go b/client/pluginmanager/drivermanager/manager.go index 76efd50da..9c8c6af80 100644 --- a/client/pluginmanager/drivermanager/manager.go +++ b/client/pluginmanager/drivermanager/manager.go @@ -242,6 +242,7 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca return } + var mu sync.Mutex var availDrivers []string var wg sync.WaitGroup @@ -253,7 +254,9 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca defer wg.Done() instance.WaitForFirstFingerprint(ctx) if instance.getLastHealth() != drivers.HealthStateUndetected { + mu.Lock() availDrivers = append(availDrivers, name) + mu.Unlock() } }(n, i) } From 2cf3a3c56afc3b6d11e203bb4a23e790a1e28ed5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 19 Dec 2018 15:42:49 -0800 Subject: [PATCH 6/6] test: copy AR's Alloc before mutating Fixes a race in client tests --- client/gc_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/gc_test.go b/client/gc_test.go index b40030368..101b91c47 100644 --- a/client/gc_test.go +++ b/client/gc_test.go @@ -30,7 +30,7 @@ func gcConfig() *GCConfig { // runners to be terminal func exitAllocRunner(runners ...AllocRunner) { for _, ar := range runners { - terminalAlloc := ar.Alloc() + terminalAlloc := ar.Alloc().Copy() terminalAlloc.DesiredStatus = structs.AllocDesiredStatusStop ar.Update(terminalAlloc) }