consul: fix deadlock in check-based restarts

Fixes #5395
Alternative to #5957

Make task restarting asynchronous when handling check-based restarts.
This matches the pre-0.9 behavior, where TaskRunner.Restart was an
asynchronous signal. The check-based restart code was not designed to
handle TaskRunner.Restart blocking; 0.9 made the call blocking and
reentrant, which could easily overwhelm the buffered update channel
and deadlock.
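
For context, a minimal sketch of the failure mode (hypothetical names,
not Nomad's actual types): a single goroutine drains a buffered update
channel and, while handling an update, calls a synchronous restart that
pushes more updates into the same channel. Once the buffer fills, the
send blocks, and the only consumer is the blocked sender itself:

package main

import "fmt"

func main() {
	// Buffered update channel with a small capacity, standing in for
	// the task runner's update chan (hypothetical simplification).
	updates := make(chan string, 1)
	updates <- "check unhealthy" // seed one update

	// restart is synchronous, as in 0.9: it only returns after pushing
	// its own task events onto the same update channel.
	restart := func() {
		updates <- "task killing"   // fills the buffer
		updates <- "task restarted" // blocks forever: the only consumer
		// is currently stuck inside restart()
	}

	for range updates {
		// The consumer re-enters the producer side while handling an
		// update; once the buffer is full, everything stops.
		restart()
		fmt.Println("handled update") // never reached
	}
}

Running the sketch trips Go's runtime deadlock detector ("all
goroutines are asleep - deadlock!") as soon as the buffered send blocks.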

Many thanks to @byronwolfman for his excellent debugging, PR, and
reproducer!

I created this alternative because changing the behavior of
TaskRunner.Restart would have a much larger impact. This approach
reverts to the old, known-good behavior and minimizes the number of
places that change.
Author: Michael Schurter
Date:   2019-07-17 15:22:21 -07:00
parent  596b5aaf7e
commit  9c418c224b
2 changed files with 43 additions and 6 deletions


@@ -103,20 +103,34 @@ func (c *checkRestart) apply(ctx context.Context, now time.Time, status string)
 		c.logger.Debug("restarting due to unhealthy check")

 		// Tell TaskRunner to restart due to failure
-		const failure = true
 		reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
 		event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
-		err := c.task.Restart(ctx, event, failure)
-		if err != nil {
-			// Error restarting
-			return false
-		}
+		go asyncRestart(ctx, c.logger, c.task, event)
 		return true
 	}

 	return false
 }

+// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
+// to be called in a goroutine.
+func asyncRestart(ctx context.Context, logger log.Logger, task TaskRestarter, event *structs.TaskEvent) {
+	// Check watcher restarts are always failures
+	const failure = true
+
+	// Restarting is asynchronous so there's no reason to allow this
+	// goroutine to block indefinitely.
+	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
+	if err := task.Restart(ctx, event, failure); err != nil {
+		// Restart errors are not actionable and only relevant when
+		// debugging allocation lifecycle management.
+		logger.Debug("error restarting task", "error", err,
+			"event_time", event.Time, "event_type", event.Type)
+	}
+}

 // checkWatchUpdates add or remove checks from the watcher
 type checkWatchUpdate struct {
 	checkID string
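
The new asyncRestart helper pairs a fire-and-forget goroutine with a
deadline so a stuck TaskRunner cannot leak goroutines. A standalone
sketch of the same timeout pattern (slowRestarter and the shortened
delay are hypothetical, for illustration only):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// slowRestarter stands in for a task runner whose Restart can block on
// a busy main loop.
type slowRestarter struct{ delay time.Duration }

func (s slowRestarter) Restart(ctx context.Context) error {
	select {
	case <-time.After(s.delay): // the restart eventually goes through
		return nil
	case <-ctx.Done(): // or the deadline hits first
		return ctx.Err()
	}
}

func main() {
	// Bound the call the same way asyncRestart does (10s in the real
	// code; shortened here so the example finishes quickly).
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	err := slowRestarter{delay: time.Second}.Restart(ctx)
	if errors.Is(err, context.DeadlineExceeded) {
		fmt.Println("restart abandoned:", err) // only logged, never returned
	}
}

Since no caller waits on the result, the error is only logged at debug
level; by the time the timeout fires, apply has already returned true
and the check watcher has moved on.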


@@ -9,6 +9,7 @@ import (
 	"github.com/hashicorp/consul/api"
 	"github.com/hashicorp/nomad/helper/testlog"
 	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/stretchr/testify/require"
 )

 // checkRestartRecord is used by a testFakeCtx to record when restarts occur
@@ -194,6 +195,28 @@ func TestCheckWatcher_Healthy(t *testing.T) {
 	}
 }

+// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once.
+func TestCheckWatcher_Unhealthy(t *testing.T) {
+	t.Parallel()
+
+	fakeAPI, cw := testWatcherSetup(t)
+
+	check1 := testCheck()
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
+	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
+
+	// Check has always been failing
+	fakeAPI.add("testcheck1", "critical", time.Time{})
+
+	// Run
+	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+	defer cancel()
+	cw.Run(ctx)
+
+	// Ensure restart was called exactly once
+	require.Len(t, restarter1.restarts, 1)
+}
+
 // TestCheckWatcher_HealthyWarning asserts checks in warning with
 // ignore_warnings=true do not restart tasks.
 func TestCheckWatcher_HealthyWarning(t *testing.T) {