diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index b6e8610fc..eab7ceb11 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -103,20 +103,34 @@ func (c *checkRestart) apply(ctx context.Context, now time.Time, status string) c.logger.Debug("restarting due to unhealthy check") // Tell TaskRunner to restart due to failure - const failure = true reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName) event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason) - err := c.task.Restart(ctx, event, failure) - if err != nil { - // Error restarting - return false - } + go asyncRestart(ctx, c.logger, c.task, event) return true } return false } +// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended +// to be called in a goroutine. +func asyncRestart(ctx context.Context, logger log.Logger, task TaskRestarter, event *structs.TaskEvent) { + // Check watcher restarts are always failures + const failure = true + + // Restarting is asynchronous so there's no reason to allow this + // goroutine to block indefinitely. + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + if err := task.Restart(ctx, event, failure); err != nil { + // Restart errors are not actionable and only relevant when + // debugging allocation lifecycle management. + logger.Debug("error restarting task", "error", err, + "event_time", event.Time, "event_type", event.Type) + } +} + // checkWatchUpdates add or remove checks from the watcher type checkWatchUpdate struct { checkID string diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index fd8d578ed..5e303516c 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -9,6 +9,7 @@ import ( "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) // checkRestartRecord is used by a testFakeCtx to record when restarts occur @@ -194,6 +195,28 @@ func TestCheckWatcher_Healthy(t *testing.T) { } } +// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once. +func TestCheckWatcher_Unhealthy(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup(t) + + check1 := testCheck() + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + + // Check has always been failing + fakeAPI.add("testcheck1", "critical", time.Time{}) + + // Run + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + cw.Run(ctx) + + // Ensure restart was called exactly once + require.Len(t, restarter1.restarts, 1) +} + // TestCheckWatcher_HealthyWarning asserts checks in warning with // ignore_warnings=true do not restart tasks. func TestCheckWatcher_HealthyWarning(t *testing.T) {