consul: fix deadlock in check-based restarts
Fixes #5395
Alternative to #5957

Make task restarting asynchronous when handling check-based restarts. This matches the pre-0.9 behavior where TaskRunner.Restart was an asynchronous signal.

The check-based restarting code was not designed to handle blocking in TaskRunner.Restart. 0.9 made it reentrant and could easily overwhelm the buffered update chan and deadlock.

Many thanks to @byronwolfman for his excellent debugging, PR, and reproducer!

I created this alternative as changing the functionality of TaskRunner.Restart has a much larger impact. This approach reverts to old known-good behavior and minimizes the number of places changes are made.
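For context, the deadlock described above boils down to a single consumer loop making a synchronous call that feeds work back into its own buffered channel. The sketch below is illustrative only: the names (watcher, restart, updates) are made up and are not Nomad's actual types. It shows why dispatching the blocking call to a goroutine, as this commit does, lets the loop keep draining its channel.

package main

import (
    "fmt"
    "time"
)

type watcher struct {
    updates chan string // small buffer, drained only by run()
}

// restart stands in for a synchronous TaskRunner.Restart: handling it pushes
// follow-up updates back onto the watcher's own channel.
func (w *watcher) restart(task string) {
    for i := 0; i < 5; i++ { // more sends than the buffer can hold
        w.updates <- fmt.Sprintf("unwatch %s-%d", task, i)
    }
}

func (w *watcher) run(async bool) {
    for update := range w.updates {
        if update == "unhealthy" {
            if async {
                go w.restart("task1") // loop keeps draining; no deadlock
            } else {
                w.restart("task1") // blocks once the buffer fills: deadlock
            }
        }
    }
}

func main() {
    w := &watcher{updates: make(chan string, 2)}
    go w.run(true) // flip to false to reproduce the stall
    w.updates <- "unhealthy"
    time.Sleep(100 * time.Millisecond)
    fmt.Println("watcher loop is still draining updates")
}

With async set to false, the restart call blocks as soon as the two-slot buffer fills and the loop can never drain it again; that is the shape of the deadlock this commit removes.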
@@ -103,20 +103,34 @@ func (c *checkRestart) apply(ctx context.Context, now time.Time, status string)
         c.logger.Debug("restarting due to unhealthy check")
 
         // Tell TaskRunner to restart due to failure
-        const failure = true
         reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
         event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
-        err := c.task.Restart(ctx, event, failure)
-        if err != nil {
-            // Error restarting
-            return false
-        }
+        go asyncRestart(ctx, c.logger, c.task, event)
         return true
     }
 
     return false
 }
 
+// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
+// to be called in a goroutine.
+func asyncRestart(ctx context.Context, logger log.Logger, task TaskRestarter, event *structs.TaskEvent) {
+    // Check watcher restarts are always failures
+    const failure = true
+
+    // Restarting is asynchronous so there's no reason to allow this
+    // goroutine to block indefinitely.
+    ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+    defer cancel()
+
+    if err := task.Restart(ctx, event, failure); err != nil {
+        // Restart errors are not actionable and only relevant when
+        // debugging allocation lifecycle management.
+        logger.Debug("error restarting task", "error", err,
+            "event_time", event.Time, "event_type", event.Type)
+    }
+}
+
 // checkWatchUpdates add or remove checks from the watcher
 type checkWatchUpdate struct {
     checkID string
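The new asyncRestart helper bounds the fire-and-forget goroutine with a 10-second context timeout so a stuck restart cannot hold the goroutine forever. A stdlib-only sketch of that pattern follows; the names (restarter, slowTask, fireAndForgetRestart) are hypothetical stand-ins, not the Nomad types used in the change above.

package main

import (
    "context"
    "fmt"
    "time"
)

// restarter is a stand-in for what asyncRestart needs: a blocking,
// context-aware Restart call.
type restarter interface {
    Restart(ctx context.Context, reason string, failure bool) error
}

// slowTask simulates a restart that takes longer than the caller will wait.
type slowTask struct{ delay time.Duration }

func (s slowTask) Restart(ctx context.Context, reason string, failure bool) error {
    select {
    case <-time.After(s.delay):
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

// fireAndForgetRestart mirrors the shape of asyncRestart: run in a goroutine,
// always treat the restart as a failure, and never wait past the timeout.
func fireAndForgetRestart(ctx context.Context, task restarter, reason string, timeout time.Duration) {
    go func() {
        ctx, cancel := context.WithTimeout(ctx, timeout)
        defer cancel()
        if err := task.Restart(ctx, reason, true); err != nil {
            // Errors here are only useful for debugging; the caller has moved on.
            fmt.Println("restart error:", err)
        }
    }()
}

func main() {
    fireAndForgetRestart(context.Background(), slowTask{delay: 2 * time.Second}, "check unhealthy", 50*time.Millisecond)
    time.Sleep(200 * time.Millisecond) // give the goroutine time to hit its deadline
}

The timeout is the trade-off for going asynchronous: the caller no longer observes the restart result, so the goroutine must be self-limiting rather than relying on the caller to give up.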
@@ -9,6 +9,7 @@ import (
     "github.com/hashicorp/consul/api"
     "github.com/hashicorp/nomad/helper/testlog"
     "github.com/hashicorp/nomad/nomad/structs"
+    "github.com/stretchr/testify/require"
 )
 
 // checkRestartRecord is used by a testFakeCtx to record when restarts occur
@@ -194,6 +195,28 @@ func TestCheckWatcher_Healthy(t *testing.T) {
     }
 }
 
+// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once.
+func TestCheckWatcher_Unhealthy(t *testing.T) {
+    t.Parallel()
+
+    fakeAPI, cw := testWatcherSetup(t)
+
+    check1 := testCheck()
+    restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
+    cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
+
+    // Check has always been failing
+    fakeAPI.add("testcheck1", "critical", time.Time{})
+
+    // Run
+    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+    defer cancel()
+    cw.Run(ctx)
+
+    // Ensure restart was called exactly once
+    require.Len(t, restarter1.restarts, 1)
+}
+
 // TestCheckWatcher_HealthyWarning asserts checks in warning with
 // ignore_warnings=true do not restart tasks.
 func TestCheckWatcher_HealthyWarning(t *testing.T) {
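The new test drives the watcher under a short context deadline and then asserts that exactly one restart was recorded. A self-contained sketch of that assert-after-deadline approach is below; recorder and the polling loop are invented stand-ins, not the fake restarter or check watcher helpers used in the test above.

package watcher_test

import (
    "context"
    "testing"
    "time"

    "github.com/stretchr/testify/require"
)

// recorder remembers when restarts happened so the test can count them.
type recorder struct{ restarts []time.Time }

func (r *recorder) restart() { r.restarts = append(r.restarts, time.Now()) }

func TestRestartExactlyOnce(t *testing.T) {
    t.Parallel()

    r := &recorder{}
    restarted := false

    // Drive a polling loop under a short deadline, restarting only on the
    // first "unhealthy" tick, then assert on the recorded restarts.
    ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer cancel()

    ticker := time.NewTicker(10 * time.Millisecond)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            require.Len(t, r.restarts, 1) // exactly one restart despite many ticks
            return
        case <-ticker.C:
            if !restarted {
                r.restart()
                restarted = true
            }
        }
    }
}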