consul: fix deadlock in check-based restarts
Fixes #5395
Alternative to #5957

Make task restarting asynchronous when handling check-based restarts. This matches the pre-0.9 behavior where TaskRunner.Restart was an asynchronous signal.

The check-based restarting code was not designed to handle blocking in TaskRunner.Restart. 0.9 made it reentrant and could easily overwhelm the buffered update chan and deadlock.

Many thanks to @byronwolfman for his excellent debugging, PR, and reproducer!

I created this alternative as changing the functionality of TaskRunner.Restart has a much larger impact. This approach reverts to old known-good behavior and minimizes the number of places changes are made.
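For context, the deadlock described above boils down to a single consumer loop making a synchronous call that feeds work back into its own buffered channel. The sketch below is illustrative only: the names (watcher, restart, updates) are made up and are not Nomad's actual types. It shows why dispatching the blocking call to a goroutine, as this commit does, lets the loop keep draining its channel.

package main

import (
    "fmt"
    "time"
)

type watcher struct {
    updates chan string // small buffer, drained only by run()
}

// restart stands in for a synchronous TaskRunner.Restart: handling it pushes
// follow-up updates back onto the watcher's own channel.
func (w *watcher) restart(task string) {
    for i := 0; i < 5; i++ { // more sends than the buffer can hold
        w.updates <- fmt.Sprintf("unwatch %s-%d", task, i)
    }
}

func (w *watcher) run(async bool) {
    for update := range w.updates {
        if update == "unhealthy" {
            if async {
                go w.restart("task1") // loop keeps draining; no deadlock
            } else {
                w.restart("task1") // blocks once the buffer fills: deadlock
            }
        }
    }
}

func main() {
    w := &watcher{updates: make(chan string, 2)}
    go w.run(true) // flip to false to reproduce the stall
    w.updates <- "unhealthy"
    time.Sleep(100 * time.Millisecond)
    fmt.Println("watcher loop is still draining updates")
}

With async set to false, the restart call blocks as soon as the two-slot buffer fills and the loop can never drain it again; that is the shape of the deadlock this commit removes.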
@@ -103,20 +103,34 @@ func (c *checkRestart) apply(ctx context.Context, now time.Time, status string)
         c.logger.Debug("restarting due to unhealthy check")
 
         // Tell TaskRunner to restart due to failure
-        const failure = true
         reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
         event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
-        err := c.task.Restart(ctx, event, failure)
-        if err != nil {
-            // Error restarting
-            return false
-        }
+        go asyncRestart(ctx, c.logger, c.task, event)
         return true
     }
 
     return false
 }
 
+// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
+// to be called in a goroutine.
+func asyncRestart(ctx context.Context, logger log.Logger, task TaskRestarter, event *structs.TaskEvent) {
+    // Check watcher restarts are always failures
+    const failure = true
+
+    // Restarting is asynchronous so there's no reason to allow this
+    // goroutine to block indefinitely.
+    ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+    defer cancel()
+
+    if err := task.Restart(ctx, event, failure); err != nil {
+        // Restart errors are not actionable and only relevant when
+        // debugging allocation lifecycle management.
+        logger.Debug("error restarting task", "error", err,
+            "event_time", event.Time, "event_type", event.Type)
+    }
+}
+
 // checkWatchUpdates add or remove checks from the watcher
 type checkWatchUpdate struct {
     checkID string
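The new asyncRestart helper bounds the fire-and-forget goroutine with a 10-second context timeout so a stuck restart cannot hold the goroutine forever. A stdlib-only sketch of that pattern follows; the names (restarter, slowTask, fireAndForgetRestart) are hypothetical stand-ins, not the Nomad types used in the change above.

package main

import (
    "context"
    "fmt"
    "time"
)

// restarter is a stand-in for what asyncRestart needs: a blocking,
// context-aware Restart call.
type restarter interface {
    Restart(ctx context.Context, reason string, failure bool) error
}

// slowTask simulates a restart that takes longer than the caller will wait.
type slowTask struct{ delay time.Duration }

func (s slowTask) Restart(ctx context.Context, reason string, failure bool) error {
    select {
    case <-time.After(s.delay):
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

// fireAndForgetRestart mirrors the shape of asyncRestart: run in a goroutine,
// always treat the restart as a failure, and never wait past the timeout.
func fireAndForgetRestart(ctx context.Context, task restarter, reason string, timeout time.Duration) {
    go func() {
        ctx, cancel := context.WithTimeout(ctx, timeout)
        defer cancel()
        if err := task.Restart(ctx, reason, true); err != nil {
            // Errors here are only useful for debugging; the caller has moved on.
            fmt.Println("restart error:", err)
        }
    }()
}

func main() {
    fireAndForgetRestart(context.Background(), slowTask{delay: 2 * time.Second}, "check unhealthy", 50*time.Millisecond)
    time.Sleep(200 * time.Millisecond) // give the goroutine time to hit its deadline
}

The timeout is the trade-off for going asynchronous: the caller no longer observes the restart result, so the goroutine must be self-limiting rather than relying on the caller to give up.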
@@ -9,6 +9,7 @@ import (
     "github.com/hashicorp/consul/api"
     "github.com/hashicorp/nomad/helper/testlog"
     "github.com/hashicorp/nomad/nomad/structs"
+    "github.com/stretchr/testify/require"
 )
 
 // checkRestartRecord is used by a testFakeCtx to record when restarts occur
@@ -194,6 +195,28 @@ func TestCheckWatcher_Healthy(t *testing.T) {
     }
 }
 
+// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once.
+func TestCheckWatcher_Unhealthy(t *testing.T) {
+    t.Parallel()
+
+    fakeAPI, cw := testWatcherSetup(t)
+
+    check1 := testCheck()
+    restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
+    cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
+
+    // Check has always been failing
+    fakeAPI.add("testcheck1", "critical", time.Time{})
+
+    // Run
+    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+    defer cancel()
+    cw.Run(ctx)
+
+    // Ensure restart was called exactly once
+    require.Len(t, restarter1.restarts, 1)
+}
+
 // TestCheckWatcher_HealthyWarning asserts checks in warning with
 // ignore_warnings=true do not restart tasks.
 func TestCheckWatcher_HealthyWarning(t *testing.T) {
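The new test drives the watcher under a short context deadline and then asserts that exactly one restart was recorded. A self-contained sketch of that assert-after-deadline approach is below; recorder and the polling loop are invented stand-ins, not the fake restarter or check watcher helpers used in the test above.

package watcher_test

import (
    "context"
    "testing"
    "time"

    "github.com/stretchr/testify/require"
)

// recorder remembers when restarts happened so the test can count them.
type recorder struct{ restarts []time.Time }

func (r *recorder) restart() { r.restarts = append(r.restarts, time.Now()) }

func TestRestartExactlyOnce(t *testing.T) {
    t.Parallel()

    r := &recorder{}
    restarted := false

    // Drive a polling loop under a short deadline, restarting only on the
    // first "unhealthy" tick, then assert on the recorded restarts.
    ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer cancel()

    ticker := time.NewTicker(10 * time.Millisecond)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            require.Len(t, r.restarts, 1) // exactly one restart despite many ticks
            return
        case <-ticker.C:
            if !restarted {
                r.restart()
                restarted = true
            }
        }
    }
}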