From 05bae8d1492b96523a6ee4da49df03d7f5e41cb2 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 25 Feb 2019 15:42:45 -0800 Subject: [PATCH] client: restart task on logmon failures This code chooses to be conservative as opposed to optimal: when failing to reattach to logmon simply return a recoverable error instead of immediately trying to restart logmon. The recoverable error will cause the task's restart policy to be applied and a new logmon will be launched upon restart. Trying to do the optimal approach of simply starting a new logmon requires error string comparison and should be tested against a task actively logging to assert the behavior (are writes blocked? dropped?). --- client/allocrunner/taskrunner/logmon_hook.go | 5 +- .../taskrunner/logmon_hook_test.go | 75 +++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/client/allocrunner/taskrunner/logmon_hook.go b/client/allocrunner/taskrunner/logmon_hook.go index 93d8ef37d..acd58300f 100644 --- a/client/allocrunner/taskrunner/logmon_hook.go +++ b/client/allocrunner/taskrunner/logmon_hook.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/logmon" "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" pstructs "github.com/hashicorp/nomad/plugins/shared/structs" ) @@ -102,8 +103,10 @@ func (h *logmonHook) Prestart(ctx context.Context, // Launch or reattach logmon instance for the task. if err := h.launchLogMon(reattachConfig); err != nil { + // Retry errors launching logmon as logmon may have crashed and + // subsequent attempts will start a new one. h.logger.Error("failed to launch logmon process", "error", err) - return err + return structs.NewRecoverableError(err, true) } // Only tell logmon to start when we are not reattaching to a running instance diff --git a/client/allocrunner/taskrunner/logmon_hook_test.go b/client/allocrunner/taskrunner/logmon_hook_test.go index 1f84b4971..c1fa8c2fc 100644 --- a/client/allocrunner/taskrunner/logmon_hook_test.go +++ b/client/allocrunner/taskrunner/logmon_hook_test.go @@ -1,18 +1,24 @@ +// +build !windows + package taskrunner import ( "context" "encoding/json" + "fmt" "io/ioutil" "net" "os" + "syscall" "testing" plugin "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" pstructs "github.com/hashicorp/nomad/plugins/shared/structs" + "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -100,3 +106,72 @@ func TestTaskRunner_LogmonHook_StartStop(t *testing.T) { // Running stop should shutdown logmon require.NoError(t, hook.Stop(context.Background(), nil, nil)) } + +// TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the +// Nomad client is restarting and asserts failing to reattach to logmon causes +// a recoverable error (task restart). +func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) { + t.Parallel() + + alloc := mock.BatchAlloc() + task := alloc.Job.TaskGroups[0].Tasks[0] + + dir, err := ioutil.TempDir("", "nomadtest") + require.NoError(t, err) + defer func() { + require.NoError(t, os.RemoveAll(dir)) + }() + + hookConf := newLogMonHookConfig(task.Name, dir) + hook := newLogMonHook(hookConf, testlog.HCLogger(t)) + + req := interfaces.TaskPrestartRequest{ + Task: task, + } + resp := interfaces.TaskPrestartResponse{} + + // First start + require.NoError(t, hook.Prestart(context.Background(), &req, &resp)) + defer hook.Stop(context.Background(), nil, nil) + + origHookData := resp.HookData[logmonReattachKey] + require.NotEmpty(t, origHookData) + + // Pluck PID out of reattach synthesize a crash + reattach := struct { + Pid int + }{} + require.NoError(t, json.Unmarshal([]byte(origHookData), &reattach)) + pid := reattach.Pid + require.NotZero(t, pid) + + proc, _ := os.FindProcess(pid) + + // Assert logmon is running + require.NoError(t, proc.Signal(syscall.Signal(0))) + + // Kill it + require.NoError(t, proc.Signal(os.Kill)) + + // Since signals are asynchronous wait for the process to die + testutil.WaitForResult(func() (bool, error) { + err := proc.Signal(syscall.Signal(0)) + return err != nil, fmt.Errorf("pid %d still running", pid) + }, func(err error) { + require.NoError(t, err) + }) + + // Running prestart again should return a recoverable error with no + // reattach config to cause the task to be restarted with a new logmon. + req.HookData = map[string]string{ + logmonReattachKey: origHookData, + } + resp = interfaces.TaskPrestartResponse{} + err = hook.Prestart(context.Background(), &req, &resp) + require.Error(t, err) + require.True(t, structs.IsRecoverable(err)) + require.Empty(t, resp.HookData) + + // Running stop should shutdown logmon + require.NoError(t, hook.Stop(context.Background(), nil, nil)) +}