mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 10:25:42 +03:00
client: restart task on logmon failures
This code chooses to be conservative as opposed to optimal: when failing to reattach to logmon simply return a recoverable error instead of immediately trying to restart logmon. The recoverable error will cause the task's restart policy to be applied and a new logmon will be launched upon restart. Trying to do the optimal approach of simply starting a new logmon requires error string comparison and should be tested against a task actively logging to assert the behavior (are writes blocked? dropped?).
This commit is contained in:
@@ -12,6 +12,7 @@ import (
|
||||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
"github.com/hashicorp/nomad/client/logmon"
|
||||
"github.com/hashicorp/nomad/helper/uuid"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
)
|
||||
|
||||
@@ -102,8 +103,10 @@ func (h *logmonHook) Prestart(ctx context.Context,
|
||||
|
||||
// Launch or reattach logmon instance for the task.
|
||||
if err := h.launchLogMon(reattachConfig); err != nil {
|
||||
// Retry errors launching logmon as logmon may have crashed and
|
||||
// subsequent attempts will start a new one.
|
||||
h.logger.Error("failed to launch logmon process", "error", err)
|
||||
return err
|
||||
return structs.NewRecoverableError(err, true)
|
||||
}
|
||||
|
||||
// Only tell logmon to start when we are not reattaching to a running instance
|
||||
|
||||
@@ -1,18 +1,24 @@
|
||||
// +build !windows
|
||||
|
||||
package taskrunner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"os"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
plugin "github.com/hashicorp/go-plugin"
|
||||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
@@ -100,3 +106,72 @@ func TestTaskRunner_LogmonHook_StartStop(t *testing.T) {
|
||||
// Running stop should shutdown logmon
|
||||
require.NoError(t, hook.Stop(context.Background(), nil, nil))
|
||||
}
|
||||
|
||||
// TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the
|
||||
// Nomad client is restarting and asserts failing to reattach to logmon causes
|
||||
// a recoverable error (task restart).
|
||||
func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
alloc := mock.BatchAlloc()
|
||||
task := alloc.Job.TaskGroups[0].Tasks[0]
|
||||
|
||||
dir, err := ioutil.TempDir("", "nomadtest")
|
||||
require.NoError(t, err)
|
||||
defer func() {
|
||||
require.NoError(t, os.RemoveAll(dir))
|
||||
}()
|
||||
|
||||
hookConf := newLogMonHookConfig(task.Name, dir)
|
||||
hook := newLogMonHook(hookConf, testlog.HCLogger(t))
|
||||
|
||||
req := interfaces.TaskPrestartRequest{
|
||||
Task: task,
|
||||
}
|
||||
resp := interfaces.TaskPrestartResponse{}
|
||||
|
||||
// First start
|
||||
require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
|
||||
defer hook.Stop(context.Background(), nil, nil)
|
||||
|
||||
origHookData := resp.HookData[logmonReattachKey]
|
||||
require.NotEmpty(t, origHookData)
|
||||
|
||||
// Pluck PID out of reattach synthesize a crash
|
||||
reattach := struct {
|
||||
Pid int
|
||||
}{}
|
||||
require.NoError(t, json.Unmarshal([]byte(origHookData), &reattach))
|
||||
pid := reattach.Pid
|
||||
require.NotZero(t, pid)
|
||||
|
||||
proc, _ := os.FindProcess(pid)
|
||||
|
||||
// Assert logmon is running
|
||||
require.NoError(t, proc.Signal(syscall.Signal(0)))
|
||||
|
||||
// Kill it
|
||||
require.NoError(t, proc.Signal(os.Kill))
|
||||
|
||||
// Since signals are asynchronous wait for the process to die
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
err := proc.Signal(syscall.Signal(0))
|
||||
return err != nil, fmt.Errorf("pid %d still running", pid)
|
||||
}, func(err error) {
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
// Running prestart again should return a recoverable error with no
|
||||
// reattach config to cause the task to be restarted with a new logmon.
|
||||
req.HookData = map[string]string{
|
||||
logmonReattachKey: origHookData,
|
||||
}
|
||||
resp = interfaces.TaskPrestartResponse{}
|
||||
err = hook.Prestart(context.Background(), &req, &resp)
|
||||
require.Error(t, err)
|
||||
require.True(t, structs.IsRecoverable(err))
|
||||
require.Empty(t, resp.HookData)
|
||||
|
||||
// Running stop should shutdown logmon
|
||||
require.NoError(t, hook.Stop(context.Background(), nil, nil))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user