From b1ce39297285564780a064cf186de6424b2ae0de Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 22 Apr 2022 13:07:47 -0400 Subject: [PATCH] CSI: plugin supervisor prestart should not mark itself done (#12752) The task runner hook `Prestart` response object includes a `Done` field that's intended to tell the client not to run the hook again. The plugin supervisor creates mount points for the task during prestart and saves these mounts in the hook resources. But if a client restarts the hook resources will not be populated. If the plugin task restarts at any time after the client restarts, it will fail to have the correct mounts and crash loop until restart attempts run out. Fix this by not returning `Done` in the response, just as we do for the `volume_mount_hook`. --- .changelog/12752.txt | 3 +++ .../allocrunner/taskrunner/plugin_supervisor_hook.go | 12 +++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) create mode 100644 .changelog/12752.txt diff --git a/.changelog/12752.txt b/.changelog/12752.txt new file mode 100644 index 000000000..5e271c950 --- /dev/null +++ b/.changelog/12752.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where plugins would not restart if they failed any time after a client restart +``` diff --git a/client/allocrunner/taskrunner/plugin_supervisor_hook.go b/client/allocrunner/taskrunner/plugin_supervisor_hook.go index 415b6e20b..6e63dcfbe 100644 --- a/client/allocrunner/taskrunner/plugin_supervisor_hook.go +++ b/client/allocrunner/taskrunner/plugin_supervisor_hook.go @@ -129,10 +129,10 @@ func (*csiPluginSupervisorHook) Name() string { } // Prestart is called before the task is started including after every -// restart (but not after restore). This requires that the mount paths -// for a plugin be idempotent, despite us not knowing the name of the -// plugin ahead of time. Because of this, we use the allocid_taskname -// as the unique identifier for a plugin on the filesystem. +// restart. This requires that the mount paths for a plugin be +// idempotent, despite us not knowing the name of the plugin ahead of +// time. Because of this, we use the allocid_taskname as the unique +// identifier for a plugin on the filesystem. func (h *csiPluginSupervisorHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error { @@ -191,9 +191,11 @@ func (h *csiPluginSupervisorHook) Prestart(ctx context.Context, mounts = ensureMountpointInserted(mounts, volumeStagingMounts) mounts = ensureMountpointInserted(mounts, devMount) + // we normally would set resp.Mounts here but without setting the + // hookResources before returning we can get a postrun hook that's + // missing resources. h.runner.hookResources.setMounts(mounts) - resp.Done = true return nil }