CSI: make gRPC client creation more robust (#12057)

Nomad communicates with CSI plugin tasks via gRPC. The plugin supervisor hook uses this to ping the plugin for health checks which it emits as task events. After the first successful health check the plugin supervisor registers the plugin in the client's dynamic plugin registry, which in turn creates a CSI plugin manager instance that has its own gRPC client for fingerprinting the plugin and sending mount requests. If the plugin manager instance fails to connect to the plugin on its first attempt, it exits. The plugin supervisor hook is unaware that connection failed so long as its own pings continue to work. A transient failure during plugin startup may mislead the plugin supervisor hook into thinking the plugin is up (so there's no need to restart the allocation) but no fingerprinter is started. * Refactors the gRPC client to connect on first use. This provides the plugin manager instance the ability to retry the gRPC client connection until success. * Add a 30s timeout to the plugin supervisor so that we don't poll forever waiting for a plugin that will never come back up. Minor improvements: * The plugin supervisor hook creates a new gRPC client for every probe and then throws it away. Instead, reuse the client as we do for the plugin manager. * The gRPC client constructor has a 1 second timeout. Clarify that this timeout applies to the connection and not the rest of the client lifetime.
2026-01-06 18:35:44 +03:00 · 2022-02-15 16:57:29 -05:00
parent 07f4227d76
commit b775a73ded
7 changed files with 203 additions and 148 deletions
--- a/client/allocrunner/taskrunner/plugin_supervisor_hook.go
+++ b/client/allocrunner/taskrunner/plugin_supervisor_hook.go
@@ -38,6 +38,7 @@ type csiPluginSupervisorHook struct {

 	// eventEmitter is used to emit events to the task
 	eventEmitter ti.EventEmitter
+	lifecycle    ti.TaskLifecycle

 	shutdownCtx      context.Context
 	shutdownCancelFn context.CancelFunc
@@ -54,6 +55,7 @@ type csiPluginSupervisorHookConfig struct {
 	clientStateDirPath string
 	events             ti.EventEmitter
 	runner             *TaskRunner
+	lifecycle          ti.TaskLifecycle
 	capabilities       *drivers.Capabilities
 	logger             hclog.Logger
 }
@@ -90,6 +92,7 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
 	hook := &csiPluginSupervisorHook{
 		alloc:            config.runner.Alloc(),
 		runner:           config.runner,
+		lifecycle:        config.lifecycle,
 		logger:           config.logger,
 		task:             task,
 		mountPoint:       pluginRoot,
@@ -201,27 +204,41 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
 	}()

 	socketPath := filepath.Join(h.mountPoint, structs.CSISocketName)
+
+	client := csi.NewClient(socketPath, h.logger.Named("csi_client").With(
+		"plugin.name", h.task.CSIPluginConfig.ID,
+		"plugin.type", h.task.CSIPluginConfig.Type))
+	defer client.Close()
+
 	t := time.NewTimer(0)

+	// We're in Poststart at this point, so if we can't connect within
+	// this deadline, assume it's broken so we can restart the task
+	startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second)
+	defer startCancelFn()
+
+	var err error
+	var pluginHealthy bool
+
 	// Step 1: Wait for the plugin to initially become available.
 WAITFORREADY:
 	for {
 		select {
-		case <-ctx.Done():
+		case <-startCtx.Done():
+			h.kill(ctx, fmt.Errorf("CSI plugin failed probe: %v", err))
 			return
 		case <-t.C:
-			pluginHealthy, err := h.supervisorLoopOnce(ctx, socketPath)
+			pluginHealthy, err = h.supervisorLoopOnce(startCtx, client)
 			if err != nil || !pluginHealthy {
-				h.logger.Debug("CSI Plugin not ready", "error", err)
-
-				// Plugin is not yet returning healthy, because we want to optimise for
-				// quickly bringing a plugin online, we use a short timeout here.
-				// TODO(dani): Test with more plugins and adjust.
+				h.logger.Debug("CSI plugin not ready", "error", err)
+				// Use only a short delay here to optimize for quickly
+				// bringing up a plugin
 				t.Reset(5 * time.Second)
 				continue
 			}

 			// Mark the plugin as healthy in a task event
+			h.logger.Debug("CSI plugin is ready")
 			h.previousHealthState = pluginHealthy
 			event := structs.NewTaskEvent(structs.TaskPluginHealthy)
 			event.SetMessage(fmt.Sprintf("plugin: %s", h.task.CSIPluginConfig.ID))
@@ -232,15 +249,14 @@ WAITFORREADY:
 	}

 	// Step 2: Register the plugin with the catalog.
-	deregisterPluginFn, err := h.registerPlugin(socketPath)
+	deregisterPluginFn, err := h.registerPlugin(client, socketPath)
 	if err != nil {
-		h.logger.Error("CSI Plugin registration failed", "error", err)
-		event := structs.NewTaskEvent(structs.TaskPluginUnhealthy)
-		event.SetMessage(fmt.Sprintf("failed to register plugin: %s, reason: %v", h.task.CSIPluginConfig.ID, err))
-		h.eventEmitter.EmitEvent(event)
+		h.kill(ctx, fmt.Errorf("CSI plugin failed to register: %v", err))
+		return
 	}

-	// Step 3: Start the lightweight supervisor loop.
+	// Step 3: Start the lightweight supervisor loop. At this point, failures
+	// don't cause the task to restart
 	t.Reset(0)
 	for {
 		select {
@@ -249,9 +265,9 @@ WAITFORREADY:
 			deregisterPluginFn()
 			return
 		case <-t.C:
-			pluginHealthy, err := h.supervisorLoopOnce(ctx, socketPath)
+			pluginHealthy, err := h.supervisorLoopOnce(ctx, client)
 			if err != nil {
-				h.logger.Error("CSI Plugin fingerprinting failed", "error", err)
+				h.logger.Error("CSI plugin fingerprinting failed", "error", err)
 			}

 			// The plugin has transitioned to a healthy state. Emit an event.
@@ -265,7 +281,7 @@ WAITFORREADY:
 			if h.previousHealthState && !pluginHealthy {
 				event := structs.NewTaskEvent(structs.TaskPluginUnhealthy)
 				if err != nil {
-					event.SetMessage(fmt.Sprintf("error: %v", err))
+					event.SetMessage(fmt.Sprintf("Error: %v", err))
 				} else {
 					event.SetMessage("Unknown Reason")
 				}
@@ -281,16 +297,9 @@ WAITFORREADY:
 	}
 }

-func (h *csiPluginSupervisorHook) registerPlugin(socketPath string) (func(), error) {
-
+func (h *csiPluginSupervisorHook) registerPlugin(client csi.CSIPlugin, socketPath string) (func(), error) {
 	// At this point we know the plugin is ready and we can fingerprint it
 	// to get its vendor name and version
-	client, err := csi.NewClient(socketPath, h.logger.Named("csi_client").With("plugin.name", h.task.CSIPluginConfig.ID, "plugin.type", h.task.CSIPluginConfig.Type))
-	if err != nil {
-		return nil, fmt.Errorf("failed to create csi client: %v", err)
-	}
-	defer client.Close()
-
 	info, err := client.PluginInfo()
 	if err != nil {
 		return nil, fmt.Errorf("failed to probe plugin: %v", err)
@@ -354,21 +363,13 @@ func (h *csiPluginSupervisorHook) registerPlugin(socketPath string) (func(), err
 	}, nil
 }

-func (h *csiPluginSupervisorHook) supervisorLoopOnce(ctx context.Context, socketPath string) (bool, error) {
-	_, err := os.Stat(socketPath)
-	if err != nil {
-		return false, fmt.Errorf("failed to stat socket: %v", err)
-	}
+func (h *csiPluginSupervisorHook) supervisorLoopOnce(ctx context.Context, client csi.CSIPlugin) (bool, error) {
+	probeCtx, probeCancelFn := context.WithTimeout(ctx, 5*time.Second)
+	defer probeCancelFn()

-	client, err := csi.NewClient(socketPath, h.logger.Named("csi_client").With("plugin.name", h.task.CSIPluginConfig.ID, "plugin.type", h.task.CSIPluginConfig.Type))
+	healthy, err := client.PluginProbe(probeCtx)
 	if err != nil {
-		return false, fmt.Errorf("failed to create csi client: %v", err)
-	}
-	defer client.Close()
-
-	healthy, err := client.PluginProbe(ctx)
-	if err != nil {
-		return false, fmt.Errorf("failed to probe plugin: %v", err)
+		return false, err
 	}

 	return healthy, nil
@@ -387,6 +388,21 @@ func (h *csiPluginSupervisorHook) Stop(_ context.Context, req *interfaces.TaskSt
 	return nil
 }

+func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) {
+	h.logger.Error("killing task because plugin failed", "error", reason)
+	event := structs.NewTaskEvent(structs.TaskPluginUnhealthy)
+	event.SetMessage(fmt.Sprintf("Error: %v", reason.Error()))
+	h.eventEmitter.EmitEvent(event)
+
+	if err := h.lifecycle.Kill(ctx,
+		structs.NewTaskEvent(structs.TaskKilling).
+			SetFailsTask().
+			SetDisplayMessage("CSI plugin did not become healthy before timeout"),
+	); err != nil {
+		h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
+	}
+}
+
 func ensureMountpointInserted(mounts []*drivers.MountConfig, mount *drivers.MountConfig) []*drivers.MountConfig {
 	for _, mnt := range mounts {
 		if mnt.IsEqual(mount) {
--- a/client/allocrunner/taskrunner/task_runner_hooks.go
+++ b/client/allocrunner/taskrunner/task_runner_hooks.go
@@ -76,6 +76,7 @@ func (tr *TaskRunner) initHooks() {
 				clientStateDirPath: tr.clientConfig.StateDir,
 				events:             tr,
 				runner:             tr,
+				lifecycle:          tr,
 				capabilities:       tr.driverCapabilities,
 				logger:             hookLogger,
 			}))