Merge pull request #4059 from hashicorp/b-drain-health-svc-only

only service allocs should have health watched
Michael Schurter (committed by GitHub), 2018-03-28 16:49:22 -07:00
2 changed files with 96 additions and 22 deletions


@@ -31,31 +31,29 @@ func (r *AllocRunner) watchHealth(ctx context.Context) {
     // See if we should watch the allocs health
     alloc := r.Alloc()
+    // Neither deployments nor migrations care about the health of
+    // non-service jobs so never watch their health
+    if alloc.Job.Type != structs.JobTypeService {
+        return
+    }
+    // No need to watch health as it's already set
     if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
-        // No need to watch health as it's already set
         return
     }
-    // Neither deployments nor migrations care about system jobs so never
-    // watch their health
-    if alloc.Job.Type == structs.JobTypeSystem {
-        return
-    }
-    isDeploy := alloc.DeploymentID != ""
-    // Migrations don't consider the health of batch jobs so only watch
-    // batch health during deployments
-    if !isDeploy && alloc.Job.Type == structs.JobTypeBatch {
-        return
-    }
     tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
     if tg == nil {
-        r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
+        r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
+            alloc.ID, alloc.TaskGroup)
         return
     }
+    isDeploy := alloc.DeploymentID != ""
     // No need to watch allocs for deployments that rely on operators
     // manually setting health
     if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
         return
     }
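Taken together, the reordered guards mean health is only watched for service allocs whose health is not already set, whose task group can be found, and whose deployment (if any) does not rely on operators setting health manually. A minimal, self-contained sketch of that decision, using simplified illustrative types rather than Nomad's real nomad/structs definitions:

package main

import "fmt"

// Illustrative stand-ins for Nomad's structs; field names and string
// values are assumptions for this sketch, not the real types.
type UpdateStrategy struct {
	HealthCheck string // e.g. "checks", "task_states", or "manual"
}

type TaskGroup struct {
	Update *UpdateStrategy
}

type Alloc struct {
	JobType          string // "service", "batch", or "system"
	DeploymentID     string
	HealthAlreadySet bool
	TaskGroup        *TaskGroup // stands in for the LookupTaskGroup result
}

// shouldWatchHealth mirrors the guard order introduced by this change.
func shouldWatchHealth(a *Alloc) bool {
	// Neither deployments nor migrations care about non-service jobs.
	if a.JobType != "service" {
		return false
	}
	// Health was already reported; nothing left to watch.
	if a.HealthAlreadySet {
		return false
	}
	// Corresponds to the LookupTaskGroup failure path.
	if a.TaskGroup == nil {
		return false
	}
	// Deployments that rely on operators manually setting health are skipped.
	isDeploy := a.DeploymentID != ""
	if isDeploy && (a.TaskGroup.Update == nil || a.TaskGroup.Update.HealthCheck == "manual") {
		return false
	}
	return true
}

func main() {
	batch := &Alloc{JobType: "batch", TaskGroup: &TaskGroup{}}
	svc := &Alloc{
		JobType:      "service",
		DeploymentID: "deploy-1",
		TaskGroup:    &TaskGroup{Update: &UpdateStrategy{HealthCheck: "checks"}},
	}
	fmt.Println(shouldWatchHealth(batch)) // false: batch allocs are never watched
	fmt.Println(shouldWatchHealth(svc))   // true: service alloc in a checks-based deployment
}

Note that moving the service-type check to the top is what makes migrations of service allocs (DeploymentID == "") eligible for health watching while batch and system allocs are excluded in every case.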


@@ -115,7 +115,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
     assert := assert.New(t)
     // Ensure the task fails and restarts
-    upd, ar := testAllocRunner(t, false)
+    upd, ar := testAllocRunner(t, true)
     // Make the task fail
     task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -163,7 +163,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
     assert := assert.New(t)
     // Ensure the task fails and restarts
-    upd, ar := testAllocRunner(t, false)
+    upd, ar := testAllocRunner(t, true)
     // Make the task block
     task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -211,7 +211,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
     t.Parallel()
     // Ensure the task fails and restarts
-    upd, ar := testAllocRunner(t, false)
+    upd, ar := testAllocRunner(t, true)
     // Make the task run healthy
     task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -259,7 +259,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
     t.Parallel()
     // Ensure the task fails and restarts
-    upd, ar := testAllocRunner(t, false)
+    upd, ar := testAllocRunner(t, true)
     // Make the task fail
     task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -352,7 +352,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
     assert := assert.New(t)
     // Ensure the task fails and restarts
-    upd, ar := testAllocRunner(t, false)
+    upd, ar := testAllocRunner(t, true)
     // Make the task fail
     task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -421,7 +421,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
     t.Parallel()
     // Ensure the task fails and restarts
-    upd, ar := testAllocRunner(t, false)
+    upd, ar := testAllocRunner(t, true)
     // Make the task run healthy
     task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -475,6 +475,82 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
     })
 }
+// Test that health is reported for services that got migrated; not just part
+// of deployments.
+func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
+    t.Parallel()
+    // Ensure the task fails and restarts
+    upd, ar := testAllocRunner(t, true)
+    // Make the task run healthy
+    tg := ar.alloc.Job.TaskGroups[0]
+    task := tg.Tasks[0]
+    task.Driver = "mock_driver"
+    task.Config["run_for"] = "30s"
+    // Shorten the default migration healthy time
+    tg.Migrate = structs.DefaultMigrateStrategy()
+    tg.Migrate.MinHealthyTime = 100 * time.Millisecond
+    tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
+    // Ensure the alloc is *not* part of a deployment
+    ar.alloc.DeploymentID = ""
+    go ar.Run()
+    defer ar.Destroy()
+    testutil.WaitForResult(func() (bool, error) {
+        _, last := upd.Last()
+        if last == nil {
+            return false, fmt.Errorf("No updates")
+        }
+        if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
+            return false, fmt.Errorf("want deployment status healthy; got unset")
+        } else if !*last.DeploymentStatus.Healthy {
+            return false, fmt.Errorf("want deployment status healthy; got unhealthy")
+        }
+        return true, nil
+    }, func(err error) {
+        t.Fatalf("err: %v", err)
+    })
+}
+// Test that health is *not* reported for batch jobs
+func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
+    t.Parallel()
+    // Ensure the task fails and restarts
+    alloc := mock.BatchAlloc()
+    tg := alloc.Job.TaskGroups[0]
+    // This should not be possible as validation should prevent batch jobs
+    // from having a migration stanza!
+    tg.Migrate = structs.DefaultMigrateStrategy()
+    tg.Migrate.MinHealthyTime = 1 * time.Millisecond
+    tg.Migrate.HealthyDeadline = 2 * time.Millisecond
+    tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
+    task := tg.Tasks[0]
+    task.Driver = "mock_driver"
+    task.Config["run_for"] = "5s"
+    upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
+    go ar.Run()
+    defer ar.Destroy()
+    for i := 0; i < 10; i++ {
+        time.Sleep(10 * time.Millisecond)
+        _, last := upd.Last()
+        if last == nil {
+            continue
+        }
+        if last.DeploymentStatus != nil {
+            t.Fatalf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
+        }
+    }
+}
 // TestAllocRuner_RetryArtifact ensures that if one task in a task group is
 // retrying fetching an artifact, other tasks in the group should be able
 // to proceed.
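For context on the new migration test: it replaces two fields of the default migrate strategy so the alloc can be declared healthy from task states within roughly 100ms rather than waiting on Consul checks. Below is a sketch of what those defaults presumably are, assuming structs.DefaultMigrateStrategy mirrors the documented migrate stanza defaults (max_parallel = 1, health_check = "checks", min_healthy_time = "10s", healthy_deadline = "5m"); the struct is an illustrative stand-in, not the real nomad/structs type.

package main

import (
	"fmt"
	"time"
)

// Illustrative stand-in for nomad/structs.MigrateStrategy.
type MigrateStrategy struct {
	MaxParallel     int
	HealthCheck     string
	MinHealthyTime  time.Duration
	HealthyDeadline time.Duration
}

// defaultMigrateStrategy sketches the documented migrate stanza defaults.
func defaultMigrateStrategy() *MigrateStrategy {
	return &MigrateStrategy{
		MaxParallel:     1,
		HealthCheck:     "checks", // health derived from Consul checks
		MinHealthyTime:  10 * time.Second,
		HealthyDeadline: 5 * time.Minute,
	}
}

func main() {
	// The migration test overrides these so health can be derived from
	// task states almost immediately, keeping the test fast.
	m := defaultMigrateStrategy()
	m.HealthCheck = "task_states"
	m.MinHealthyTime = 100 * time.Millisecond
	fmt.Printf("%+v\n", m)
}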