From 12dd17affeec05511a27e4a4b5df27fe39402772 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 27 Mar 2018 17:22:04 -0700
Subject: [PATCH] only service allocs should have health watched

---
 client/alloc_runner_health_watcher.go | 30 +++++----
 client/alloc_runner_test.go           | 88 +++++++++++++++++++++++++--
 2 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go
index b57f9c46e..2134e7ee2 100644
--- a/client/alloc_runner_health_watcher.go
+++ b/client/alloc_runner_health_watcher.go
@@ -31,31 +31,29 @@ func (r *AllocRunner) watchHealth(ctx context.Context) {
 
 	// See if we should watch the allocs health
 	alloc := r.Alloc()
+
+	// Neither deployments nor migrations care about the health of
+	// non-service jobs so never watch their health
+	if alloc.Job.Type != structs.JobTypeService {
+		return
+	}
+
+	// No need to watch health as it's already set
 	if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
-		// No need to watch health as it's already set
-		return
-	}
-
-	// Neither deployments nor migrations care about system jobs so never
-	// watch their health
-	if alloc.Job.Type == structs.JobTypeSystem {
-		return
-	}
-
-	isDeploy := alloc.DeploymentID != ""
-
-	// Migrations don't consider the health of batch jobs so only watch
-	// batch health during deployments
-	if !isDeploy && alloc.Job.Type == structs.JobTypeBatch {
 		return
 	}
 
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
 	if tg == nil {
-		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
+		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
+			alloc.ID, alloc.TaskGroup)
 		return
 	}
 
+	isDeploy := alloc.DeploymentID != ""
+
+	// No need to watch allocs for deployments that rely on operators
+	// manually setting health
 	if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
 		return
 	}
diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go
index b2927f86e..c32531bc4 100644
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@@ -115,7 +115,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
 	assert := assert.New(t)
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task fail
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -163,7 +163,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
 	assert := assert.New(t)
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task block
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -211,7 +211,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
 	t.Parallel()
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task run healthy
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -259,7 +259,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
 	t.Parallel()
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task fail
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -352,7 +352,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
 	assert := assert.New(t)
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task fail
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -421,7 +421,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
 	t.Parallel()
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task run healthy
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -475,6 +475,82 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
 	})
 }
 
+// Test that health is reported for services that got migrated, not just for
+// those that are part of deployments.
+func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
+	t.Parallel()
+
+	// Create an alloc runner for a service alloc
+	upd, ar := testAllocRunner(t, true)
+
+	// Make the task run healthy
+	tg := ar.alloc.Job.TaskGroups[0]
+	task := tg.Tasks[0]
+	task.Driver = "mock_driver"
+	task.Config["run_for"] = "30s"
+
+	// Shorten the default migration healthy time
+	tg.Migrate = structs.DefaultMigrateStrategy()
+	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
+	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
+
+	// Ensure the alloc is *not* part of a deployment
+	ar.alloc.DeploymentID = ""
+
+	go ar.Run()
+	defer ar.Destroy()
+
+	testutil.WaitForResult(func() (bool, error) {
+		_, last := upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
+		}
+		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
+			return false, fmt.Errorf("want deployment status healthy; got unset")
+		} else if !*last.DeploymentStatus.Healthy {
+			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+}
+
+// Test that health is *not* reported for batch jobs
+func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
+	t.Parallel()
+
+	// Create a batch alloc
+	alloc := mock.BatchAlloc()
+	tg := alloc.Job.TaskGroups[0]
+
+	// This should not be possible as validation should prevent batch jobs
+	// from having a migration stanza!
+	tg.Migrate = structs.DefaultMigrateStrategy()
+	tg.Migrate.MinHealthyTime = 1 * time.Millisecond
+	tg.Migrate.HealthyDeadline = 2 * time.Millisecond
+	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
+
+	task := tg.Tasks[0]
+	task.Driver = "mock_driver"
+	task.Config["run_for"] = "5s"
+	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
+
+	go ar.Run()
+	defer ar.Destroy()
+
+	for i := 0; i < 10; i++ {
+		time.Sleep(10 * time.Millisecond)
+		_, last := upd.Last()
+		if last == nil {
+			continue
+		}
+		if last.DeploymentStatus != nil {
+			t.Fatalf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
+		}
+	}
+}
+
 // TestAllocRuner_RetryArtifact ensures that if one task in a task group is
 // retrying fetching an artifact, other tasks in the group should be able
 // to proceed.
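
Below is a minimal, self-contained Go sketch (outside the patch) of the guard ordering the patched watchHealth ends up with. The names here (alloc, jobType, healthCheckMode, shouldWatchHealth) are simplified stand-ins invented for illustration, not Nomad's real structs or the AllocRunner API; they only model the decision flow: non-service jobs are never watched, already-set health short-circuits, a failed task group lookup aborts, and deployments configured for manually set health are skipped.

// Illustrative sketch only; simplified stand-ins, not Nomad's real types.
package main

import "fmt"

type jobType string

const (
	jobTypeService jobType = "service"
	jobTypeBatch   jobType = "batch"
)

type healthCheckMode string

const (
	healthCheckTaskStates healthCheckMode = "task_states"
	healthCheckManual     healthCheckMode = "manual"
)

// alloc is a stand-in for the fields the watcher inspects.
type alloc struct {
	JobType      jobType
	DeploymentID string
	HealthSet    bool            // deployment health already recorded
	HasTaskGroup bool            // task group lookup succeeded
	UpdateCheck  healthCheckMode // update stanza's health_check, if any
}

// shouldWatchHealth mirrors the early returns in the patched watchHealth:
// skip non-service jobs, then already-set health, then failed task group
// lookups, then deployments whose health is set manually by operators.
// Everything else (deployments with automatic checks and plain migrations)
// gets watched.
func shouldWatchHealth(a alloc) (bool, string) {
	if a.JobType != jobTypeService {
		return false, "not a service job"
	}
	if a.HealthSet {
		return false, "health already set"
	}
	if !a.HasTaskGroup {
		return false, "task group lookup failed"
	}
	isDeploy := a.DeploymentID != ""
	if isDeploy && a.UpdateCheck == healthCheckManual {
		return false, "deployment health is set manually"
	}
	return true, "watch health (deployment or migration)"
}

func main() {
	cases := map[string]alloc{
		"batch alloc":        {JobType: jobTypeBatch, HasTaskGroup: true},
		"migrated service":   {JobType: jobTypeService, HasTaskGroup: true},
		"deployment, auto":   {JobType: jobTypeService, DeploymentID: "d1", HasTaskGroup: true, UpdateCheck: healthCheckTaskStates},
		"deployment, manual": {JobType: jobTypeService, DeploymentID: "d1", HasTaskGroup: true, UpdateCheck: healthCheckManual},
	}
	for name, a := range cases {
		watch, why := shouldWatchHealth(a)
		fmt.Printf("%-20s watch=%-5v %s\n", name, watch, why)
	}
}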