Mirror of https://github.com/kemko/nomad.git (synced 2026-01-04 01:15:43 +03:00)
Merge pull request #4059 from hashicorp/b-drain-health-svc-only
only service allocs should have health watched
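In short, watchHealth now returns early for any allocation whose job is not a service job, replacing the separate system-job and batch-job checks, and the deployment-health tests below switch to service allocations (testAllocRunner(t, true)). As a hypothetical, illustrative restatement of the new guard (shouldWatchHealth is not a function in the source; the field and method names are taken from the diff below):

package main

import "github.com/hashicorp/nomad/nomad/structs"

// shouldWatchHealth is a hypothetical helper restating the guard this change
// adds to (*AllocRunner).watchHealth: only service allocations whose
// deployment status is still undecided have their health watched.
func shouldWatchHealth(alloc *structs.Allocation) bool {
	// Neither deployments nor migrations care about the health of
	// non-service jobs, so never watch their health.
	if alloc.Job.Type != structs.JobTypeService {
		return false
	}
	// Health was already decided, so there is nothing left to watch.
	if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
		return false
	}
	return true
}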
@@ -31,31 +31,29 @@ func (r *AllocRunner) watchHealth(ctx context.Context) {
 	// See if we should watch the allocs health
 	alloc := r.Alloc()
 
-	// No need to watch health as it's already set
+	// Neither deployments nor migrations care about the health of
+	// non-service jobs so never watch their health
+	if alloc.Job.Type != structs.JobTypeService {
+		return
+	}
+
 	if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
+		// No need to watch health as it's already set
 		return
 	}
 
-	// Neither deployments nor migrations care about system jobs so never
-	// watch their health
-	if alloc.Job.Type == structs.JobTypeSystem {
-		return
-	}
-
-	isDeploy := alloc.DeploymentID != ""
-
-	// Migrations don't consider the health of batch jobs so only watch
-	// batch health during deployments
-	if !isDeploy && alloc.Job.Type == structs.JobTypeBatch {
-		return
-	}
-
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
 	if tg == nil {
-		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
+		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
+			alloc.ID, alloc.TaskGroup)
 		return
 	}
 
+	isDeploy := alloc.DeploymentID != ""
+
 	// No need to watch allocs for deployments that rely on operators
 	// manually setting health
 	if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
 		return
 	}
@@ -115,7 +115,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
 	assert := assert.New(t)
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task fail
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -163,7 +163,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
 	assert := assert.New(t)
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task block
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -211,7 +211,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
 	t.Parallel()
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task run healthy
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -259,7 +259,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
 	t.Parallel()
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task fail
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -352,7 +352,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
 	assert := assert.New(t)
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task fail
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -421,7 +421,7 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
 	t.Parallel()
 
 	// Ensure the task fails and restarts
-	upd, ar := testAllocRunner(t, false)
+	upd, ar := testAllocRunner(t, true)
 
 	// Make the task run healthy
 	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
@@ -475,6 +475,82 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
 	})
 }
 
+// Test that health is reported for services that got migrated; not just part
+// of deployments.
+func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
+	t.Parallel()
+
+	// Ensure the task fails and restarts
+	upd, ar := testAllocRunner(t, true)
+
+	// Make the task run healthy
+	tg := ar.alloc.Job.TaskGroups[0]
+	task := tg.Tasks[0]
+	task.Driver = "mock_driver"
+	task.Config["run_for"] = "30s"
+
+	// Shorten the default migration healthy time
+	tg.Migrate = structs.DefaultMigrateStrategy()
+	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
+	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
+
+	// Ensure the alloc is *not* part of a deployment
+	ar.alloc.DeploymentID = ""
+
+	go ar.Run()
+	defer ar.Destroy()
+
+	testutil.WaitForResult(func() (bool, error) {
+		_, last := upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
+		}
+		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
+			return false, fmt.Errorf("want deployment status unhealthy; got unset")
+		} else if !*last.DeploymentStatus.Healthy {
+			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+}
+
+// Test that health is *not* reported for batch jobs
+func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
+	t.Parallel()
+
+	// Ensure the task fails and restarts
+	alloc := mock.BatchAlloc()
+	tg := alloc.Job.TaskGroups[0]
+
+	// This should not be possible as validation should prevent batch jobs
+	// from having a migration stanza!
+	tg.Migrate = structs.DefaultMigrateStrategy()
+	tg.Migrate.MinHealthyTime = 1 * time.Millisecond
+	tg.Migrate.HealthyDeadline = 2 * time.Millisecond
+	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
+
+	task := tg.Tasks[0]
+	task.Driver = "mock_driver"
+	task.Config["run_for"] = "5s"
+	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
+
+	go ar.Run()
+	defer ar.Destroy()
+
+	for i := 0; i < 10; i++ {
+		time.Sleep(10 * time.Millisecond)
+		_, last := upd.Last()
+		if last == nil {
+			continue
+		}
+		if last.DeploymentStatus != nil {
+			t.Fatalf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
+		}
+	}
+}
+
 // TestAllocRuner_RetryArtifact ensures that if one task in a task group is
 // retrying fetching an artifact, other tasks in the group should be able
 // to proceed.