Merge pull request #7810 from hashicorp/spread-configuration

spread scheduling algorithm
Mahmood Ali
2020-05-01 13:15:19 -04:00
committed by GitHub
17 changed files with 181 additions and 64 deletions

View File

@@ -3,6 +3,10 @@
FEATURES:
* **Task dependencies UI**: task lifecycle charts and details
IMPROVEMENTS:
* core: Allow spreading allocations as an alternative to binpacking [[GH-7810](https://github.com/hashicorp/nomad/issues/7810)]
BUG FIXES:
* api: autoscaling policies should not be returned for stopped jobs [[GH-7768](https://github.com/hashicorp/nomad/issues/7768)]

View File

@@ -112,7 +112,11 @@ func (op *Operator) RaftRemovePeerByID(id string, q *WriteOptions) error {
return nil
}
// SchedulerConfiguration is the config for controlling scheduler behavior
type SchedulerConfiguration struct {
// SchedulerAlgorithm lets you select between available scheduling algorithms.
SchedulerAlgorithm SchedulerAlgorithm
// PreemptionConfig specifies whether to enable eviction of lower
// priority jobs to place higher priority jobs.
PreemptionConfig PreemptionConfig
@@ -140,6 +144,16 @@ type SchedulerSetConfigurationResponse struct {
WriteMeta
}
// SchedulerAlgorithm is an enum string that encapsulates the valid options for a
// SchedulerConfiguration stanza's SchedulerAlgorithm. These values allow the
// scheduling algorithm to be user-selectable.
type SchedulerAlgorithm string
const (
SchedulerAlgorithmBinpack SchedulerAlgorithm = "binpack"
SchedulerAlgorithmSpread SchedulerAlgorithm = "spread"
)
// PreemptionConfig specifies whether preemption is enabled based on scheduler type
type PreemptionConfig struct {
SystemSchedulerEnabled bool
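
For operators driving this over the API, the following is a minimal Go sketch of opting into spread. It assumes the Operator client's SchedulerGetConfiguration / SchedulerSetConfiguration helpers and the SchedulerConfig response field, which live alongside the types above but are not part of this diff; treat the exact signatures as assumptions and check api/operator.go.

package main

import (
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Assumed helper signatures; see api/operator.go for the authoritative ones.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Read the current scheduler configuration.
	resp, _, err := client.Operator().SchedulerGetConfiguration(nil)
	if err != nil {
		log.Fatal(err)
	}

	// Flip the algorithm; preemption settings are left as they were.
	conf := resp.SchedulerConfig
	conf.SchedulerAlgorithm = api.SchedulerAlgorithmSpread

	if _, _, err := client.Operator().SchedulerSetConfiguration(conf, nil); err != nil {
		log.Fatal(err)
	}
}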

View File

@@ -184,6 +184,7 @@ func TestAgent_ServerConfig_SchedulerFlags(t *testing.T) {
"default case",
nil,
structs.SchedulerConfiguration{
SchedulerAlgorithm: "binpack",
PreemptionConfig: structs.PreemptionConfig{
SystemSchedulerEnabled: true,
},

View File

@@ -377,6 +377,11 @@ func (c *Command) isValidConfig(config, cmdConfig *Config) bool {
c.Ui.Error("WARNING: Bootstrap mode enabled! Potentially unsafe operation.")
}
if err := config.Server.DefaultSchedulerConfig.Validate(); err != nil {
c.Ui.Error(err.Error())
return false
}
return true
}

View File

@@ -126,6 +126,7 @@ var basicConfig = &Config{
RetryMaxAttempts: 3,
},
DefaultSchedulerConfig: &structs.SchedulerConfiguration{
SchedulerAlgorithm: "spread",
PreemptionConfig: structs.PreemptionConfig{
SystemSchedulerEnabled: true,
BatchSchedulerEnabled: true,

View File

@@ -14,6 +14,8 @@ import (
"github.com/hashicorp/raft"
)
// OperatorRequest is used to route operator/raft API requests to the
// implementing functions.
func (s *HTTPServer) OperatorRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
path := strings.TrimPrefix(req.URL.Path, "/v1/operator/raft/")
switch {
@@ -252,12 +254,17 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R
}
args.Config = structs.SchedulerConfiguration{
SchedulerAlgorithm: structs.SchedulerAlgorithm(conf.SchedulerAlgorithm),
PreemptionConfig: structs.PreemptionConfig{
SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled,
BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled,
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled},
}
if err := args.Config.Validate(); err != nil {
return nil, CodedError(http.StatusBadRequest, err.Error())
}
// Check for cas value
params := req.URL.Query()
if _, ok := params["cas"]; ok {

View File

@@ -135,6 +135,8 @@ server {
}
default_scheduler_config {
scheduler_algorithm = "spread"
preemption_config {
batch_scheduler_enabled = true
system_scheduler_enabled = true

View File

@@ -297,6 +297,7 @@
"2.2.2.2"
],
"default_scheduler_config": [{
"scheduler_algorithm": "spread",
"preemption_config": [{
"batch_scheduler_enabled": true,
"system_scheduler_enabled": true,

View File

@@ -403,6 +403,7 @@ func DefaultConfig() *Config {
ServerHealthInterval: 2 * time.Second,
AutopilotInterval: 10 * time.Second,
DefaultSchedulerConfig: structs.SchedulerConfiguration{
SchedulerAlgorithm: structs.SchedulerAlgorithmBinpack,
PreemptionConfig: structs.PreemptionConfig{
SystemSchedulerEnabled: true,
BatchSchedulerEnabled: false,

View File

@@ -148,10 +148,7 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
return true, "", used, nil
}
// ScoreFit is used to score the fit based on the Google work published here:
// http://www.columbia.edu/~cs2035/courses/ieor4405.S13/datacenter_scheduling.ppt
// This is equivalent to their BestFit v3
func ScoreFit(node *Node, util *ComparableResources) float64 {
func computeFreePercentage(node *Node, util *ComparableResources) (freePctCpu, freePctRam float64) {
// COMPAT(0.11): Remove in 0.11
reserved := node.ComparableReservedResources()
res := node.ComparableResources()
@@ -165,8 +162,18 @@ func ScoreFit(node *Node, util *ComparableResources) float64 {
}
// Compute the free percentage
freePctCpu := 1 - (float64(util.Flattened.Cpu.CpuShares) / nodeCpu)
freePctRam := 1 - (float64(util.Flattened.Memory.MemoryMB) / nodeMem)
freePctCpu = 1 - (float64(util.Flattened.Cpu.CpuShares) / nodeCpu)
freePctRam = 1 - (float64(util.Flattened.Memory.MemoryMB) / nodeMem)
return freePctCpu, freePctRam
}
// ScoreFitBinPack computes a fit score to achieve binpacking behavior.
// Score is in [0, 18]
//
// It's the BestFit v3 on the Google work published here:
// http://www.columbia.edu/~cs2035/courses/ieor4405.S13/datacenter_scheduling.ppt
func ScoreFitBinPack(node *Node, util *ComparableResources) float64 {
freePctCpu, freePctRam := computeFreePercentage(node, util)
// Total will be "maximized" the smaller the value is.
// At 100% utilization, the total is 2, while at 0% util it is 20.
@@ -187,6 +194,24 @@ func ScoreFit(node *Node, util *ComparableResources) float64 {
return score
}
// ScoreFitSpread computes a fit score to achieve spread behavior.
// Score is in [0, 18]
//
// This is equivalent to Worst Fit of
// http://www.columbia.edu/~cs2035/courses/ieor4405.S13/datacenter_scheduling.ppt
func ScoreFitSpread(node *Node, util *ComparableResources) float64 {
freePctCpu, freePctRam := computeFreePercentage(node, util)
total := math.Pow(10, freePctCpu) + math.Pow(10, freePctRam)
score := total - 2
if score > 18.0 {
score = 18.0
} else if score < 0 {
score = 0
}
return score
}
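
To make the two formulas concrete: for a node that is half utilized on both CPU and memory, freePctCpu = freePctRam = 0.5, so total = 10^0.5 + 10^0.5 ≈ 6.325. Binpacking scores this 20 - 6.325 ≈ 13.675, spread scores it 6.325 - 2 ≈ 4.325, and before clamping the two scores always sum to 18. A standalone sketch of the arithmetic, not using the package types:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Half-utilized node: 50% of CPU and 50% of memory are free.
	freePctCpu, freePctRam := 0.5, 0.5

	total := math.Pow(10, freePctCpu) + math.Pow(10, freePctRam) // ~6.325

	binpack := 20 - total // rewards low free capacity: ~13.675
	spread := total - 2   // rewards high free capacity: ~4.325

	fmt.Printf("binpack=%.3f spread=%.3f sum=%.3f\n", binpack, spread, binpack+spread)
}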
func CopySliceConstraints(s []*Constraint) []*Constraint {
l := len(s)
if l == 0 {

View File

@@ -506,7 +506,7 @@ func TestAllocsFit_Devices(t *testing.T) {
}
// COMPAT(0.11): Remove in 0.11
func TestScoreFit_Old(t *testing.T) {
func TestScoreFitBinPack_Old(t *testing.T) {
node := &Node{}
node.Resources = &Resources{
CPU: 4096,
@@ -528,7 +528,7 @@ func TestScoreFit_Old(t *testing.T) {
},
},
}
score := ScoreFit(node, util)
score := ScoreFitBinPack(node, util)
if score != 18.0 {
t.Fatalf("bad: %v", score)
}
@@ -544,7 +544,7 @@ func TestScoreFit_Old(t *testing.T) {
},
},
}
score = ScoreFit(node, util)
score = ScoreFitBinPack(node, util)
if score != 0.0 {
t.Fatalf("bad: %v", score)
}
@@ -560,13 +560,13 @@ func TestScoreFit_Old(t *testing.T) {
},
},
}
score = ScoreFit(node, util)
score = ScoreFitBinPack(node, util)
if score < 10.0 || score > 16.0 {
t.Fatalf("bad: %v", score)
}
}
func TestScoreFit(t *testing.T) {
func TestScoreFitBinPack(t *testing.T) {
node := &Node{}
node.NodeResources = &NodeResources{
Cpu: NodeCpuResources{
@@ -585,52 +585,53 @@ func TestScoreFit(t *testing.T) {
},
}
// Test a perfect fit
util := &ComparableResources{
Flattened: AllocatedTaskResources{
Cpu: AllocatedCpuResources{
CpuShares: 2048,
},
Memory: AllocatedMemoryResources{
MemoryMB: 4096,
cases := []struct {
name string
flattened AllocatedTaskResources
binPackScore float64
spreadScore float64
}{
{
name: "almost filled node, but with just enough hole",
flattened: AllocatedTaskResources{
Cpu: AllocatedCpuResources{CpuShares: 2048},
Memory: AllocatedMemoryResources{MemoryMB: 4096},
},
binPackScore: 18,
spreadScore: 0,
},
{
name: "unutilized node",
flattened: AllocatedTaskResources{
Cpu: AllocatedCpuResources{CpuShares: 0},
Memory: AllocatedMemoryResources{MemoryMB: 0},
},
binPackScore: 0,
spreadScore: 18,
},
{
name: "mid-case scenario",
flattened: AllocatedTaskResources{
Cpu: AllocatedCpuResources{CpuShares: 1024},
Memory: AllocatedMemoryResources{MemoryMB: 2048},
},
binPackScore: 13.675,
spreadScore: 4.325,
},
}
score := ScoreFit(node, util)
if score != 18.0 {
t.Fatalf("bad: %v", score)
}
// Test the worst fit
util = &ComparableResources{
Flattened: AllocatedTaskResources{
Cpu: AllocatedCpuResources{
CpuShares: 0,
},
Memory: AllocatedMemoryResources{
MemoryMB: 0,
},
},
}
score = ScoreFit(node, util)
if score != 0.0 {
t.Fatalf("bad: %v", score)
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
util := &ComparableResources{Flattened: c.flattened}
// Test a mid-case scenario
util = &ComparableResources{
Flattened: AllocatedTaskResources{
Cpu: AllocatedCpuResources{
CpuShares: 1024,
},
Memory: AllocatedMemoryResources{
MemoryMB: 2048,
},
},
}
score = ScoreFit(node, util)
if score < 10.0 || score > 16.0 {
t.Fatalf("bad: %v", score)
binPackScore := ScoreFitBinPack(node, util)
require.InDelta(t, c.binPackScore, binPackScore, 0.001, "binpack score")
spreadScore := ScoreFitSpread(node, util)
require.InDelta(t, c.spreadScore, spreadScore, 0.001, "spread score")
require.InDelta(t, 18, binPackScore+spreadScore, 0.001, "score sum")
})
}
}

View File

@@ -1,6 +1,7 @@
package structs
import (
"fmt"
"time"
"github.com/hashicorp/raft"
@@ -124,8 +125,26 @@ type AutopilotConfig struct {
ModifyIndex uint64
}
// SchedulerAlgorithm is an enum string that encapsulates the valid options for a
// SchedulerConfiguration stanza's SchedulerAlgorithm. These values allow the
// scheduling algorithm to be user-selectable.
type SchedulerAlgorithm string
const (
// SchedulerAlgorithmBinpack indicates that the scheduler should pack
// allocations onto as few nodes as possible to maximize utilization.
SchedulerAlgorithmBinpack SchedulerAlgorithm = "binpack"
// SchedulerAlgorithmSpread indicates that the scheduler should spread
// allocations as evenly as possible over the available hardware.
SchedulerAlgorithmSpread SchedulerAlgorithm = "spread"
)
// SchedulerConfiguration is the config for controlling scheduler behavior
type SchedulerConfiguration struct {
// SchedulerAlgorithm lets you select between available scheduling algorithms.
SchedulerAlgorithm SchedulerAlgorithm `hcl:"scheduler_algorithm"`
// PreemptionConfig specifies whether to enable eviction of lower
// priority jobs to place higher priority jobs.
PreemptionConfig PreemptionConfig `hcl:"preemption_config"`
@@ -135,6 +154,28 @@ type SchedulerConfiguration struct {
ModifyIndex uint64
}
func (s *SchedulerConfiguration) EffectiveSchedulerAlgorithm() SchedulerAlgorithm {
if s == nil || s.SchedulerAlgorithm == "" {
return SchedulerAlgorithmBinpack
}
return s.SchedulerAlgorithm
}
func (s *SchedulerConfiguration) Validate() error {
if s == nil {
return nil
}
switch s.SchedulerAlgorithm {
case "", SchedulerAlgorithmBinpack, SchedulerAlgorithmSpread:
default:
return fmt.Errorf("invalid scheduler algorithm: %v", s.SchedulerAlgorithm)
}
return nil
}
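
A short sketch of how these two helpers behave, assuming only the definitions above:

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// An empty or unset algorithm falls back to binpacking.
	cfg := &structs.SchedulerConfiguration{}
	fmt.Println(cfg.EffectiveSchedulerAlgorithm()) // binpack

	// Unknown values are rejected before the config is applied.
	cfg.SchedulerAlgorithm = "round-robin"
	fmt.Println(cfg.Validate()) // invalid scheduler algorithm: round-robin

	// Both helpers tolerate a nil receiver, which matters for callers
	// that read the scheduler config before it has ever been written.
	var nilCfg *structs.SchedulerConfiguration
	fmt.Println(nilCfg.EffectiveSchedulerAlgorithm(), nilCfg.Validate()) // binpack <nil>
}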
// SchedulerConfigurationResponse is the response object that wraps SchedulerConfiguration
type SchedulerConfigurationResponse struct {
// SchedulerConfig contains scheduler config options

View File

@@ -1349,7 +1349,7 @@ func TestPreemption(t *testing.T) {
ctx.plan.NodePreemptions[node.ID] = tc.currentPreemptions
}
static := NewStaticRankIterator(ctx, nodes)
binPackIter := NewBinPackIterator(ctx, static, true, tc.jobPriority)
binPackIter := NewBinPackIterator(ctx, static, true, tc.jobPriority, structs.SchedulerAlgorithmBinpack)
job := mock.Job()
job.Priority = tc.jobPriority
binPackIter.SetJob(job)

View File

@@ -153,17 +153,26 @@ type BinPackIterator struct {
priority int
jobId *structs.NamespacedID
taskGroup *structs.TaskGroup
scoreFit func(*structs.Node, *structs.ComparableResources) float64
}
// NewBinPackIterator returns a BinPackIterator which tries to fit tasks
// potentially evicting other tasks based on a given priority.
func NewBinPackIterator(ctx Context, source RankIterator, evict bool, priority int) *BinPackIterator {
func NewBinPackIterator(ctx Context, source RankIterator, evict bool, priority int, algorithm structs.SchedulerAlgorithm) *BinPackIterator {
scoreFn := structs.ScoreFitBinPack
if algorithm == structs.SchedulerAlgorithmSpread {
scoreFn = structs.ScoreFitSpread
}
iter := &BinPackIterator{
ctx: ctx,
source: source,
evict: evict,
priority: priority,
scoreFit: scoreFn,
}
iter.ctx.Logger().Named("binpack").Trace("NewBinPackIterator created", "algorithm", algorithm)
return iter
}
@@ -436,7 +445,7 @@ OUTER:
}
// Score the fit normally otherwise
fitness := structs.ScoreFit(option.Node, util)
fitness := iter.scoreFit(option.Node, util)
normalizedFit := fitness / binPackingMaxFitScore
option.Scores = append(option.Scores, normalizedFit)
iter.ctx.Metrics().ScoreNode(option.Node, "binpack", normalizedFit)
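
Design note: because ScoreFitBinPack and ScoreFitSpread both return scores in [0, 18], the existing normalization by binPackingMaxFitScore works unchanged whichever scorer the iterator was constructed with; only the score function differs, and the node score is still reported under the "binpack" metrics label for both algorithms.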

View File

@@ -107,7 +107,7 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
},
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(taskGroup)
scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -220,7 +220,7 @@ func TestBinPackIterator_NoExistingAlloc_MixedReserve(t *testing.T) {
},
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(taskGroup)
scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -471,7 +471,7 @@ func TestBinPackIterator_Network_Failure(t *testing.T) {
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(taskGroup)
scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -569,7 +569,7 @@ func TestBinPackIterator_PlannedAlloc(t *testing.T) {
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(taskGroup)
scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -685,7 +685,7 @@ func TestBinPackIterator_ExistingAlloc(t *testing.T) {
},
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(taskGroup)
scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -805,7 +805,7 @@ func TestBinPackIterator_ExistingAlloc_PlannedEvict(t *testing.T) {
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(taskGroup)
scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -1110,7 +1110,7 @@ func TestBinPackIterator_Devices(t *testing.T) {
}
static := NewStaticRankIterator(ctx, []*RankedNode{{Node: c.Node}})
binp := NewBinPackIterator(ctx, static, false, 0)
binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
binp.SetTaskGroup(c.TaskGroup)
out := binp.Next()

View File

@@ -237,11 +237,13 @@ func NewSystemStack(ctx Context) *SystemStack {
// by a particular task group. Enable eviction as system jobs are high
// priority.
_, schedConfig, _ := s.ctx.State().SchedulerConfig()
schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm()
enablePreemption := true
if schedConfig != nil {
enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled
}
s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0)
s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0, schedulerAlgorithm)
// Apply score normalization
s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)

View File

@@ -60,7 +60,10 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack {
// Apply the bin packing, this depends on the resources needed
// by a particular task group.
s.binPack = NewBinPackIterator(ctx, rankSource, false, 0)
_, schedConfig, _ := s.ctx.State().SchedulerConfig()
schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm()
s.binPack = NewBinPackIterator(ctx, rankSource, false, 0, schedulerAlgorithm)
// Apply the job anti-affinity iterator. This is to avoid placing
// multiple allocations on the same node for this job.
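
A note on the two stack call sites above: the system stack still guards schedConfig against nil before reading the preemption settings, while both stacks call EffectiveSchedulerAlgorithm on the possibly-nil config directly. That is safe because the helper handles a nil receiver and returns the binpack default, as sketched earlier.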