scheduler/csi: fix early return when multiple volumes are requested

When multiple CSI volumes are requested, the feasibility check could return
early for read/write volumes with free claims, even if a later volume in the
request was not feasible for any other reason (including not existing at
all). Because the requested volumes are iterated as a Go map, whose ordering
is randomized at runtime, the check could nondeterministically pass requests
that should have been rejected.
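
To make the failure mode concrete, here is a minimal, self-contained Go
sketch of the buggy pattern. The Volume type and feasibleBuggy function are
hypothetical stand-ins invented for illustration, not Nomad's actual code:

package main

import "fmt"

// Volume is a hypothetical stand-in for a requested CSI volume.
type Volume struct {
	Exists         bool
	FreeWriteClaim bool
}

// feasibleBuggy mirrors the broken control flow: the early return fires as
// soon as one read/write volume has a free claim, so any volume that
// happens to come later in map iteration is never checked at all.
func feasibleBuggy(vols map[string]*Volume) bool {
	for _, v := range vols {
		if !v.Exists {
			return false // infeasible: the volume does not exist
		}
		if v.FreeWriteClaim {
			return true // BUG: skips every volume not yet visited
		}
	}
	return true
}

func main() {
	vols := map[string]*Volume{
		"good":    {Exists: true, FreeWriteClaim: true},
		"missing": {Exists: false},
	}
	// Prints true or false at random across runs, depending on which
	// volume map iteration happens to visit first.
	fmt.Println(feasibleBuggy(vols))
}

Run repeatedly, the sketch reports the request feasible whenever "good" is
visited first, without ever looking at "missing".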

Remove the early return from the feasibility check. Add a test to verify that
a missing volume in the map causes a failure; because of the random map
ordering, the test will not catch a regression on every run, but any
regression will surface over the course of several CI runs.
Author: Tim Gross
Date:   2021-03-09 10:34:07 -05:00
Parent: ded978c34c
Commit: b66a341505

3 changed files with 27 additions and 9 deletions

CHANGELOG.md

@@ -6,6 +6,7 @@ BUG FIXES:
 * cli: Fixed a bug where non-int proxy port would panic CLI [[GH-10072](https://github.com/hashicorp/nomad/issues/10072)]
 * cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)]
 * client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)]
+* scheduler: Fixed a bug where jobs requesting multiple CSI volumes could be incorrectly scheduled if only one of the volumes passed feasibility checking. [[GH-10143](https://github.com/hashicorp/nomad/issues/10143)]
 * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)]

 IMPROVEMENTS:

scheduler/feasible.go

@@ -312,15 +312,15 @@ func (c *CSIVolumeChecker) hasPlugins(n *structs.Node) (bool, string) {
 			if !vol.WriteSchedulable() {
 				return false, fmt.Sprintf(FilterConstraintCSIVolumeNoWriteTemplate, vol.ID)
 			}
-			if vol.WriteFreeClaims() {
-				return true, ""
-			}
-
-			// Check the blocking allocations to see if they belong to this job
-			for id := range vol.WriteAllocs {
-				a, err := c.ctx.State().AllocByID(ws, id)
-				if err != nil || a == nil || a.Namespace != c.namespace || a.JobID != c.jobID {
-					return false, fmt.Sprintf(FilterConstraintCSIVolumeInUseTemplate, vol.ID)
-				}
-			}
+			if !vol.WriteFreeClaims() {
+				// Check the blocking allocations to see if they belong to this job
+				for id := range vol.WriteAllocs {
+					a, err := c.ctx.State().AllocByID(ws, id)
+					if err != nil || a == nil ||
+						a.Namespace != c.namespace || a.JobID != c.jobID {
+						return false, fmt.Sprintf(
+							FilterConstraintCSIVolumeInUseTemplate, vol.ID)
+					}
+				}
+			}
 		}
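
The shape of the fix: the free-claims test is inverted from an exit condition
into a guard, so control falls through to the next volume instead of
returning success early. Restated with the hypothetical Volume type from the
sketch above (an illustration only; Nomad's real check also verifies that
blocking write claims belong to the job being scheduled):

// feasibleFixed mirrors the corrected control flow: no single volume can
// short-circuit the loop with success, so the request is reported feasible
// only after every entry has been validated.
func feasibleFixed(vols map[string]*Volume) bool {
	for _, v := range vols {
		if !v.Exists {
			return false
		}
		if !v.FreeWriteClaim {
			// Nomad inspects the blocking allocations here; the
			// sketch simply rejects the volume.
			return false
		}
	}
	return true // reached only once all volumes have passed
}

With this shape, map iteration order no longer matters: an infeasible volume
is rejected no matter where it lands in the traversal.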

scheduler/feasible_test.go

@@ -395,6 +395,23 @@ func TestCSIVolumeChecker(t *testing.T) {
 			t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result)
 		}
 	}
+
+	// add a missing volume
+	volumes["missing"] = &structs.VolumeRequest{
+		Type:   "csi",
+		Name:   "bar",
+		Source: "does-not-exist",
+	}
+
+	checker = NewCSIVolumeChecker(ctx)
+	checker.SetNamespace(structs.DefaultNamespace)
+
+	for _, node := range nodes {
+		checker.SetVolumes(volumes)
+		act := checker.Feasible(node)
+		require.False(t, act,
+			"request with missing volume should never be feasible")
+	}
 }

 func TestNetworkChecker(t *testing.T) {
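
The new test deliberately relies on the same nondeterminism it guards
against: the Go runtime randomizes map iteration order on every run, so a
single execution may visit "missing" first and return infeasible even under
the old buggy code, letting a regression slip through that run. A tiny
standalone program (illustrative only) demonstrates the randomization:

package main

import "fmt"

func main() {
	m := map[string]int{"a": 1, "b": 2, "c": 3}
	// Successive runs may print the keys in different orders; the Go
	// runtime seeds map iteration randomly. This is why the regression
	// test above is only guaranteed to fail over many CI runs.
	for k := range m {
		fmt.Print(k, " ")
	}
	fmt.Println()
}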