client: fix panic from 0.8 -> 0.10 upgrade

makeAllocTaskServices did not do a nil check on AllocatedResources
which causes a panic when upgrading directly from 0.8 to 0.10. While
skipping 0.9 is not supported we intend to fix serious crashers caused
by such upgrades to prevent cluster outages.

I did a quick audit of the client package and everywhere else that
accesses AllocatedResources appears to be properly guarded by a nil
check.
This commit is contained in:
Michael Schurter
2019-11-01 07:15:11 -07:00
parent 20bb9f04d5
commit 0fcb0d4016
4 changed files with 94 additions and 4 deletions

View File

@@ -805,7 +805,9 @@ func (noopRestarter) Restart(context.Context, *structs.TaskEvent, bool) error {
//
//TODO(schmichael) rename TaskServices and refactor this into a New method
func makeAllocTaskServices(alloc *structs.Allocation, tg *structs.TaskGroup) (*TaskServices, error) {
if n := len(alloc.AllocatedResources.Shared.Networks); n == 0 {
//COMPAT(0.11) AllocatedResources is only nil when upgrading directly
// from 0.8.
if alloc.AllocatedResources == nil || len(alloc.AllocatedResources.Shared.Networks) == 0 {
return nil, fmt.Errorf("unable to register a group service without a group network")
}
@@ -867,6 +869,11 @@ func (c *ServiceClient) UpdateGroup(oldAlloc, newAlloc *structs.Allocation) erro
return fmt.Errorf("task group %q not in old allocation", oldAlloc.TaskGroup)
}
if len(oldTG.Services) == 0 {
// No old group services, simply add new group services
return c.RegisterGroup(newAlloc)
}
oldServices, err := makeAllocTaskServices(oldAlloc, oldTG)
if err != nil {
return err

View File

@@ -127,3 +127,84 @@ func TestConsul_Connect(t *testing.T) {
time.Sleep(interval >> 2)
}
}
// TestConsul_Update08Alloc asserts that adding group services to a previously
// 0.8 alloc works.
//
// COMPAT(0.11) Only valid for upgrades from 0.8.
func TestConsul_Update08Alloc(t *testing.T) {
// Create an embedded Consul server
testconsul, err := testutil.NewTestServerConfig(func(c *testutil.TestServerConfig) {
// If -v wasn't specified squelch consul logging
if !testing.Verbose() {
c.Stdout = ioutil.Discard
c.Stderr = ioutil.Discard
}
})
if err != nil {
t.Fatalf("error starting test consul server: %v", err)
}
defer testconsul.Stop()
consulConfig := consulapi.DefaultConfig()
consulConfig.Address = testconsul.HTTPAddr
consulClient, err := consulapi.NewClient(consulConfig)
require.NoError(t, err)
serviceClient := NewServiceClient(consulClient.Agent(), testlog.HCLogger(t), true)
// Lower periodicInterval to ensure periodic syncing doesn't improperly
// remove Connect services.
const interval = 50 * time.Millisecond
serviceClient.periodicInterval = interval
// Disable deregistration probation to test syncing
serviceClient.deregisterProbationExpiry = time.Time{}
go serviceClient.Run()
defer serviceClient.Shutdown()
// Create new 0.10-style alloc
alloc := mock.Alloc()
alloc.AllocatedResources.Shared.Networks = []*structs.NetworkResource{
{
Mode: "bridge",
IP: "10.0.0.1",
DynamicPorts: []structs.Port{
{
Label: "connect-proxy-testconnect",
Value: 9999,
To: 9998,
},
},
},
}
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
tg.Services = []*structs.Service{
{
Name: "testconnect",
PortLabel: "9999",
Connect: &structs.ConsulConnect{
SidecarService: &structs.ConsulSidecarService{
Proxy: &structs.ConsulProxy{
LocalServicePort: 9000,
},
},
},
},
}
// Create old 0.8-style alloc from new alloc
oldAlloc := alloc.Copy()
oldAlloc.AllocatedResources = nil
oldAlloc.Job.LookupTaskGroup(alloc.TaskGroup).Services = nil
// Expect new services to get registered
require.NoError(t, serviceClient.UpdateGroup(oldAlloc, alloc))
// Assert the group and sidecar services are registered
require.Eventually(t, func() bool {
services, err := consulClient.Agent().Services()
require.NoError(t, err)
return len(services) == 2
}, 3*time.Second, 100*time.Millisecond)
}