core job for key rotation (#13309)

Extend the GC job to support periodic key rotation.

Update the GC process to safely support signed workload identity. We
can't GC any key used to sign a workload identity. Finding which key
was used to sign every allocation will be expensive, but there are not
that many keys. This lets us take a conservative approach: find the
oldest live allocation and ensure that we don't GC any key newer than
that allocation.
This commit is contained in:
Tim Gross
2022-06-20 16:26:05 -04:00
parent 8c5a669a11
commit bbef759dc7
5 changed files with 114 additions and 15 deletions

View File

@@ -204,6 +204,10 @@ type Config struct {
// to be eligible for GC.
RootKeyGCThreshold time.Duration
// RootKeyRotationThreshold is how "old" an active key can be
// before it's rotated
RootKeyRotationThreshold time.Duration
// EvalNackTimeout controls how long we allow a sub-scheduler to
// work on an evaluation before we consider it failed and Nack it.
// This allows that evaluation to be handed to another sub-scheduler
@@ -395,6 +399,7 @@ func DefaultConfig() *Config {
OneTimeTokenGCInterval: 10 * time.Minute,
RootKeyGCInterval: 10 * time.Minute,
RootKeyGCThreshold: 1 * time.Hour,
RootKeyRotationThreshold: 720 * time.Hour, // 30 days
EvalNackTimeout: 60 * time.Second,
EvalDeliveryLimit: 3,
EvalNackInitialReenqueueDelay: 1 * time.Second,

View File

@@ -51,8 +51,8 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
return c.csiPluginGC(eval)
case structs.CoreJobOneTimeTokenGC:
return c.expiredOneTimeTokenGC(eval)
case structs.CoreJobRootKeyGC:
return c.rootKeyGC(eval)
case structs.CoreJobRootKeyRotateOrGC:
return c.rootKeyRotateOrGC(eval)
case structs.CoreJobForceGC:
return c.forceGC(eval)
default:
@@ -80,7 +80,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
if err := c.expiredOneTimeTokenGC(eval); err != nil {
return err
}
if err := c.rootKeyGC(eval); err != nil {
if err := c.rootKeyRotateOrGC(eval); err != nil {
return err
}
// Node GC must occur after the others to ensure the allocations are
@@ -778,8 +778,29 @@ func (c *CoreScheduler) expiredOneTimeTokenGC(eval *structs.Evaluation) error {
return c.srv.RPC("ACL.ExpireOneTimeTokens", req, &structs.GenericResponse{})
}
// rootKeyGC is used to garbage collect unused root keys
func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
// rootKeyRotateOrGC is used to rotate or garbage collect root keys
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {
// a rotation will be sent to the leader so our view of state
// is no longer valid. we ack this core job and will pick up
// the GC work on the next interval
wasRotated, err := c.rootKeyRotation(eval)
if err != nil {
return err
}
if wasRotated {
return nil
}
// we can't GC any key created after the oldest live allocation
// because it might have signed a live allocation's workload
// identity; this is conservative so that we don't have to iterate
// over all the allocations and find out which keys signed their
// identity, which will be expensive on large clusters
allocOldThreshold, err := c.getOldestAllocationIndex()
if err != nil {
return err
}
oldThreshold := c.getThreshold(eval, "root key",
"root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)
@@ -802,6 +823,9 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
if keyMeta.CreateIndex > oldThreshold {
continue // don't GC recent keys
}
if keyMeta.CreateIndex > allocOldThreshold {
continue // don't GC keys possibly used to sign live allocations
}
varIter, err := c.snap.GetSecureVariablesByKeyID(ws, keyMeta.KeyID)
if err != nil {
return err
@@ -827,6 +851,40 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
return nil
}
// rootKeyRotation decides whether the active root key has aged past the
// configured rotation threshold and, if it has, issues a Keyring.Rotate
// RPC. The boolean result is true only when that RPC succeeded; it is
// false when there is no active key, when the active key is still newer
// than the threshold, or when an error is returned.
//
// NOTE(review): forceGC also reaches this function via rootKeyRotateOrGC;
// confirm that the threshold getThreshold computes for a forced GC does
// not cause a rotation on every forced run.
func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) {
	cutoff := c.getThreshold(
		eval,
		"root key",
		"root_key_rotation_threshold",
		c.srv.config.RootKeyRotationThreshold,
	)

	active, err := c.snap.GetActiveRootKeyMeta(memdb.NewWatchSet())
	switch {
	case err != nil:
		return false, err
	case active == nil:
		// nothing is active, so there is nothing to rotate
		return false, nil
	case active.CreateIndex >= cutoff:
		// the active key was created at or after the cutoff index
		return false, nil
	}

	// the rotation is performed as a write RPC authenticated with the
	// leader ACL carried on the evaluation
	args := &structs.KeyringRotateRootKeyRequest{
		WriteRequest: structs.WriteRequest{
			Region:    c.srv.config.Region,
			AuthToken: eval.LeaderACL,
		},
	}
	var reply structs.KeyringRotateRootKeyResponse
	err = c.srv.RPC("Keyring.Rotate", args, &reply)
	if err != nil {
		c.logger.Error("root key rotation failed", "error", err)
		return false, err
	}
	return true, nil
}
// getThreshold returns the index threshold for determining whether an
// object is old enough to GC
func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, configName string, configThreshold time.Duration) uint64 {
@@ -850,3 +908,24 @@ func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, confi
}
return oldThreshold
}
// getOldestAllocationIndex returns the lowest CreateIndex found among
// the non-terminal allocations in the snapshot.
//
// Every allocation is examined and the minimum is tracked, so the result
// does not depend on the order in which the iterator yields allocations.
//
// When there are no non-terminal allocations the maximum uint64 value is
// returned rather than 0. The root key GC skips any key whose CreateIndex
// is greater than this value, so a 0 result would make it skip every key
// and nothing would ever be collected on a cluster with no live
// allocations. On error the returned index is 0 and must be ignored.
func (c *CoreScheduler) getOldestAllocationIndex() (uint64, error) {
	ws := memdb.NewWatchSet()
	allocs, err := c.snap.Allocs(ws, state.SortDefault)
	if err != nil {
		return 0, err
	}

	// start from the largest possible index so that "no live allocations"
	// places no restriction on which keys may be collected
	oldest := ^uint64(0)
	for raw := allocs.Next(); raw != nil; raw = allocs.Next() {
		alloc := raw.(*structs.Allocation)
		if alloc.TerminalStatus() {
			continue // terminal allocations no longer need their signing key
		}
		if alloc.CreateIndex < oldest {
			oldest = alloc.CreateIndex
		}
	}
	return oldest, nil
}

View File

@@ -2485,22 +2485,33 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
require.NoError(t, store.UpsertSecureVariables(
structs.MsgTypeTestSetup, 601, []*structs.SecureVariableEncrypted{variable}))
// insert a time table index between the two keys
// insert an allocation
alloc := mock.Alloc()
alloc.ClientStatus = structs.AllocClientStatusRunning
require.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, 700, []*structs.Allocation{alloc}))
// insert an "old" key that's newer than oldest alloc
key3 := structs.NewRootKeyMeta()
key3.Active = false
require.NoError(t, store.UpsertRootKeyMeta(750, key3))
// insert a time table index before the last key
tt := srv.fsm.TimeTable()
tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))
// insert a "new" but inactive key
key3 := structs.NewRootKeyMeta()
key3.Active = false
require.NoError(t, store.UpsertRootKeyMeta(1500, key3))
key4 := structs.NewRootKeyMeta()
key4.Active = false
require.NoError(t, store.UpsertRootKeyMeta(1500, key4))
// run the core job
snap, err := store.Snapshot()
require.NoError(t, err)
core := NewCoreScheduler(srv, snap)
eval := srv.coreJobEval(structs.CoreJobRootKeyGC, 2000)
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
c := core.(*CoreScheduler)
require.NoError(t, c.rootKeyGC(eval))
require.NoError(t, c.rootKeyRotateOrGC(eval))
ws := memdb.NewWatchSet()
key, err := store.RootKeyMetaByID(ws, key0.KeyID)
@@ -2517,6 +2528,10 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
key, err = store.RootKeyMetaByID(ws, key3.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "old key newer than oldest alloc should not have been GCd")
key, err = store.RootKeyMetaByID(ws, key4.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "new key should not have been GCd")
}

View File

@@ -819,7 +819,7 @@ func (s *Server) schedulePeriodic(stopCh chan struct{}) {
}
case <-rootKeyGC.C:
if index, ok := getLatest(); ok {
s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobRootKeyGC, index))
s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index))
}
case <-stopCh:
return

View File

@@ -10787,9 +10787,9 @@ const (
// tokens. We periodically scan for expired tokens and delete them.
CoreJobOneTimeTokenGC = "one-time-token-gc"
// CoreJobRootKeyGC is used for the garbage collection of unused
// encryption keys.
CoreJobRootKeyGC = "root-key-gc"
// CoreJobRootKeyRotateOrGC is used for periodic key rotation and
// garbage collection of unused encryption keys.
CoreJobRootKeyRotateOrGC = "root-key-rotate-gc"
// CoreJobForceGC is used to force garbage collection of all GCable objects.
CoreJobForceGC = "force-gc"