mirror of
https://github.com/kemko/nomad.git
synced 2026-01-05 09:55:44 +03:00
core job for key rotation (#13309)
Extend the GC job to support periodic key rotation. Update the GC process to safely support signed workload identity. We can't GC any key used to sign a workload identity. Finding which key was used to sign every allocation will be expensive, but there are not that many keys. This lets us take a conservative approach: find the oldest live allocation and ensure that we don't GC any key older than that key.
This commit is contained in:
@@ -204,6 +204,10 @@ type Config struct {
|
||||
// to be eligible for GC.
|
||||
RootKeyGCThreshold time.Duration
|
||||
|
||||
// RootKeyRotationThreshold is how "old" an active key can be
|
||||
// before it's rotated
|
||||
RootKeyRotationThreshold time.Duration
|
||||
|
||||
// EvalNackTimeout controls how long we allow a sub-scheduler to
|
||||
// work on an evaluation before we consider it failed and Nack it.
|
||||
// This allows that evaluation to be handed to another sub-scheduler
|
||||
@@ -395,6 +399,7 @@ func DefaultConfig() *Config {
|
||||
OneTimeTokenGCInterval: 10 * time.Minute,
|
||||
RootKeyGCInterval: 10 * time.Minute,
|
||||
RootKeyGCThreshold: 1 * time.Hour,
|
||||
RootKeyRotationThreshold: 720 * time.Hour, // 30 days
|
||||
EvalNackTimeout: 60 * time.Second,
|
||||
EvalDeliveryLimit: 3,
|
||||
EvalNackInitialReenqueueDelay: 1 * time.Second,
|
||||
|
||||
@@ -51,8 +51,8 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
|
||||
return c.csiPluginGC(eval)
|
||||
case structs.CoreJobOneTimeTokenGC:
|
||||
return c.expiredOneTimeTokenGC(eval)
|
||||
case structs.CoreJobRootKeyGC:
|
||||
return c.rootKeyGC(eval)
|
||||
case structs.CoreJobRootKeyRotateOrGC:
|
||||
return c.rootKeyRotateOrGC(eval)
|
||||
case structs.CoreJobForceGC:
|
||||
return c.forceGC(eval)
|
||||
default:
|
||||
@@ -80,7 +80,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
|
||||
if err := c.expiredOneTimeTokenGC(eval); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := c.rootKeyGC(eval); err != nil {
|
||||
if err := c.rootKeyRotateOrGC(eval); err != nil {
|
||||
return err
|
||||
}
|
||||
// Node GC must occur after the others to ensure the allocations are
|
||||
@@ -778,8 +778,29 @@ func (c *CoreScheduler) expiredOneTimeTokenGC(eval *structs.Evaluation) error {
|
||||
return c.srv.RPC("ACL.ExpireOneTimeTokens", req, &structs.GenericResponse{})
|
||||
}
|
||||
|
||||
// rootKeyGC is used to garbage collect unused root keys
|
||||
func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
|
||||
// rootKeyRotateOrGC is used to rotate or garbage collect root keys
|
||||
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {
|
||||
|
||||
// a rotation will be sent to the leader so our view of state
|
||||
// is no longer valid. we ack this core job and will pick up
|
||||
// the GC work on the next interval
|
||||
wasRotated, err := c.rootKeyRotation(eval)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if wasRotated {
|
||||
return nil
|
||||
}
|
||||
|
||||
// we can't GC any key older than the oldest live allocation
|
||||
// because it might have signed that allocation's workload
|
||||
// identity; this is conservative so that we don't have to iterate
|
||||
// over all the allocations and find out which keys signed their
|
||||
// identity, which will be expensive on large clusters
|
||||
allocOldThreshold, err := c.getOldestAllocationIndex()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
oldThreshold := c.getThreshold(eval, "root key",
|
||||
"root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)
|
||||
@@ -802,6 +823,9 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
|
||||
if keyMeta.CreateIndex > oldThreshold {
|
||||
continue // don't GC recent keys
|
||||
}
|
||||
if keyMeta.CreateIndex > allocOldThreshold {
|
||||
continue // don't GC keys possibly used to sign live allocations
|
||||
}
|
||||
varIter, err := c.snap.GetSecureVariablesByKeyID(ws, keyMeta.KeyID)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -827,6 +851,40 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// rootKeyRotation checks if the active key is old enough that we need
|
||||
// to kick off a rotation. Returns true if the key was rotated.
|
||||
func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) {
|
||||
|
||||
rotationThreshold := c.getThreshold(eval, "root key",
|
||||
"root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold)
|
||||
|
||||
ws := memdb.NewWatchSet()
|
||||
activeKey, err := c.snap.GetActiveRootKeyMeta(ws)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if activeKey == nil {
|
||||
return false, nil // no active key
|
||||
}
|
||||
if activeKey.CreateIndex >= rotationThreshold {
|
||||
return false, nil // key is too new
|
||||
}
|
||||
|
||||
req := &structs.KeyringRotateRootKeyRequest{
|
||||
WriteRequest: structs.WriteRequest{
|
||||
Region: c.srv.config.Region,
|
||||
AuthToken: eval.LeaderACL,
|
||||
},
|
||||
}
|
||||
if err := c.srv.RPC("Keyring.Rotate",
|
||||
req, &structs.KeyringRotateRootKeyResponse{}); err != nil {
|
||||
c.logger.Error("root key rotation failed", "error", err)
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// getThreshold returns the index threshold for determining whether an
|
||||
// object is old enough to GC
|
||||
func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, configName string, configThreshold time.Duration) uint64 {
|
||||
@@ -850,3 +908,24 @@ func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, confi
|
||||
}
|
||||
return oldThreshold
|
||||
}
|
||||
|
||||
// getOldestAllocationIndex returns the CreateIndex of the oldest
|
||||
// non-terminal allocation in the state store
|
||||
func (c *CoreScheduler) getOldestAllocationIndex() (uint64, error) {
|
||||
ws := memdb.NewWatchSet()
|
||||
allocs, err := c.snap.Allocs(ws, state.SortDefault)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
for {
|
||||
raw := allocs.Next()
|
||||
if raw == nil {
|
||||
break
|
||||
}
|
||||
alloc := raw.(*structs.Allocation)
|
||||
if !alloc.TerminalStatus() {
|
||||
return alloc.CreateIndex, nil
|
||||
}
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
@@ -2485,22 +2485,33 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
|
||||
require.NoError(t, store.UpsertSecureVariables(
|
||||
structs.MsgTypeTestSetup, 601, []*structs.SecureVariableEncrypted{variable}))
|
||||
|
||||
// insert a time table index between the two keys
|
||||
// insert an allocation
|
||||
alloc := mock.Alloc()
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
require.NoError(t, store.UpsertAllocs(
|
||||
structs.MsgTypeTestSetup, 700, []*structs.Allocation{alloc}))
|
||||
|
||||
// insert an "old" key that's newer than oldest alloc
|
||||
key3 := structs.NewRootKeyMeta()
|
||||
key3.Active = false
|
||||
require.NoError(t, store.UpsertRootKeyMeta(750, key3))
|
||||
|
||||
// insert a time table index before the last key
|
||||
tt := srv.fsm.TimeTable()
|
||||
tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))
|
||||
|
||||
// insert a "new" but inactive key
|
||||
key3 := structs.NewRootKeyMeta()
|
||||
key3.Active = false
|
||||
require.NoError(t, store.UpsertRootKeyMeta(1500, key3))
|
||||
key4 := structs.NewRootKeyMeta()
|
||||
key4.Active = false
|
||||
require.NoError(t, store.UpsertRootKeyMeta(1500, key4))
|
||||
|
||||
// run the core job
|
||||
snap, err := store.Snapshot()
|
||||
require.NoError(t, err)
|
||||
core := NewCoreScheduler(srv, snap)
|
||||
eval := srv.coreJobEval(structs.CoreJobRootKeyGC, 2000)
|
||||
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
|
||||
c := core.(*CoreScheduler)
|
||||
require.NoError(t, c.rootKeyGC(eval))
|
||||
require.NoError(t, c.rootKeyRotateOrGC(eval))
|
||||
|
||||
ws := memdb.NewWatchSet()
|
||||
key, err := store.RootKeyMetaByID(ws, key0.KeyID)
|
||||
@@ -2517,6 +2528,10 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
|
||||
|
||||
key, err = store.RootKeyMetaByID(ws, key3.KeyID)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, key, "old key newer than oldest alloc should not have been GCd")
|
||||
|
||||
key, err = store.RootKeyMetaByID(ws, key4.KeyID)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, key, "new key should not have been GCd")
|
||||
}
|
||||
|
||||
|
||||
@@ -819,7 +819,7 @@ func (s *Server) schedulePeriodic(stopCh chan struct{}) {
|
||||
}
|
||||
case <-rootKeyGC.C:
|
||||
if index, ok := getLatest(); ok {
|
||||
s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobRootKeyGC, index))
|
||||
s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index))
|
||||
}
|
||||
case <-stopCh:
|
||||
return
|
||||
|
||||
@@ -10787,9 +10787,9 @@ const (
|
||||
// tokens. We periodically scan for expired tokens and delete them.
|
||||
CoreJobOneTimeTokenGC = "one-time-token-gc"
|
||||
|
||||
// CoreJobRootKeyGC is used for the garbage collection of unused
|
||||
// encryption keys.
|
||||
CoreJobRootKeyGC = "root-key-gc"
|
||||
// CoreJobRootKeyRotateOrGC is used for periodic key rotation and
|
||||
// garbage collection of unused encryption keys.
|
||||
CoreJobRootKeyRotateOrGC = "root-key-rotate-gc"
|
||||
|
||||
// CoreJobForceGC is used to force garbage collection of all GCable objects.
|
||||
CoreJobForceGC = "force-gc"
|
||||
|
||||
Reference in New Issue
Block a user