diff --git a/nomad/vault.go b/nomad/vault.go index 71263cbbe..a008885ad 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -1223,6 +1223,16 @@ func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.V return g.Wait() } +// maxVaultRevokeBatchSize is the maximum tokens a revokeDaemon should revoke +// at any given time. +// +// Limiting the revocation batch size is beneficial for few reasons: +// * A single revocation failure of any entry in batch result into retrying the whole batch; +// the larger the batch is the higher likelihood of such failure +// * Smaller batch sizes result into more co-operativeness: provides hooks for +// reconsidering token TTL and leadership steps down. +const maxVaultRevokeBatchSize = 1000 + // revokeDaemon should be called in a goroutine and is used to periodically // revoke Vault accessors that failed the original revocation func (v *vaultClient) revokeDaemon() { @@ -1247,13 +1257,21 @@ func (v *vaultClient) revokeDaemon() { } // Build the list of accessors that need to be revoked while pruning any TTL'd checks - revoking := make([]*structs.VaultAccessor, 0, len(v.revoking)) + toRevoke := len(v.revoking) + if toRevoke > maxVaultRevokeBatchSize { + toRevoke = maxVaultRevokeBatchSize + } + revoking := make([]*structs.VaultAccessor, 0, toRevoke) for va, ttl := range v.revoking { if now.After(ttl) { delete(v.revoking, va) } else { revoking = append(revoking, va) } + + if len(revoking) >= toRevoke { + break + } } if err := v.parallelRevoke(context.Background(), revoking); err != nil {