command line tools for redacting keyring from snapshots (#24023)

In #23977 we moved the keyring into Raft, which can expose key material in Raft
snapshots when using the less-secure AEAD keyring instead of KMS. This changeset
adds tools for redacting this material from snapshots:

* The `operator snapshot state` command gains the ability to display key
  metadata (only), which respects the `-filter` option.
* The `operator snapshot save` command gains a `-redact` option that removes key
  material from the snapshot after it's downloaded.
* A new `operator snapshot redact` command allows removing key material from an
  existing snapshot.
This commit is contained in:
Tim Gross
2024-09-20 15:30:14 -04:00
committed by GitHub
parent 9247dc9108
commit a7f2cb879e
14 changed files with 415 additions and 40 deletions

3
.changelog/24023.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
cli: Added redaction options to operator snapshot commands
```

View File

@@ -849,6 +849,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
Meta: meta,
}, nil
},
"operator snapshot redact": func() (cli.Command, error) {
return &OperatorSnapshotRedactCommand{
Meta: meta,
}, nil
},
"plan": func() (cli.Command, error) {
return &JobPlanCommand{

View File

@@ -0,0 +1,95 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package command
import (
"fmt"
"io"
"os"
"strings"
"github.com/hashicorp/nomad/helper/raftutil"
"github.com/posener/complete"
)
// OperatorSnapshotRedactCommand implements the "operator snapshot redact"
// CLI command, which removes root key material from an existing snapshot
// file created by "operator snapshot save".
type OperatorSnapshotRedactCommand struct {
	Meta
}
// Help returns the long-form usage text shown for "operator snapshot redact".
func (c *OperatorSnapshotRedactCommand) Help() string {
	// The help body is a raw string; general options are appended from the
	// shared usage helper so all operator commands stay consistent.
	return strings.TrimSpace(`
Usage: nomad operator snapshot redact [options] <file>
Removes key material from an existing snapshot file created by the operator
snapshot save command, when using the AEAD keyring provider. When using a KMS
keyring provider, no cleartext key material is stored in snapshots and this
command is not necessary. Note that this command requires loading the entire
snapshot into memory locally and overwrites the existing snapshot.
This is useful for situations where you need to transmit a snapshot without
exposing key material.
General Options:
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace))
}
// AutocompleteFlags returns the flag predictions for shell completion; the
// redact command defines no flags of its own.
func (c *OperatorSnapshotRedactCommand) AutocompleteFlags() complete.Flags {
	return complete.Flags{}
}
// AutocompleteArgs predicts the positional argument, which is a path to a
// snapshot file.
func (c *OperatorSnapshotRedactCommand) AutocompleteArgs() complete.Predictor {
	return complete.PredictFiles("*")
}
// Synopsis returns the one-line description shown in command listings.
func (c *OperatorSnapshotRedactCommand) Synopsis() string {
	return "Redacts an existing snapshot of Nomad server state"
}
func (c *OperatorSnapshotRedactCommand) Name() string { return "operator snapshot redact" }
// Run executes the redact command: it copies the snapshot at the given path
// to a temporary file, rewrites that copy with root key material removed,
// and renames it over the original. Note that redaction requires loading the
// entire snapshot into memory locally.
func (c *OperatorSnapshotRedactCommand) Run(args []string) int {
	if len(args) != 1 {
		c.Ui.Error("This command takes one argument: <file>")
		c.Ui.Error(commandErrorText(c))
		return 1
	}

	path := args[0]
	f, err := os.Open(path)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error opening snapshot file: %s", err))
		return 1
	}
	defer f.Close()

	tmpFile, err := os.Create(path + ".tmp")
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to create temporary file: %v", err))
		return 1
	}
	// Best-effort cleanup: after a successful rename the temporary file no
	// longer exists and both calls become harmless no-ops; on any error
	// path this closes the handle and removes the leftover ".tmp" file.
	defer func() {
		tmpFile.Close()
		os.Remove(tmpFile.Name())
	}()

	if _, err := io.Copy(tmpFile, f); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to copy snapshot to temporary file: %v", err))
		return 1
	}

	if err := raftutil.RedactSnapshot(tmpFile); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to redact snapshot: %v", err))
		return 1
	}

	// Close before renaming so all writes are flushed and so the rename
	// succeeds on platforms (e.g. Windows) that refuse to rename a file
	// that is still open.
	if err := tmpFile.Close(); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to close temporary file: %v", err))
		return 1
	}

	if err := os.Rename(tmpFile.Name(), path); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to finalize snapshot file: %v", err))
		return 1
	}

	c.Ui.Output("Snapshot redacted")
	return 0
}

View File

@@ -11,6 +11,7 @@ import (
"time"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/helper/raftutil"
"github.com/posener/complete"
)
@@ -48,8 +49,14 @@ General Options:
Snapshot Save Options:
-stale=[true|false]
The -stale argument defaults to "false" which means the leader provides the
-redact
The -redact option will locally edit the snapshot to remove any cleartext key
material from the root keyring. Only the AEAD keyring provider has cleartext
key material in Raft. Note that this operation requires loading the snapshot
into memory locally.
-stale
The -stale option defaults to "false" which means the leader provides the
result. If the cluster is in an outage state without a leader, you may need
to set -stale to "true" to get the configuration from a non-leader server.
`
@@ -74,12 +81,14 @@ func (c *OperatorSnapshotSaveCommand) Synopsis() string {
func (c *OperatorSnapshotSaveCommand) Name() string { return "operator snapshot save" }
func (c *OperatorSnapshotSaveCommand) Run(args []string) int {
var stale bool
var stale, redact bool
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
flags.BoolVar(&stale, "stale", false, "")
flags.BoolVar(&redact, "redact", false, "")
if err := flags.Parse(args); err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err))
return 1
@@ -141,6 +150,15 @@ func (c *OperatorSnapshotSaveCommand) Run(args []string) int {
return 1
}
if redact {
c.Ui.Info("Redacting key material from snapshot")
err := raftutil.RedactSnapshot(tmpFile)
if err != nil {
c.Ui.Error(fmt.Sprintf("Could not redact snapshot: %v", err))
return 1
}
}
err = os.Rename(tmpFile.Name(), filename)
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to finalize snapshot file: %v", err))

View File

@@ -85,7 +85,7 @@ func (c *OperatorSnapshotStateCommand) Run(args []string) int {
}
defer f.Close()
state, meta, err := raftutil.RestoreFromArchive(f, filter)
_, state, meta, err := raftutil.RestoreFromArchive(f, filter)
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to read archive file: %s", err))
return 1

View File

@@ -14,6 +14,7 @@ import (
"github.com/hashicorp/go-memdb"
"github.com/hashicorp/nomad/nomad"
"github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
raftboltdb "github.com/hashicorp/raft-boltdb/v2"
)
@@ -209,6 +210,7 @@ func StateAsMap(store *state.StateStore) map[string][]interface{} {
"Jobs": toArray(store.Jobs(nil, state.SortDefault)),
"Nodes": toArray(store.Nodes(nil)),
"PeriodicLaunches": toArray(store.PeriodicLaunches(nil)),
"RootKeys": rootKeyMeta(store),
"SITokenAccessors": toArray(store.SITokenAccessors(nil)),
"ScalingEvents": toArray(store.ScalingEvents(nil)),
"ScalingPolicies": toArray(store.ScalingPolicies(nil)),
@@ -265,3 +267,27 @@ func toArray(iter memdb.ResultIterator, err error) []interface{} {
return r
}
// rootKeyMeta allows displaying keys without their key material
func rootKeyMeta(store *state.StateStore) []any {
	iter, err := store.RootKeys(nil)
	if err != nil {
		return []any{err}
	}

	// Collect only the metadata view of each key via Meta(), which omits
	// the key material.
	metas := []any{}
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		key := raw.(*structs.RootKey)
		if key == nil {
			break
		}
		metas = append(metas, key.Meta())
	}
	return metas
}

View File

@@ -6,21 +6,22 @@ package raftutil
import (
"fmt"
"io"
"os"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/raft"
"github.com/hashicorp/nomad/helper/snapshot"
"github.com/hashicorp/nomad/nomad"
"github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
)
func RestoreFromArchive(archive io.Reader, filter *nomad.FSMFilter) (*state.StateStore, *raft.SnapshotMeta, error) {
func RestoreFromArchive(archive io.Reader, filter *nomad.FSMFilter) (raft.FSM, *state.StateStore, *raft.SnapshotMeta, error) {
logger := hclog.L()
fsm, err := dummyFSM(logger)
if err != nil {
return nil, nil, fmt.Errorf("failed to create FSM: %w", err)
return nil, nil, nil, fmt.Errorf("failed to create FSM: %w", err)
}
// r is closed by RestoreFiltered, w is closed by CopySnapshot
@@ -40,13 +41,68 @@ func RestoreFromArchive(archive io.Reader, filter *nomad.FSMFilter) (*state.Stat
err = fsm.RestoreWithFilter(r, filter)
if err != nil {
return nil, nil, fmt.Errorf("failed to restore from snapshot: %w", err)
return nil, nil, nil, fmt.Errorf("failed to restore from snapshot: %w", err)
}
select {
case err := <-errCh:
return nil, nil, err
return nil, nil, nil, err
case meta := <-metaCh:
return fsm.State(), meta, nil
return fsm, fsm.State(), meta, nil
}
}
// RedactSnapshot rewrites the snapshot in srcFile in place, removing any
// cleartext root key material from the keyring. The snapshot is restored
// into an in-memory FSM and re-serialized, so this requires loading the
// entire snapshot into memory locally. On success srcFile is truncated,
// rewritten with the redacted snapshot, and synced.
func RedactSnapshot(srcFile *os.File) error {
	if _, err := srcFile.Seek(0, 0); err != nil {
		return fmt.Errorf("Failed to rewind snapshot file: %v", err)
	}
	fsm, store, meta, err := RestoreFromArchive(srcFile, nil)
	if err != nil {
		return fmt.Errorf("Failed to load snapshot from archive: %w", err)
	}

	iter, err := store.RootKeys(nil)
	if err != nil {
		return fmt.Errorf("Failed to query for root keys: %v", err)
	}

	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		rootKey := raw.(*structs.RootKey)
		if rootKey == nil {
			break
		}
		// Mutating the object from the throwaway restored store is safe
		// here because the store is discarded when this function returns.
		if len(rootKey.WrappedKeys) > 0 {
			rootKey.KeyID = rootKey.KeyID + " [REDACTED]"
			rootKey.WrappedKeys = nil
		}

		msg, err := structs.Encode(structs.WrappedRootKeysUpsertRequestType,
			&structs.KeyringUpsertWrappedRootKeyRequest{
				WrappedRootKeys: rootKey,
			})
		if err != nil {
			return fmt.Errorf("Could not re-encode redacted key: %v", err)
		}

		// Re-apply the redacted key over the original entry in the FSM so
		// the new snapshot no longer carries the key material.
		fsm.Apply(&raft.Log{
			Type: raft.LogCommand,
			Data: msg,
		})
	}

	snap, err := snapshot.NewFromFSM(hclog.Default(), fsm, meta)
	if err != nil {
		return fmt.Errorf("Failed to create redacted snapshot: %v", err)
	}
	// Close removes the temporary file backing the snapshot reader.
	defer snap.Close()

	if err := srcFile.Truncate(0); err != nil {
		return fmt.Errorf("Failed to truncate snapshot file: %v", err)
	}
	if _, err := srcFile.Seek(0, 0); err != nil {
		return fmt.Errorf("Failed to rewind snapshot file: %v", err)
	}
	if _, err := io.Copy(srcFile, snap); err != nil {
		return fmt.Errorf("Failed to copy snapshot to temporary file: %v", err)
	}
	return srcFile.Sync()
}

View File

@@ -43,6 +43,49 @@ func New(logger hclog.Logger, r *raft.Raft) (*Snapshot, error) {
if err != nil {
return nil, fmt.Errorf("failed to open snapshot: %v:", err)
}
return writeSnapshot(logger, metadata, snap)
}
// NewFromFSM takes a state snapshot of the given FSM (for when we don't have a
// Raft instance setup) into a temporary file and returns an object that gives
// access to the file as an io.Reader. You must arrange to call Close() on the
// returned object or else you will leak a temporary file.
func NewFromFSM(logger hclog.Logger, fsm raft.FSM, meta *raft.SnapshotMeta) (*Snapshot, error) {
	// The in-memory snapshot store requires a transport for its API, but
	// nothing is sent over it here.
	_, trans := raft.NewInmemTransport("")
	snapshotStore := raft.NewInmemSnapshotStore()

	fsmSnap, err := fsm.Snapshot()
	if err != nil {
		return nil, err
	}
	// Release signals the FSM that we're finished with the snapshot once
	// it has been persisted (or we bailed out).
	defer fsmSnap.Release()

	sink, err := snapshotStore.Create(meta.Version, meta.Index, meta.Term,
		meta.Configuration, meta.ConfigurationIndex, trans)
	if err != nil {
		return nil, err
	}

	if err := fsmSnap.Persist(sink); err != nil {
		// Cancel the sink so the store discards the partial snapshot.
		sink.Cancel()
		return nil, err
	}
	if err := sink.Close(); err != nil {
		return nil, err
	}

	metadata, snap, err := snapshotStore.Open(sink.ID())
	if err != nil {
		return nil, err
	}

	// writeSnapshot closes snap and spools it into the temp-file-backed
	// *Snapshot handed to the caller.
	return writeSnapshot(logger, metadata, snap)
}
func writeSnapshot(logger hclog.Logger, metadata *raft.SnapshotMeta, snap io.ReadCloser) (*Snapshot, error) {
defer func() {
if err := snap.Close(); err != nil {
logger.Error("Failed to close Raft snapshot", "error", err)

View File

@@ -17,8 +17,10 @@ import (
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/go-msgpack/v2/codec"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
)
@@ -350,3 +352,72 @@ func TestSnapshot_BadRestore(t *testing.T) {
}
}
}
// TestSnapshot_FromFSM exercises NewFromFSM by snapshotting a populated FSM
// without a running Raft instance, then restoring the result into a second,
// independent Raft and comparing the contents.
func TestSnapshot_FromFSM(t *testing.T) {
	dir := testutil.TempDir(t, "snapshot")
	defer os.RemoveAll(dir)

	// Make a Raft and populate it with some data. We tee everything we
	// apply off to a buffer for checking post-snapshot.
	var expected []bytes.Buffer
	entries := 64 * 1024
	before, fsm := makeRaft(t, filepath.Join(dir, "before"))
	defer before.Shutdown()
	for i := 0; i < entries; i++ {
		var log bytes.Buffer
		var saved bytes.Buffer // "saved" rather than "copy": avoid shadowing the builtin
		both := io.MultiWriter(&log, &saved)
		_, err := io.CopyN(both, rand.Reader, 256)
		must.NoError(t, err)
		future := before.Apply(log.Bytes(), time.Second)
		must.NoError(t, future.Error())
		expected = append(expected, saved)
	}

	// Take a snapshot directly from the FSM, bypassing the Raft instance.
	logger := testutil.Logger(t)
	snap, err := NewFromFSM(logger, fsm, &raft.SnapshotMeta{
		Version:       1,
		ID:            uuid.Generate(),
		Index:         uint64(entries) + 2,
		Term:          2,
		Peers:         []byte{},
		Configuration: raft.Configuration{},
	})
	must.NoError(t, err)
	defer snap.Close()

	// Verify the snapshot. We have to rewind it after for the restore.
	metadata, err := Verify(snap)
	must.NoError(t, err)
	_, err = snap.file.Seek(0, 0)
	must.NoError(t, err)
	must.Eq(t, entries+2, int(metadata.Index))

	// Make a new, independent Raft.
	after, fsm := makeRaft(t, filepath.Join(dir, "after"))
	defer after.Shutdown()

	// Put some initial data in there that the snapshot should overwrite.
	for i := 0; i < 16; i++ {
		var log bytes.Buffer
		_, err := io.CopyN(&log, rand.Reader, 256)
		must.NoError(t, err)
		future := after.Apply(log.Bytes(), time.Second)
		must.NoError(t, future.Error())
	}

	// Restore the snapshot.
	must.NoError(t, Restore(logger, snap, after))

	// Compare the contents.
	fsm.Lock()
	defer fsm.Unlock()
	must.Len(t, len(expected), fsm.logs)
	for i := range fsm.logs {
		if !bytes.Equal(fsm.logs[i], expected[i].Bytes()) {
			t.Fatalf("bad: log %d doesn't match", i)
		}
	}
}

View File

@@ -1836,16 +1836,17 @@ func (n *nomadFSM) restoreImpl(old io.ReadCloser, filter *FSMFilter) error {
if err := dec.Decode(keyMeta); err != nil {
return err
}
if filter.Include(keyMeta) {
wrappedKeys := structs.NewRootKey(keyMeta)
if err := restore.RootKeyRestore(wrappedKeys); err != nil {
return err
}
wrappedKeys := structs.NewRootKey(keyMeta)
if err := restore.RootKeyRestore(wrappedKeys); err != nil {
return err
}
if n.encrypter != nil {
// only decrypt the key if we're running in a real server and
// not the 'operator snapshot' command context
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys)
if n.encrypter != nil {
// only decrypt the key if we're running in a real server and
// not the 'operator snapshot' command context
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys)
}
}
case RootKeySnapshot:
@@ -1853,15 +1854,16 @@ func (n *nomadFSM) restoreImpl(old io.ReadCloser, filter *FSMFilter) error {
if err := dec.Decode(wrappedKeys); err != nil {
return err
}
if filter.Include(wrappedKeys) {
if err := restore.RootKeyRestore(wrappedKeys); err != nil {
return err
}
if err := restore.RootKeyRestore(wrappedKeys); err != nil {
return err
}
if n.encrypter != nil {
// only decrypt the key if we're running in a real server and
// not the 'operator snapshot' command context
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys)
if n.encrypter != nil {
// only decrypt the key if we're running in a real server and
// not the 'operator snapshot' command context
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys)
}
}
case ACLRoleSnapshot:
@@ -2344,8 +2346,11 @@ func (n *nomadFSM) applyRootKeyMetaUpsert(msgType structs.MessageType, buf []byt
return err
}
// start a task to decrypt the key material
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedRootKeys)
if n.encrypter != nil {
// start a task to decrypt the key material if we're running in a real
// server and not the 'operator snapshot' command context
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedRootKeys)
}
return nil
}
@@ -2363,8 +2368,11 @@ func (n *nomadFSM) applyWrappedRootKeysUpsert(msgType structs.MessageType, buf [
return err
}
// start a task to decrypt the key material
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, req.WrappedRootKeys)
if n.encrypter != nil {
// start a task to decrypt the key material if we're running in a real
// server and not the 'operator snapshot' command context
go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, req.WrappedRootKeys)
}
return nil
}
@@ -2382,7 +2390,9 @@ func (n *nomadFSM) applyWrappedRootKeysDelete(msgType structs.MessageType, buf [
return err
}
n.encrypter.RemoveKey(req.KeyID)
if n.encrypter != nil {
n.encrypter.RemoveKey(req.KeyID)
}
return nil
}

View File

@@ -73,7 +73,7 @@ func NewHarnessFromSnapshot(t testing.TB, snapshotPath string) (*scheduler.Harne
}
defer f.Close()
state, _, err := raftutil.RestoreFromArchive(f, nil)
_, state, _, err := raftutil.RestoreFromArchive(f, nil)
if err != nil {
return nil, err
}

View File

@@ -0,0 +1,36 @@
---
layout: docs
page_title: 'Commands: operator snapshot redact'
description: |
Redacts a snapshot of Nomad server state
---
# Command: operator snapshot redact
The `operator snapshot redact` command removes key material from an existing
snapshot file created by the `operator snapshot save` command, when using the
AEAD keyring provider.
This is useful for situations where you need to transmit a snapshot without
exposing key material.
<Warning>
When using a [KMS keyring provider][], no cleartext key material is stored in
snapshots and this command is not necessary. Note that this command requires
loading the entire snapshot into memory locally and overwrites the existing
snapshot.
Snapshots made before Nomad 1.9.0 will not include the keyrings.
</Warning>
## Usage
```plaintext
nomad operator snapshot redact <file>
```
[KMS keyring provider]: /nomad/docs/configuration/keyring

View File

@@ -16,12 +16,14 @@ snapshot operations.
<Warning>
This command only saves a Raft snapshot. This snapshot does not include
keyrings. You must back up keyrings separately.
This command includes Nomad's keyring in the snapshot. If you are not using a
[KMS provider][] to secure the keyring, you should use the `-redact` flag to
remove key material before transmitting the snapshot to HashiCorp Support.
If you use this snapshot to recover a cluster, you also need to restore the
keyring onto at least one server. Refer to the Key Management's [Restoring the
Keyring from Backup][restore the keyring] section for instructions.
Snapshots made before Nomad 1.9.0 will not include the keyrings. If you use
older snapshots to recover a cluster, you also need to restore the keyring onto
at least one server. Refer to the Key Management's [Restoring the Keyring from
Backup][restore the keyring] section for instructions.
</Warning>
@@ -54,10 +56,16 @@ nomad operator snapshot save [options] <file>
## Snapshot Save Options
- `-stale`: The stale argument defaults to `false`, which means the leader
- `-redact`: The redact option will locally edit the snapshot to remove any
cleartext key material from the root keyring. Only the AEAD keyring provider
has cleartext key material in Raft. Note that this operation requires loading
the snapshot into memory locally.
- `-stale`: The stale option defaults to `false`, which means the leader
provides the result. If the cluster is in an outage state without a leader,
you may need to set `-stale` to `true` to get the configuration from a
non-leader server.
[outage recovery]: /nomad/tutorials/manage-clusters/outage-recovery
[restore the keyring]: /nomad/docs/operations/key-management#restoring-the-keyring-from-backup
[KMS provider]: /nomad/docs/configuration/keyring

View File

@@ -937,6 +937,10 @@
"title": "inspect",
"path": "commands/operator/snapshot/inspect"
},
{
"title": "redact",
"path": "commands/operator/snapshot/redact"
},
{
"title": "restore",
"path": "commands/operator/snapshot/restore"