Merge pull request #8047 from hashicorp/f-snapshot-save

API for atomic snapshot backups
This commit is contained in:
Mahmood Ali
2020-06-01 07:55:16 -04:00
committed by GitHub
31 changed files with 2121 additions and 25 deletions

View File

@@ -318,6 +318,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/operator/raft/", s.wrap(s.OperatorRequest))
s.mux.HandleFunc("/v1/operator/autopilot/configuration", s.wrap(s.OperatorAutopilotConfiguration))
s.mux.HandleFunc("/v1/operator/autopilot/health", s.wrap(s.OperatorServerHealth))
s.mux.HandleFunc("/v1/operator/snapshot", s.wrap(s.SnapshotRequest))
s.mux.HandleFunc("/v1/system/gc", s.wrap(s.GarbageCollectRequest))
s.mux.HandleFunc("/v1/system/reconcile/summaries", s.wrap(s.ReconcileJobSummaries))

View File

@@ -1,6 +1,9 @@
package agent
import (
"context"
"io"
"net"
"net/http"
"strings"
@@ -9,6 +12,7 @@ import (
"time"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/go-msgpack/codec"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
@@ -283,3 +287,88 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R
setIndex(resp, reply.Index)
return reply, nil
}
// SnapshotRequest is the HTTP entry point for /v1/operator/snapshot. Only GET
// is supported, which streams a snapshot back to the caller; every other
// method is rejected with a 405.
func (s *HTTPServer) SnapshotRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
	// Reject unsupported methods up front so the happy path stays unindented.
	if req.Method != "GET" {
		return nil, CodedError(405, ErrInvalidMethod)
	}
	return s.snapshotSaveRequest(resp, req)
}
// snapshotSaveRequest serves a snapshot download by bridging the HTTP response
// to the "Operator.SnapshotSave" streaming RPC handler over an in-memory pipe.
//
// The request is msgpack-encoded onto one end of the pipe, the handler is run
// on the other end, the first decoded response supplies the "Digest" header,
// and the remaining bytes on the pipe are copied into the HTTP response body.
//
// It always returns a nil result; the error, when non-nil, is an
// HTTPCodedError carrying either 500 or the code reported by the handler.
func (s *HTTPServer) snapshotSaveRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
args := &structs.SnapshotSaveRequest{}
// NOTE(review): parse presumably writes its own error response when it
// returns true, hence the nil/nil return — confirm against parse.
if s.parse(resp, req, &args.Region, &args.QueryOptions) {
return nil, nil
}
// Pick the streaming handler: directly from the local server when this agent
// runs one, otherwise through the client's remote handler.
var handler structs.StreamingRpcHandler
var handlerErr error
if server := s.agent.Server(); server != nil {
handler, handlerErr = server.StreamingRpcHandler("Operator.SnapshotSave")
} else if client := s.agent.Client(); client != nil {
handler, handlerErr = client.RemoteStreamingRpcHandler("Operator.SnapshotSave")
} else {
handlerErr = fmt.Errorf("misconfigured connection")
}
if handlerErr != nil {
return nil, CodedError(500, handlerErr.Error())
}
// httpPipe is this function's end of the pipe; handlerPipe is handed to the
// RPC handler below.
httpPipe, handlerPipe := net.Pipe()
decoder := codec.NewDecoder(httpPipe, structs.MsgpackHandle)
encoder := codec.NewEncoder(httpPipe, structs.MsgpackHandle)
// Create a goroutine that closes the pipe if the connection closes.
// The context is also cancelled when this function returns and when the
// streaming goroutine below finishes, so the pipe is closed in those cases too.
ctx, cancel := context.WithCancel(req.Context())
defer cancel()
go func() {
<-ctx.Done()
httpPipe.Close()
}()
// Buffered with capacity 1 so the streaming goroutine's single result send
// does not block on the receiver.
errCh := make(chan HTTPCodedError, 1)
go func() {
defer cancel()
// Send the request
if err := encoder.Encode(args); err != nil {
errCh <- CodedError(500, err.Error())
return
}
// The first message on the pipe is the response header; it carries either
// an error or the snapshot checksum.
var res structs.SnapshotSaveResponse
if err := decoder.Decode(&res); err != nil {
errCh <- CodedError(500, err.Error())
return
}
if res.ErrorMsg != "" {
errCh <- CodedError(res.ErrorCode, res.ErrorMsg)
return
}
// The header must be set before the first body write performed by io.Copy.
resp.Header().Add("Digest", res.SnapshotChecksum)
_, err := io.Copy(resp, httpPipe)
// Errors mentioning "closed" or "EOF" are treated as the normal end of the
// stream (the pipe is closed on cancellation) rather than as failures.
if err != nil &&
err != io.EOF &&
!strings.Contains(err.Error(), "closed") &&
!strings.Contains(err.Error(), "EOF") {
errCh <- CodedError(500, err.Error())
return
}
errCh <- nil
}()
// Run the handler on its end of the pipe. Once it returns, cancel closes
// httpPipe (unblocking the copy above) and the goroutine's result is collected.
handler(handlerPipe)
cancel()
codedErr := <-errCh
return nil, codedErr
}

View File

@@ -2,9 +2,15 @@ package agent
import (
"bytes"
"crypto/sha256"
"encoding/base64"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
"path"
"strings"
"testing"
"time"
@@ -382,3 +388,39 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) {
require.False(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled)
})
}
// TestOperator_SnapshotSaveRequest starts a non-dev server with an on-disk data
// directory, requests a snapshot through the HTTP handler, and checks that the
// "Digest" response header matches the SHA-256 of the returned body.
func TestOperator_SnapshotSaveRequest(t *testing.T) {
	t.Parallel()

	////// Nomad clusters topology - not specific to test
	dir, err := ioutil.TempDir("", "nomadtest-operator-")
	require.NoError(t, err)
	defer os.RemoveAll(dir)

	httpTest(t, func(c *Config) {
		c.Server.BootstrapExpect = 1
		c.DevMode = false
		c.DataDir = path.Join(dir, "server")
		c.AdvertiseAddrs.HTTP = "127.0.0.1"
		c.AdvertiseAddrs.RPC = "127.0.0.1"
		c.AdvertiseAddrs.Serf = "127.0.0.1"
	}, func(s *TestAgent) {
		// Fail loudly on a malformed request instead of passing nil onward.
		req, err := http.NewRequest("GET", "/v1/operator/snapshot", nil)
		require.NoError(t, err)
		resp := httptest.NewRecorder()

		_, err = s.Server.SnapshotRequest(resp, req)
		require.NoError(t, err)
		require.Equal(t, 200, resp.Code)

		digest := resp.Header().Get("Digest")
		require.NotEmpty(t, digest)
		require.Contains(t, digest, "sha-256=")

		// Recompute the checksum over the streamed body.
		hash := sha256.New()
		_, err = io.Copy(hash, resp.Body)
		require.NoError(t, err)

		// require.Equal takes (expected, actual); keeping that order makes
		// failure output label the two values correctly.
		expectedChecksum := "sha-256=" + base64.StdEncoding.EncodeToString(hash.Sum(nil))
		require.Equal(t, expectedChecksum, digest)
	})
}

View File

@@ -126,8 +126,15 @@ func (a *TestAgent) Start() *TestAgent {
i := 10
advertiseAddrs := *a.Config.AdvertiseAddrs
RETRY:
i--
// Clear out the advertise addresses such that through retries we
// re-normalize the addresses correctly instead of using the values from the
// last port selection that had a port conflict.
newAddrs := advertiseAddrs
a.Config.AdvertiseAddrs = &newAddrs
a.pickRandomPorts(a.Config)
if a.Config.NodeName == "" {
a.Config.NodeName = fmt.Sprintf("Node %d", a.Config.Ports.RPC)
@@ -312,15 +319,6 @@ func (a *TestAgent) pickRandomPorts(c *Config) {
c.Ports.RPC = ports[1]
c.Ports.Serf = ports[2]
// Clear out the advertise addresses such that through retries we
// re-normalize the addresses correctly instead of using the values from the
// last port selection that had a port conflict.
if c.AdvertiseAddrs != nil {
c.AdvertiseAddrs.HTTP = ""
c.AdvertiseAddrs.RPC = ""
c.AdvertiseAddrs.Serf = ""
}
if err := c.normalizeAddrs(); err != nil {
a.T.Fatalf("error normalizing config: %v", err)
}

View File

@@ -502,6 +502,22 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
}, nil
},
"operator snapshot": func() (cli.Command, error) {
return &OperatorSnapshotCommand{
Meta: meta,
}, nil
},
"operator snapshot save": func() (cli.Command, error) {
return &OperatorSnapshotSaveCommand{
Meta: meta,
}, nil
},
"operator snapshot inspect": func() (cli.Command, error) {
return &OperatorSnapshotInspectCommand{
Meta: meta,
}, nil
},
"plan": func() (cli.Command, error) {
return &JobPlanCommand{
Meta: meta,

View File

@@ -0,0 +1,50 @@
package command
import (
"strings"
"github.com/mitchellh/cli"
)
// OperatorSnapshotCommand is the "operator snapshot" parent command. It embeds
// Meta and exists to group the snapshot subcommands and show their help.
type OperatorSnapshotCommand struct {
Meta
}
// Help returns the long-form usage text for "operator snapshot", with
// surrounding whitespace trimmed.
func (f *OperatorSnapshotCommand) Help() string {
	const usage = `
Usage: nomad operator snapshot <subcommand> [options]
This command has subcommands for saving and inspecting the state
of the Nomad servers for disaster recovery. These are atomic, point-in-time
snapshots which include jobs, nodes, allocations, periodic jobs, and ACLs.
If ACLs are enabled, a management token must be supplied in order to perform
snapshot operations.
Create a snapshot:
$ nomad operator snapshot save backup.snap
Inspect a snapshot:
$ nomad operator snapshot inspect backup.snap
Run a daemon process that locally saves a snapshot every hour (available only in
Nomad Enterprise) :
$ nomad operator snapshot agent
Please see the individual subcommand help for detailed usage information.
`
	return strings.TrimSpace(usage)
}
// Synopsis returns the one-line description shown in command listings.
func (f *OperatorSnapshotCommand) Synopsis() string {
	const synopsis = "Saves and inspects snapshots of Nomad server state"
	return synopsis
}
// Name returns the full command path of this command.
func (f *OperatorSnapshotCommand) Name() string {
	return "operator snapshot"
}
// Run ignores its arguments and returns cli.RunResultHelp, asking the CLI
// framework to print this command's help.
func (f *OperatorSnapshotCommand) Run(args []string) int {
	exitCode := cli.RunResultHelp
	return exitCode
}

View File

@@ -0,0 +1,74 @@
package command
import (
"fmt"
"os"
"strings"
"github.com/hashicorp/nomad/helper/snapshot"
"github.com/posener/complete"
)
// OperatorSnapshotInspectCommand implements "operator snapshot inspect", which
// verifies a snapshot file on disk and prints its metadata.
type OperatorSnapshotInspectCommand struct {
Meta
}
// Help returns the long-form usage text for "operator snapshot inspect", with
// surrounding whitespace trimmed.
func (c *OperatorSnapshotInspectCommand) Help() string {
	const usage = `
Usage: nomad operator snapshot inspect [options] FILE
Displays information about a snapshot file on disk.
To inspect the file "backup.snap":
$ nomad operator snapshot inspect backup.snap
`
	return strings.TrimSpace(usage)
}
// AutocompleteFlags returns an empty flag set: this command defines no flags
// to complete.
func (c *OperatorSnapshotInspectCommand) AutocompleteFlags() complete.Flags {
	noFlags := complete.Flags{}
	return noFlags
}
// AutocompleteArgs returns a predictor that offers no completions for
// positional arguments.
func (c *OperatorSnapshotInspectCommand) AutocompleteArgs() complete.Predictor {
	predictor := complete.PredictNothing
	return predictor
}
// Synopsis returns the one-line description shown in command listings.
func (c *OperatorSnapshotInspectCommand) Synopsis() string {
	const synopsis = "Displays information about a Nomad snapshot file"
	return synopsis
}
// Name returns the full command path of this command.
func (c *OperatorSnapshotInspectCommand) Name() string {
	return "operator snapshot inspect"
}
// Run verifies the snapshot file named by the single positional argument and
// prints its ID, size, index, term, and version. It returns 0 on success and 1
// when the argument count is wrong, the file cannot be opened, or verification
// fails.
func (c *OperatorSnapshotInspectCommand) Run(args []string) int {
	// Exactly one positional argument (the snapshot path) is required.
	if len(args) != 1 {
		c.Ui.Error("This command takes one argument: <filename>")
		c.Ui.Error(commandErrorText(c))
		return 1
	}

	snapFile, openErr := os.Open(args[0])
	if openErr != nil {
		c.Ui.Error(fmt.Sprintf("Error opening snapshot file: %s", openErr))
		return 1
	}
	defer snapFile.Close()

	info, verifyErr := snapshot.Verify(snapFile)
	if verifyErr != nil {
		c.Ui.Error(fmt.Sprintf("Error verifying snapshot: %s", verifyErr))
		return 1
	}

	// Rows use the "key|value" form expected by formatList.
	rows := make([]string, 0, 5)
	rows = append(rows, fmt.Sprintf("ID|%s", info.ID))
	rows = append(rows, fmt.Sprintf("Size|%d", info.Size))
	rows = append(rows, fmt.Sprintf("Index|%d", info.Index))
	rows = append(rows, fmt.Sprintf("Term|%d", info.Term))
	rows = append(rows, fmt.Sprintf("Version|%d", info.Version))

	c.Ui.Output(formatList(rows))
	return 0
}

View File

@@ -0,0 +1,99 @@
package command
import (
"io/ioutil"
"os"
"path/filepath"
"testing"
"github.com/hashicorp/nomad/command/agent"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/require"
)
// TestOperatorSnapshotInspect_Works inspects a freshly generated snapshot and
// checks that every expected metadata label appears in the command output.
func TestOperatorSnapshotInspect_Works(t *testing.T) {
	t.Parallel()

	snapPath := generateSnapshotFile(t)

	mockUI := new(cli.MockUi)
	inspect := &OperatorSnapshotInspectCommand{Meta: Meta{Ui: mockUI}}

	exitCode := inspect.Run([]string{snapPath})
	require.Zero(t, exitCode)

	printed := mockUI.OutputWriter.String()
	wantLabels := []string{"ID", "Size", "Index", "Term", "Version"}
	for i := range wantLabels {
		require.Contains(t, printed, wantLabels[i])
	}
}
// TestOperatorSnapshotInspect_HandlesFailure checks that inspecting a missing
// file or a file with invalid contents exits non-zero and reports the matching
// error message.
func TestOperatorSnapshotInspect_HandlesFailure(t *testing.T) {
	t.Parallel()

	tmpDir, err := ioutil.TempDir("", "nomad-clitests-")
	require.NoError(t, err)
	defer os.RemoveAll(tmpDir)

	// Only the invalid file is created; "foo" is deliberately left missing.
	invalidPath := filepath.Join(tmpDir, "invalid.snap")
	err = ioutil.WriteFile(invalidPath, []byte("invalid data"), 0600)
	require.NoError(t, err)

	cases := []struct {
		name    string
		file    string
		wantErr string
	}{
		{name: "not found", file: "foo", wantErr: "no such file"},
		{name: "invalid file", file: "invalid.snap", wantErr: "Error verifying snapshot"},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			mockUI := new(cli.MockUi)
			inspect := &OperatorSnapshotInspectCommand{Meta: Meta{Ui: mockUI}}

			exitCode := inspect.Run([]string{filepath.Join(tmpDir, tc.file)})
			require.NotZero(t, exitCode)
			require.Contains(t, mockUI.ErrorWriter.String(), tc.wantErr)
		})
	}
}
// generateSnapshotFile starts a non-dev test server, saves a snapshot from it
// with the "operator snapshot save" command, and returns the snapshot path.
// The temporary directory holding the snapshot is removed on test cleanup; the
// server is shut down before this function returns.
func generateSnapshotFile(t *testing.T) string {
	workDir, err := ioutil.TempDir("", "nomad-tempdir")
	require.NoError(t, err)
	t.Cleanup(func() { os.RemoveAll(workDir) })

	// A non-dev server with a real data directory is configured here.
	configure := func(c *agent.Config) {
		c.DevMode = false
		c.DataDir = filepath.Join(workDir, "server")
		c.AdvertiseAddrs.HTTP = "127.0.0.1"
		c.AdvertiseAddrs.RPC = "127.0.0.1"
		c.AdvertiseAddrs.Serf = "127.0.0.1"
	}
	srv, _, url := testServer(t, false, configure)
	defer srv.Shutdown()

	mockUI := new(cli.MockUi)
	save := &OperatorSnapshotSaveCommand{Meta: Meta{Ui: mockUI}}

	snapPath := filepath.Join(workDir, "backup.snap")
	saveArgs := []string{"--address=" + url, snapPath}
	require.Zero(t, save.Run(saveArgs))

	return snapPath
}

View File

@@ -0,0 +1,142 @@
package command
import (
"fmt"
"io"
"os"
"strings"
"time"
"github.com/hashicorp/nomad/api"
"github.com/posener/complete"
)
// OperatorSnapshotSaveCommand implements "operator snapshot save", which
// downloads a snapshot of server state through the API and writes it to a file.
type OperatorSnapshotSaveCommand struct {
Meta
}
// Help returns the long-form usage text for "operator snapshot save",
// including the general client options, with surrounding whitespace trimmed.
//
// The examples use the full "nomad operator snapshot save" command path, and
// the stale-snapshot lead-in is followed by its example invocation.
func (c *OperatorSnapshotSaveCommand) Help() string {
	helpText := `
Usage: nomad operator snapshot save [options] <filename>
Retrieves an atomic, point-in-time snapshot of the state of the Nomad servers
which includes jobs, nodes, allocations, periodic jobs, and ACLs.
If ACLs are enabled, a management token must be supplied in order to perform
snapshot operations.
To create a snapshot from the leader server and save it to "backup.snap":
$ nomad operator snapshot save backup.snap
To create a potentially stale snapshot from any available server (useful if no
leader is available):
$ nomad operator snapshot save -stale backup.snap
General Options:
` + generalOptionsUsage() + `
Snapshot Save Options:
-stale=[true|false]
The -stale argument defaults to "false" which means the leader provides the
result. If the cluster is in an outage state without a leader, you may need
to set -stale to "true" to get the snapshot from a non-leader server.
`
	return strings.TrimSpace(helpText)
}
// AutocompleteFlags returns the client flag completions merged with the
// command-specific -stale flag.
func (c *OperatorSnapshotSaveCommand) AutocompleteFlags() complete.Flags {
	clientFlags := c.Meta.AutocompleteFlags(FlagSetClient)
	saveFlags := complete.Flags{
		"-stale": complete.PredictAnything,
	}
	return mergeAutocompleteFlags(clientFlags, saveFlags)
}
// AutocompleteArgs returns a predictor that offers no completions for
// positional arguments.
func (c *OperatorSnapshotSaveCommand) AutocompleteArgs() complete.Predictor {
	predictor := complete.PredictNothing
	return predictor
}
// Synopsis returns the one-line description shown in command listings.
func (c *OperatorSnapshotSaveCommand) Synopsis() string {
	const synopsis = "Saves snapshot of Nomad server state"
	return synopsis
}
// Name returns the full command path of this command.
func (c *OperatorSnapshotSaveCommand) Name() string {
	return "operator snapshot save"
}
// Run downloads a snapshot from the Nomad API and writes it to a file.
//
// args holds the command-line flags plus an optional destination filename.
// When no filename is given, a default of the form
// "nomad-state-YYYYMMDD-<unix seconds>.snap" is used. The snapshot is first
// written to "<filename>.tmp" and renamed into place once fully written, so a
// partial download never appears under the final name.
//
// It returns 0 on success and 1 on any failure. On failure the temporary file
// is closed and removed.
func (c *OperatorSnapshotSaveCommand) Run(args []string) int {
	var stale bool

	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
	flags.Usage = func() { c.Ui.Output(c.Help()) }
	flags.BoolVar(&stale, "stale", false, "")
	if err := flags.Parse(args); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err))
		return 1
	}

	// Check for misuse: at most one positional argument (the filename).
	args = flags.Args()
	if len(args) > 1 {
		c.Ui.Error("This command takes either no arguments or one: <filename>")
		c.Ui.Error(commandErrorText(c))
		return 1
	}

	// Every date component is zero-padded so default names have a fixed width
	// and sort chronologically.
	now := time.Now()
	filename := fmt.Sprintf("nomad-state-%04d%02d%02d-%d.snap", now.Year(), now.Month(), now.Day(), now.Unix())
	if len(args) == 1 {
		filename = args[0]
	}

	// Refuse to overwrite an existing destination.
	if _, err := os.Lstat(filename); err == nil {
		c.Ui.Error(fmt.Sprintf("Destination file already exists: %q", filename))
		c.Ui.Error(commandErrorText(c))
		return 1
	} else if !os.IsNotExist(err) {
		c.Ui.Error(fmt.Sprintf("Unexpected failure checking %q: %v", filename, err))
		return 1
	}

	// Set up a client.
	client, err := c.Meta.Client()
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
		return 1
	}

	tmpFile, err := os.Create(filename + ".tmp")
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to create file: %v", err))
		return 1
	}

	// Until the snapshot has been renamed into place, any return path must
	// release the file handle and remove the partial temporary file.
	finalized := false
	defer func() {
		if !finalized {
			tmpFile.Close()
			os.Remove(tmpFile.Name())
		}
	}()

	// Request the snapshot stream from the servers.
	q := &api.QueryOptions{
		AllowStale: stale,
	}
	snapIn, err := client.Operator().Snapshot(q)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to get snapshot file: %v", err))
		return 1
	}
	defer snapIn.Close()

	if _, err := io.Copy(tmpFile, snapIn); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to download snapshot file: %v", err))
		return 1
	}

	// Close before renaming: this surfaces deferred write errors and is
	// required on platforms that cannot rename an open file.
	if err := tmpFile.Close(); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to download snapshot file: %v", err))
		return 1
	}

	if err := os.Rename(tmpFile.Name(), filename); err != nil {
		c.Ui.Error(fmt.Sprintf("Failed to finalize snapshot file: %v", err))
		return 1
	}
	finalized = true

	c.Ui.Output(fmt.Sprintf("State file written to %v", filename))
	return 0
}

View File

@@ -0,0 +1,51 @@
package command
import (
"io/ioutil"
"os"
"path/filepath"
"testing"
"github.com/hashicorp/nomad/command/agent"
"github.com/hashicorp/nomad/helper/snapshot"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/require"
)
// TestOperatorSnapshotSave_Works saves a snapshot from a non-dev test server
// through the CLI command and confirms the resulting file verifies and carries
// a non-zero index.
func TestOperatorSnapshotSave_Works(t *testing.T) {
	t.Parallel()

	tmpDir, err := ioutil.TempDir("", "nomad-tempdir")
	require.NoError(t, err)
	defer os.RemoveAll(tmpDir)

	srv, _, url := testServer(t, false, func(c *agent.Config) {
		c.DevMode = false
		c.DataDir = filepath.Join(tmpDir, "server")
		c.AdvertiseAddrs.HTTP = "127.0.0.1"
		c.AdvertiseAddrs.RPC = "127.0.0.1"
		c.AdvertiseAddrs.Serf = "127.0.0.1"
	})
	defer srv.Shutdown()

	ui := new(cli.MockUi)
	cmd := &OperatorSnapshotSaveCommand{Meta: Meta{Ui: ui}}

	dest := filepath.Join(tmpDir, "backup.snap")
	code := cmd.Run([]string{
		"--address=" + url,
		dest,
	})
	require.Zero(t, code)
	require.Contains(t, ui.OutputWriter.String(), "State file written to "+dest)

	f, err := os.Open(dest)
	require.NoError(t, err)
	// Close the snapshot before the deferred RemoveAll runs, so the handle is
	// not leaked and the directory can be removed on platforms that lock open
	// files.
	defer f.Close()

	meta, err := snapshot.Verify(f)
	require.NoError(t, err)
	require.NotZero(t, meta.Index)
}