From 716df52788a0b36a8220b4060737dc7a7f499c08 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 12 Feb 2025 09:25:50 -0500 Subject: [PATCH] CNI: migrate from persistent state to ephemeral state during restart (#25093) In #24650 we switched to using ephemeral state for CNI plugins, so that when a host reboots and we lose all the allocations we don't end up trying to use IPs we created in network namespaces we just destroyed. Unfortunately upgrade testing missed that in a non-reboot scenario, the existing CNI state was being used by plugins like the ipam plugin to hand out the "next available" IP address. So with no state carried over, we might allocate new addresses that conflict with existing allocations. (This can be avoided by draining the node first.) As a compatibility shim, copy the old CNI state directory to the new CNI state directory during agent startup, if the new CNI state directory doesn't already exist. Ref: https://github.com/hashicorp/nomad/pull/24650 --- .changelog/25093.txt | 3 ++ client/client.go | 21 +++++++++++ helper/escapingfs/copydir.go | 58 +++++++++++++++++++++++++++++++ helper/escapingfs/copydir_test.go | 41 ++++++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 .changelog/25093.txt create mode 100644 helper/escapingfs/copydir.go create mode 100644 helper/escapingfs/copydir_test.go diff --git a/.changelog/25093.txt b/.changelog/25093.txt new file mode 100644 index 000000000..1b39fcb2d --- /dev/null +++ b/.changelog/25093.txt @@ -0,0 +1,3 @@ +```release-note:bug +cni: Fixed a bug where CNI state was not migrated after upgrade, resulting in IP collisions +``` diff --git a/client/client.go b/client/client.go index 2f6440579..608efcb5c 100644 --- a/client/client.go +++ b/client/client.go @@ -54,6 +54,7 @@ import ( "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/envoy" + "github.com/hashicorp/nomad/helper/escapingfs" "github.com/hashicorp/nomad/helper/goruntime" "github.com/hashicorp/nomad/helper/group" "github.com/hashicorp/nomad/helper/pointer" @@ -762,6 +763,26 @@ func (c *Client) init() error { // setup the nsd check store c.checkStore = checkstore.NewStore(c.logger, c.stateDB) + // COMPAT(1.12.0): remove in Nomad 1.12.0 + oldCNIDir := "/var/lib/cni/networks/nomad" + newCNIDir := "/var/run/cni/nomad" + if _, err := os.Stat(newCNIDir); os.IsNotExist(err) { + if _, err := os.Stat(oldCNIDir); err == nil { + err := escapingfs.CopyDir(oldCNIDir, newCNIDir) + if err != nil { + c.logger.Error("failed to migrate existing CNI state", + "error", err, "src", oldCNIDir, "dest", newCNIDir) + } else { + err := os.RemoveAll(oldCNIDir) + if err != nil { + c.logger.Error("migrated CNI state but could not remove old state", + "error", err, "src", oldCNIDir, "dest", newCNIDir) + } + c.logger.Info("migrated CNI state", "src", oldCNIDir, "dest", newCNIDir) + } + } + } + return nil } diff --git a/helper/escapingfs/copydir.go b/helper/escapingfs/copydir.go new file mode 100644 index 000000000..6809bb176 --- /dev/null +++ b/helper/escapingfs/copydir.go @@ -0,0 +1,58 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package escapingfs + +import ( + "fmt" + "io" + "io/fs" + "os" + "path/filepath" +) + +// CopyDir copies a directory's contents to a new location, returning an error +// on symlinks. This implementation is roughly the same as the stdlib os.CopyDir +// but with th e important difference that we preserve file modes. +func CopyDir(src, dst string) error { + srcFs := os.DirFS(src) + + return fs.WalkDir(srcFs, ".", func(oldPath string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + newPath := filepath.Join(dst, oldPath) + if d.IsDir() { + info, err := d.Info() + if err != nil { + return fmt.Errorf("could not stat directory: %v", err) + } + return os.MkdirAll(newPath, info.Mode()) + } + if !d.Type().IsRegular() { + return fmt.Errorf("copying cannot traverse symlinks") + } + + r, err := srcFs.Open(oldPath) + if err != nil { + return fmt.Errorf("could not open existing file: %v", err) + } + defer r.Close() + info, err := r.Stat() + if err != nil { + return fmt.Errorf("could not stat file: %v", err) + } + + w, err := os.OpenFile(newPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, info.Mode()) + if err != nil { + return err + } + + if _, err := io.Copy(w, r); err != nil { + w.Close() + return fmt.Errorf("could not copy file: %v", err) + } + return w.Close() + }) +} diff --git a/helper/escapingfs/copydir_test.go b/helper/escapingfs/copydir_test.go new file mode 100644 index 000000000..23cf62796 --- /dev/null +++ b/helper/escapingfs/copydir_test.go @@ -0,0 +1,41 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package escapingfs + +import ( + "io/fs" + "os" + "path/filepath" + "testing" + + "github.com/shoenig/test/must" + "golang.org/x/sys/unix" +) + +func TestCopyDir(t *testing.T) { + + testDir := t.TempDir() + src := filepath.Join(testDir, "src") + dst := filepath.Join(testDir, "dst") + + must.NoError(t, os.Mkdir(src, 0700)) + must.NoError(t, os.WriteFile(filepath.Join(src, "foo"), []byte("foo"), 0770)) + must.NoError(t, os.WriteFile(filepath.Join(src, "bar"), []byte("bar"), 0555)) + must.NoError(t, os.Mkdir(filepath.Join(src, "bazDir"), 0700)) + must.NoError(t, os.WriteFile(filepath.Join(src, "bazDir", "baz"), []byte("baz"), 0555)) + + err := CopyDir(src, dst) + must.NoError(t, err) + + // This is really how you have to retrieve umask. See `man 2 umask` + umask := unix.Umask(0) + unix.Umask(umask) + + must.FileContains(t, filepath.Join(dst, "foo"), "foo") + must.FileMode(t, filepath.Join(dst, "foo"), fs.FileMode(0o770&(^umask))) + must.FileContains(t, filepath.Join(dst, "bar"), "bar") + must.FileMode(t, filepath.Join(dst, "bar"), fs.FileMode(0o555&(^umask))) + must.FileContains(t, filepath.Join(dst, "bazDir", "baz"), "baz") + must.FileMode(t, filepath.Join(dst, "bazDir", "baz"), fs.FileMode(0o555&(^umask))) +}