From 9682aa27248be131bfb52e120eb1c38513662dce Mon Sep 17 00:00:00 2001 From: Daniel Bennett Date: Thu, 4 Sep 2025 12:29:50 -0400 Subject: [PATCH] consul connect: allow "cni/*" network mode (#26449) don't require "bridge" network mode when using connect{} we document this as "at your own risk" because CNI configuration is so flexible that we can't guarantee a user's network will work, but Nomad's "bridge" CNI config may be used as a reference. --- .changelog/26449.txt | 3 + client/allocrunner/consul_grpc_sock_hook.go | 4 +- client/allocrunner/consul_http_sock_hook.go | 14 ++-- command/agent/consul/connect.go | 2 +- command/asset/connect.nomad.hcl | 2 +- e2e/connect/connect_test.go | 49 ++++++------ e2e/connect/input/demo.nomad | 8 +- .../cni/nomad_bridge_copy.conflist | 41 ++++++++++ .../packer/ubuntu-jammy-amd64/setup.sh | 4 + nomad/job_endpoint_hook_connect.go | 75 ++++++++++--------- nomad/job_endpoint_hook_connect_test.go | 65 ++++++++++++---- nomad/job_endpoint_hook_expose_check.go | 34 ++++----- nomad/job_endpoint_hook_expose_check_test.go | 52 ++++++++----- nomad/job_endpoint_test.go | 17 ++++- .../docs/job-declare/consul-service-mesh.mdx | 10 ++- .../docs/networking/consul/service-mesh.mdx | 29 +++++++ 16 files changed, 275 insertions(+), 134 deletions(-) create mode 100644 .changelog/26449.txt create mode 100644 e2e/terraform/packer/ubuntu-jammy-amd64/cni/nomad_bridge_copy.conflist diff --git a/.changelog/26449.txt b/.changelog/26449.txt new file mode 100644 index 000000000..5db4f73e9 --- /dev/null +++ b/.changelog/26449.txt @@ -0,0 +1,3 @@ +```release-note:improvement +consul connect: Allow cni/* network mode; use at your own risk +``` diff --git a/client/allocrunner/consul_grpc_sock_hook.go b/client/allocrunner/consul_grpc_sock_hook.go index 0cc53e981..312b93b17 100644 --- a/client/allocrunner/consul_grpc_sock_hook.go +++ b/client/allocrunner/consul_grpc_sock_hook.go @@ -117,8 +117,8 @@ func (*consulGRPCSocketHook) Name() string { func (h 
*consulGRPCSocketHook) shouldRun() bool { tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) - // we must be in bridge networking and at least one connect sidecar task - if !tgFirstNetworkIsBridge(tg) { + // we must be in bridge/cni networking and at least one connect sidecar task + if !tgFirstNetworkCanConsulConnect(tg) { return false } diff --git a/client/allocrunner/consul_http_sock_hook.go b/client/allocrunner/consul_http_sock_hook.go index 09f7f8757..f2a31f587 100644 --- a/client/allocrunner/consul_http_sock_hook.go +++ b/client/allocrunner/consul_http_sock_hook.go @@ -10,6 +10,7 @@ import ( "net" "os" "path/filepath" + "strings" "sync" "time" @@ -23,11 +24,12 @@ import ( "github.com/hashicorp/nomad/nomad/structs/config" ) -func tgFirstNetworkIsBridge(tg *structs.TaskGroup) bool { - if len(tg.Networks) < 1 || tg.Networks[0].Mode != "bridge" { +func tgFirstNetworkCanConsulConnect(tg *structs.TaskGroup) bool { + if len(tg.Networks) < 1 { return false } - return true + mode := tg.Networks[0].Mode + return mode == "bridge" || strings.HasPrefix(mode, "cni/") } const ( @@ -88,13 +90,11 @@ func (*consulHTTPSockHook) Name() string { // shouldRun returns true if the alloc contains at least one connect native // task and has a network configured in bridge mode -// -// todo(shoenig): what about CNI networks? 
func (h *consulHTTPSockHook) shouldRun() bool { tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) - // we must be in bridge networking and at least one connect native task - if !tgFirstNetworkIsBridge(tg) { + // we must be in bridge/cni networking and at least one connect native task + if !tgFirstNetworkCanConsulConnect(tg) { return false } diff --git a/command/agent/consul/connect.go b/command/agent/consul/connect.go index 79ae86686..711f9e82a 100644 --- a/command/agent/consul/connect.go +++ b/command/agent/consul/connect.go @@ -264,7 +264,7 @@ func connectProxyConfig(cfg map[string]interface{}, port int, info structs.Alloc func connectProxyBindAddress(networks structs.Networks) string { for _, n := range networks { - if n.Mode == "bridge" && n.IsIPv6() { + if n.IsIPv6() && (n.Mode == "bridge" || strings.HasPrefix(n.Mode, "cni/")) { return "::" } } diff --git a/command/asset/connect.nomad.hcl b/command/asset/connect.nomad.hcl index 116c1cdd4..935a23d4c 100644 --- a/command/asset/connect.nomad.hcl +++ b/command/asset/connect.nomad.hcl @@ -156,7 +156,7 @@ job "countdash" { # config { # image = "${meta.connect.sidecar_image}" # args = [ - # "-c", "${NOMAD_TASK_DIR}/bootstrap.json", + # "-c", "${NOMAD_SECRETS_DIR}/envoy_bootstrap.json", # "-l", "${meta.connect.log_level}" # ] # } diff --git a/e2e/connect/connect_test.go b/e2e/connect/connect_test.go index 6d65bb88e..89c154d53 100644 --- a/e2e/connect/connect_test.go +++ b/e2e/connect/connect_test.go @@ -25,7 +25,8 @@ func TestConnect(t *testing.T) { test.NoError(t, err) }) - t.Run("ConnectDemo", testConnectDemo) + t.Run("ConnectDemo", testConnectDemo("bridge")) + t.Run("ConnectDemoCNI", testConnectDemo("cni/nomad-bridge-copy")) t.Run("ConnectCustomSidecarExposed", testConnectCustomSidecarExposed) t.Run("ConnectNativeDemo", testConnectNativeDemo) t.Run("ConnectIngressGatewayDemo", testConnectIngressGatewayDemo) @@ -36,30 +37,34 @@ func TestConnect(t *testing.T) { } // testConnectDemo tests the demo job file used in 
Connect Integration examples. -func testConnectDemo(t *testing.T) { - sub, _ := jobs3.Submit(t, "./input/demo.nomad", jobs3.Timeout(time.Second*60)) +func testConnectDemo(networkMode string) func(t *testing.T) { + return func(t *testing.T) { + sub, _ := jobs3.Submit(t, "./input/demo.nomad", jobs3.Timeout(time.Second*60), + jobs3.Var("network_mode", networkMode), + ) - cc := e2eutil.ConsulClient(t) + cc := e2eutil.ConsulClient(t) - ixn := &capi.Intention{ - SourceName: "count-dashboard", - DestinationName: "count-api", - Action: "allow", + ixn := &capi.Intention{ + SourceName: "count-dashboard", + DestinationName: "count-api", + Action: "allow", + } + _, err := cc.Connect().IntentionUpsert(ixn, nil) + must.NoError(t, err, must.Sprint("could not create intention")) + + t.Cleanup(func() { + _, err := cc.Connect().IntentionDeleteExact("count-dashboard", "count-api", nil) + test.NoError(t, err) + }) + + assertServiceOk(t, cc, "count-api-sidecar-proxy") + assertServiceOk(t, cc, "count-dashboard-sidecar-proxy") + + logs := sub.Exec("dashboard", "dashboard", + []string{"/bin/sh", "-c", "wget -O /dev/null http://${NOMAD_UPSTREAM_ADDR_count_api}"}) + must.StrContains(t, logs.Stderr, "saving to") } - _, err := cc.Connect().IntentionUpsert(ixn, nil) - must.NoError(t, err, must.Sprint("could not create intention")) - - t.Cleanup(func() { - _, err := cc.Connect().IntentionDeleteExact("count-dashboard", "count-api", nil) - test.NoError(t, err) - }) - - assertServiceOk(t, cc, "count-api-sidecar-proxy") - assertServiceOk(t, cc, "count-dashboard-sidecar-proxy") - - logs := sub.Exec("dashboard", "dashboard", - []string{"/bin/sh", "-c", "wget -O /dev/null http://${NOMAD_UPSTREAM_ADDR_count_api}"}) - must.StrContains(t, logs.Stderr, "saving to") } // testConnectCustomSidecarExposed tests that a connect sidecar with custom task diff --git a/e2e/connect/input/demo.nomad b/e2e/connect/input/demo.nomad index a352b8edf..a5962ef16 100644 --- a/e2e/connect/input/demo.nomad +++ 
b/e2e/connect/input/demo.nomad @@ -1,6 +1,10 @@ # Copyright (c) HashiCorp, Inc. # SPDX-License-Identifier: BUSL-1.1 +variable "network_mode" { + default = "bridge" +} + job "countdash" { datacenters = ["dc1"] @@ -11,7 +15,7 @@ job "countdash" { group "api" { network { - mode = "bridge" + mode = var.network_mode } service { @@ -43,7 +47,7 @@ job "countdash" { group "dashboard" { network { - mode = "bridge" + mode = var.network_mode port "http" { static = 9002 diff --git a/e2e/terraform/packer/ubuntu-jammy-amd64/cni/nomad_bridge_copy.conflist b/e2e/terraform/packer/ubuntu-jammy-amd64/cni/nomad_bridge_copy.conflist new file mode 100644 index 000000000..2324a3e98 --- /dev/null +++ b/e2e/terraform/packer/ubuntu-jammy-amd64/cni/nomad_bridge_copy.conflist @@ -0,0 +1,41 @@ +{ + "cniVersion": "1.0.0", + "name": "nomad-bridge-copy", + "plugins": [ + { + "type": "loopback" + }, + { + "type": "bridge", + "bridge": "nomad", + "ipMasq": true, + "isGateway": true, + "forceAddress": true, + "hairpinMode": false, + "ipam": { + "type": "host-local", + "ranges": [ + [{"subnet": "172.26.64.0/20"}], + [{"subnet": "a110:c8::/112"}] + ], + "routes": [ + {"dst": "0.0.0.0/0"}, + {"dst": "::/0"} + ], + "dataDir": "/var/run/cni" + } + }, + { + "type": "firewall", + "backend": "iptables", + "iptablesAdminChainName": "NOMAD-ADMIN" + }, + { + "type": "portmap", + "capabilities": { + "portMappings": true + }, + "snat": true + } + ] +} diff --git a/e2e/terraform/packer/ubuntu-jammy-amd64/setup.sh b/e2e/terraform/packer/ubuntu-jammy-amd64/setup.sh index 39ded0a2e..24fd9c135 100755 --- a/e2e/terraform/packer/ubuntu-jammy-amd64/setup.sh +++ b/e2e/terraform/packer/ubuntu-jammy-amd64/setup.sh @@ -109,6 +109,10 @@ sudo mv /tmp/linux/cni/loopback.* /opt/cni/config/ sudo mv /tmp/linux/cni/cni_args.conflist /opt/cni/config/ sudo mv /tmp/linux/cni/cni_args.sh /opt/cni/bin/ +echo "Installing additional CNI network configs" +# copy of nomad's "bridge" for connect+cni test (e2e/connect/) +sudo mv 
/tmp/linux/cni/nomad_bridge_copy.conflist /opt/cni/config/ + # Podman echo "Installing Podman" sudo apt-get -y install podman catatonit diff --git a/nomad/job_endpoint_hook_connect.go b/nomad/job_endpoint_hook_connect.go index ab8e54bab..166ac1750 100644 --- a/nomad/job_endpoint_hook_connect.go +++ b/nomad/job_endpoint_hook_connect.go @@ -7,7 +7,6 @@ import ( "errors" "fmt" "net" - "slices" "strconv" "strings" "time" @@ -26,6 +25,11 @@ const ( defaultConnectTimeout = 5 * time.Second ) +var ( + ErrConnectRequireOneNetwork = errors.New("must have exactly one network for Consul Connect") + ErrConnectInvalidNetworkMode = errors.New("invalid network mode for Consul Connect") +) + // connectSidecarResources returns the set of resources used by default for // the Consul Connect sidecar task func connectSidecarResources() *structs.Resources { @@ -166,7 +170,7 @@ func (jobConnectHook) Validate(job *structs.Job) ([]error, error) { for _, g := range job.TaskGroups { if err := groupConnectValidate(g); err != nil { - return nil, err + return warnings, err } } @@ -441,7 +445,7 @@ func gatewayProxy(gateway *structs.ConsulGateway, mode string) *structs.ConsulGa proxy.ConnectTimeout = pointer.Of(defaultConnectTimeout) } - if mode == "bridge" { + if mode == "bridge" || strings.HasPrefix(mode, "cni/") { // magically configure bind address(es) for bridge networking, per gateway type // non-default configuration is gated above switch { @@ -544,28 +548,22 @@ func newConnectSidecarTask(service, driver, cluster string) *structs.Task { } func groupConnectValidate(g *structs.TaskGroup) error { + var err error for _, s := range g.Services { switch { case s.Connect.HasSidecar(): - if err := groupConnectSidecarValidate(g, s); err != nil { - return err - } + err = groupConnectSidecarValidate(g, s) case s.Connect.IsNative(): - if err := groupConnectNativeValidate(g, s); err != nil { - return err - } + err = groupConnectNativeValidate(g, s) case s.Connect.IsGateway(): - if err := 
groupConnectGatewayValidate(g); err != nil { - return err - } + err = groupConnectGatewayValidate(g) + } + if err != nil { + return err } } - - if err := groupConnectUpstreamsValidate(g, g.Services); err != nil { - return err - } - - return nil + err = groupConnectUpstreamsValidate(g, g.Services) + return err } func groupConnectUpstreamsValidate(g *structs.TaskGroup, services []*structs.Service) error { @@ -634,13 +632,29 @@ func transparentProxyPortLabelValidate(g *structs.TaskGroup, portLabel string) b return false } -func groupConnectSidecarValidate(g *structs.TaskGroup, s *structs.Service) error { - if n := len(g.Networks); n != 1 { - return fmt.Errorf("Consul Connect sidecars require exactly 1 network, found %d in group %q", n, g.Name) +func groupConnectNetworkModeValidate(g *structs.TaskGroup, errorPrefix string, allowHost bool) error { + if nn := len(g.Networks); nn != 1 { + return fmt.Errorf("%s: %w: group %q has %d networks", + errorPrefix, ErrConnectRequireOneNetwork, g.Name, nn) } - if g.Networks[0].Mode != "bridge" { - return fmt.Errorf("Consul Connect sidecar requires bridge network, found %q in group %q", g.Networks[0].Mode, g.Name) + mode := g.Networks[0].Mode + if mode == "bridge" || (allowHost && mode == "host") || strings.HasPrefix(mode, "cni/") { + return nil + } + + // helpful error message + allowed := `"bridge" or "cni/*"` + if allowHost { + allowed = `"bridge", "host", or "cni/*"` + } + return fmt.Errorf("%s: %w: group %q uses network mode %q; must be %s", + errorPrefix, ErrConnectInvalidNetworkMode, g.Name, mode, allowed) +} + +func groupConnectSidecarValidate(g *structs.TaskGroup, s *structs.Service) error { + if err := groupConnectNetworkModeValidate(g, "connect sidecar", false); err != nil { + return err } // We must enforce lowercase characters on group and service names for connect @@ -668,17 +682,6 @@ func groupConnectNativeValidate(g *structs.TaskGroup, s *structs.Service) error } func groupConnectGatewayValidate(g *structs.TaskGroup) 
error { - // the group needs to be either bridge or host mode so we know how to configure - // the docker driver config - - if n := len(g.Networks); n != 1 { - return fmt.Errorf("Consul Connect gateways require exactly 1 network, found %d in group %q", n, g.Name) - } - - modes := []string{"bridge", "host"} - if !slices.Contains(modes, g.Networks[0].Mode) { - return fmt.Errorf(`Consul Connect Gateway service requires Task Group with network mode of type "bridge" or "host"`) - } - - return nil + // note that gateways can run in host network mode + return groupConnectNetworkModeValidate(g, "connect gateway", true) } diff --git a/nomad/job_endpoint_hook_connect_test.go b/nomad/job_endpoint_hook_connect_test.go index ee304665a..4dd76a266 100644 --- a/nomad/job_endpoint_hook_connect_test.go +++ b/nomad/job_endpoint_hook_connect_test.go @@ -471,28 +471,41 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { } t.Run("sidecar 0 networks", func(t *testing.T) { - require.EqualError(t, groupConnectSidecarValidate(&structs.TaskGroup{ + err := groupConnectSidecarValidate(&structs.TaskGroup{ Name: "g1", Networks: nil, - }, makeService("connect-service")), `Consul Connect sidecars require exactly 1 network, found 0 in group "g1"`) + }, makeService("connect-service")) + must.EqError(t, err, `connect sidecar: must have exactly one network for Consul Connect: group "g1" has 0 networks`) }) t.Run("sidecar non bridge", func(t *testing.T) { - require.EqualError(t, groupConnectSidecarValidate(&structs.TaskGroup{ + err := groupConnectSidecarValidate(&structs.TaskGroup{ Name: "g2", Networks: structs.Networks{{ Mode: "host", }}, - }, makeService("connect-service")), `Consul Connect sidecar requires bridge network, found "host" in group "g2"`) + }, makeService("connect-service")) + must.EqError(t, err, `connect sidecar: invalid network mode for Consul Connect: group "g2" uses network mode "host"; must be "bridge" or "cni/*"`) }) - t.Run("sidecar okay", func(t 
*testing.T) { - require.NoError(t, groupConnectSidecarValidate(&structs.TaskGroup{ + t.Run("sidecar okay bridge", func(t *testing.T) { + err := groupConnectSidecarValidate(&structs.TaskGroup{ Name: "g3", Networks: structs.Networks{{ Mode: "bridge", }}, - }, makeService("connect-service"))) + }, makeService("connect-service")) + must.NoError(t, err) + }) + + t.Run("sidecar okay cni", func(t *testing.T) { + err := groupConnectSidecarValidate(&structs.TaskGroup{ + Name: "g4", + Networks: structs.Networks{{ + Mode: "cni/test-net", + }}, + }, makeService("connect-service")) + must.NoError(t, err) }) // group and service name validation @@ -505,7 +518,7 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { Name: "Other-Service", }}, }) - require.NoError(t, err) + must.NoError(t, err) }) t.Run("connect service contains uppercase characters", func(t *testing.T) { @@ -516,7 +529,7 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { Name: "Other-Service", }, makeService("Connect-Service")}, }) - require.EqualError(t, err, `Consul Connect service name "Connect-Service" in group "group" must not contain uppercase characters`) + must.EqError(t, err, `Consul Connect service name "Connect-Service" in group "group" must not contain uppercase characters`) }) t.Run("non-connect group contains uppercase characters", func(t *testing.T) { @@ -527,7 +540,7 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { Name: "other-service", }}, }) - require.NoError(t, err) + must.NoError(t, err) }) t.Run("connect-group contains uppercase characters", func(t *testing.T) { @@ -538,7 +551,7 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { Name: "other-service", }, makeService("connect-service")}, }) - require.EqualError(t, err, `Consul Connect group "Connect-Group" with service "connect-service" must not contain uppercase characters`) + must.EqError(t, err, `Consul Connect group "Connect-Group" with service 
"connect-service" must not contain uppercase characters`) }) t.Run("connect group and service lowercase", func(t *testing.T) { @@ -549,7 +562,7 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { Name: "other-service", }, makeService("connect-service")}, }) - require.NoError(t, err) + must.NoError(t, err) }) t.Run("connect group overlap upstreams", func(t *testing.T) { @@ -570,7 +583,7 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { Networks: structs.Networks{{Mode: "bridge"}}, Services: []*structs.Service{s1, s2}, }) - require.EqualError(t, err, `Consul Connect services "s2" and "s1" in group "connect-group" using same address for upstreams (:8999)`) + must.EqError(t, err, `Consul Connect services "s2" and "s1" in group "connect-group" using same address for upstreams (:8999)`) }) } @@ -781,7 +794,7 @@ func TestJobEndpointConnect_groupConnectGatewayValidate(t *testing.T) { Name: "g1", Networks: nil, }) - require.EqualError(t, err, `Consul Connect gateways require exactly 1 network, found 0 in group "g1"`) + must.EqError(t, err, `connect gateway: must have exactly one network for Consul Connect: group "g1" has 0 networks`) }) t.Run("bad network mode", func(t *testing.T) { @@ -791,7 +804,29 @@ func TestJobEndpointConnect_groupConnectGatewayValidate(t *testing.T) { Mode: "", }}, }) - require.EqualError(t, err, `Consul Connect Gateway service requires Task Group with network mode of type "bridge" or "host"`) + must.EqError(t, err, `connect gateway: invalid network mode for Consul Connect: group "g1" uses network mode ""; must be "bridge", "host", or "cni/*"`) + }) + + for _, good := range []string{"bridge", "host"} { + t.Run("good network mode "+good, func(t *testing.T) { + err := groupConnectGatewayValidate(&structs.TaskGroup{ + Name: "g1", + Networks: structs.Networks{{ + Mode: good, + }}, + }) + must.NoError(t, err) + }) + } + + t.Run("good network mode cni", func(t *testing.T) { + err := 
groupConnectGatewayValidate(&structs.TaskGroup{ + Name: "g1", + Networks: structs.Networks{{ + Mode: "cni/test-net", + }}, + }) + must.NoError(t, err) }) } diff --git a/nomad/job_endpoint_hook_expose_check.go b/nomad/job_endpoint_hook_expose_check.go index 67198871e..fda57f06b 100644 --- a/nomad/job_endpoint_hook_expose_check.go +++ b/nomad/job_endpoint_hook_expose_check.go @@ -55,27 +55,24 @@ func (jobExposeCheckHook) Mutate(job *structs.Job) (_ *structs.Job, warnings []e // Validate will ensure: // - The job contains valid network configuration for each task group in which -// an expose path is configured. The network must be of type bridge mode. +// an expose path is configured. The network must be bridge or "cni/" mode. // - The check Expose field is configured only for connect-enabled group-services. func (jobExposeCheckHook) Validate(job *structs.Job) (warnings []error, err error) { for _, tg := range job.TaskGroups { // Make sure any group that contains a group-service that enables expose - // is configured with one network that is in "bridge" mode. This check - // is being done independently of the preceding Connect task injection - // hook, because at some point in the future Connect will not require the - // use of network namespaces, whereas the use of "expose" does not make - // sense without the use of network namespace. - if err := tgValidateUseOfBridgeMode(tg); err != nil { - return nil, err + // is configured with exactly one network in "bridge" or "cni/*" mode. + // Note that "cni/*" networks are accepted without emitting a warning. + if err = tgValidateExposeNetworkMode(tg); err != nil { + return warnings, err } // Make sure any group-service that contains a check that enables expose // is connect-enabled and does not specify a custom sidecar task. We only // support the expose feature when using the built-in Envoy integration. 
- if err := tgValidateUseOfCheckExpose(tg); err != nil { - return nil, err + if err = tgValidateUseOfCheckExpose(tg); err != nil { + return warnings, err } } - return nil, nil + return warnings, nil } // serviceExposeConfig digs through s to extract the connect sidecar service proxy @@ -137,17 +134,12 @@ func tgValidateUseOfCheckExpose(tg *structs.TaskGroup) error { return nil } -// tgValidateUseOfBridgeMode ensures there is exactly 1 network configured for -// the task group, and that it makes use of "bridge" mode (i.e. enables network +// tgValidateExposeNetworkMode ensures there is exactly 1 network configured for +// the task group, and that it uses "bridge" or "cni/*" mode (i.e. enables network // namespaces). -func tgValidateUseOfBridgeMode(tg *structs.TaskGroup) error { +func tgValidateExposeNetworkMode(tg *structs.TaskGroup) error { if tgUsesExposeCheck(tg) { - if len(tg.Networks) != 1 { - return fmt.Errorf("group %q must specify one bridge network for exposing service check(s)", tg.Name) - } - if tg.Networks[0].Mode != "bridge" { - return fmt.Errorf("group %q must use bridge network for exposing service check(s)", tg.Name) - } + return groupConnectNetworkModeValidate(tg, "connect expose check", false) } return nil } @@ -188,7 +180,7 @@ func exposePathForCheck(tg *structs.TaskGroup, s *structs.Service, check *struct // Borrow some of the validation before we start manipulating the group // network, which needs to exist once. 
- if err := tgValidateUseOfBridgeMode(tg); err != nil { + if err := tgValidateExposeNetworkMode(tg); err != nil { return nil, err } diff --git a/nomad/job_endpoint_hook_expose_check_test.go b/nomad/job_endpoint_hook_expose_check_test.go index 1de7ac0ab..79a77d9ea 100644 --- a/nomad/job_endpoint_hook_expose_check_test.go +++ b/nomad/job_endpoint_hook_expose_check_test.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" ) @@ -57,37 +58,52 @@ func TestJobExposeCheckHook_tgValidateUseOfBridgeMode(t *testing.T) { } t.Run("no networks but no use of expose", func(t *testing.T) { - require.Nil(t, tgValidateUseOfBridgeMode(&structs.TaskGroup{ + err := tgValidateExposeNetworkMode(&structs.TaskGroup{ Networks: make(structs.Networks, 0), - })) + }) + must.NoError(t, err) }) t.Run("no networks and uses expose", func(t *testing.T) { - require.EqualError(t, tgValidateUseOfBridgeMode(&structs.TaskGroup{ + err := tgValidateExposeNetworkMode(&structs.TaskGroup{ Name: "g1", Networks: make(structs.Networks, 0), Services: []*structs.Service{s1}, - }), `group "g1" must specify one bridge network for exposing service check(s)`) + }) + must.EqError(t, err, `connect expose check: must have exactly one network for Consul Connect: group "g1" has 0 networks`) }) t.Run("non-bridge network and uses expose", func(t *testing.T) { - require.EqualError(t, tgValidateUseOfBridgeMode(&structs.TaskGroup{ + err := tgValidateExposeNetworkMode(&structs.TaskGroup{ Name: "g1", Networks: structs.Networks{{ Mode: "host", }}, Services: []*structs.Service{s1}, - }), `group "g1" must use bridge network for exposing service check(s)`) + }) + must.EqError(t, err, `connect expose check: invalid network mode for Consul Connect: group "g1" uses network mode "host"; must be "bridge" or "cni/*"`) }) t.Run("bridge network uses expose", func(t *testing.T) { - require.Nil(t, 
tgValidateUseOfBridgeMode(&structs.TaskGroup{ + err := tgValidateExposeNetworkMode(&structs.TaskGroup{ Name: "g1", Networks: structs.Networks{{ Mode: "bridge", }}, Services: []*structs.Service{s1}, - })) + }) + must.NoError(t, err) + }) + + t.Run("cni network uses expose", func(t *testing.T) { + err := tgValidateExposeNetworkMode(&structs.TaskGroup{ + Name: "g1", + Networks: structs.Networks{{ + Mode: "cni/test-net", + }}, + Services: []*structs.Service{s1}, + }) + must.NoError(t, err) }) } @@ -165,8 +181,8 @@ func TestJobExposeCheckHook_Validate(t *testing.T) { Services: []*structs.Service{s1}, }}, }) - require.Empty(t, warnings) - require.EqualError(t, err, `group "g1" must specify one bridge network for exposing service check(s)`) + must.SliceEmpty(t, warnings) + must.EqError(t, err, `connect expose check: must have exactly one network for Consul Connect: group "g1" has 2 networks`) }) t.Run("expose in service check", func(t *testing.T) { @@ -189,8 +205,8 @@ func TestJobExposeCheckHook_Validate(t *testing.T) { }}, }}, }) - require.Empty(t, warnings) - require.EqualError(t, err, `exposed service check g1[t1]->s2->s2-check1 is not a task-group service`) + must.SliceEmpty(t, warnings) + must.EqError(t, err, `exposed service check g1[t1]->s2->s2-check1 is not a task-group service`) }) t.Run("ok", func(t *testing.T) { @@ -224,8 +240,8 @@ func TestJobExposeCheckHook_Validate(t *testing.T) { }}, }}, }) - require.Empty(t, warnings) - require.Nil(t, err) + must.SliceEmpty(t, warnings) + must.NoError(t, err) }) } @@ -321,9 +337,7 @@ func TestJobExposeCheckHook_exposePathForCheck(t *testing.T) { Services: []*structs.Service{s}, Networks: structs.Networks{{ Mode: "bridge", - DynamicPorts: []structs.Port{ - // service declares "sPort", but does not exist - }, + DynamicPorts: []structs.Port{}, // service declares "sPort", but does not exist }}, }, s, c, checkIdx) require.EqualError(t, err, `unable to determine local service port for service check group1->service1->check1`) @@ 
-400,8 +414,8 @@ func TestJobExposeCheckHook_exposePathForCheck(t *testing.T) { Networks: nil, // not set, should cause validation error } ePath, err := exposePathForCheck(tg, s, c, checkIdx) - require.EqualError(t, err, `group "group1" must specify one bridge network for exposing service check(s)`) - require.Nil(t, ePath) + must.EqError(t, err, `connect expose check: must have exactly one network for Consul Connect: group "group1" has 0 networks`) + must.Nil(t, ePath) }) } diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index df370b9be..cb3de1266 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -6293,6 +6293,17 @@ func TestJobEndpoint_ValidateJob_ConsulConnect(t *testing.T) { require.NoError(t, err) }) + t.Run("valid consul connect with cni", func(t *testing.T) { + j := mock.Job() + + tg := j.TaskGroups[0] + tg.Services = tgServices + tg.Networks[0].Mode = "cni/test-net" + + err := validateJob(j) + must.NoError(t, err) + }) + t.Run("consul connect but missing network", func(t *testing.T) { j := mock.Job() @@ -6301,8 +6312,7 @@ func TestJobEndpoint_ValidateJob_ConsulConnect(t *testing.T) { tg.Networks = nil err := validateJob(j) - require.Error(t, err) - require.Contains(t, err.Error(), `Consul Connect sidecars require exactly 1 network`) + must.ErrorContains(t, err, ErrConnectRequireOneNetwork.Error()) }) t.Run("consul connect but non bridge network", func(t *testing.T) { @@ -6316,8 +6326,7 @@ func TestJobEndpoint_ValidateJob_ConsulConnect(t *testing.T) { } err := validateJob(j) - require.Error(t, err) - require.Contains(t, err.Error(), `Consul Connect sidecar requires bridge network, found "host" in group "web"`) + must.ErrorContains(t, err, ErrConnectInvalidNetworkMode.Error()) }) } diff --git a/website/content/docs/job-declare/consul-service-mesh.mdx b/website/content/docs/job-declare/consul-service-mesh.mdx index 5585280ec..90c4c6176 100644 --- a/website/content/docs/job-declare/consul-service-mesh.mdx +++ 
b/website/content/docs/job-declare/consul-service-mesh.mdx @@ -36,15 +36,18 @@ service-based access control permissions throughout the entire mesh. Nomad has native integration with Consul to provide service mesh capabilities. The [`connect`][] block is the entrypoint for all service mesh configuration. Nomad automatically deploys a sidecar proxy task to all allocations that have a -[`sidecar_service`][] block. +[`sidecar_service`][] block. All incoming external traffic is handled by the +sidecar. This proxy task is responsible for exposing the service to the mesh and can also be used to access other services from within the allocation. These external services are called upstreams and are declared using the [`upstreams`][] block. -The allocation `network_mode` must be set to `bridge` for network isolation and -all external traffic is provided by the sidecar. +Consul service mesh requires network isolation to function, so you must set +the job group's `network` [`mode`](/nomad/docs/job-specification/network#mode) +to `bridge`, or to an [appropriately configured `cni/*` +network](/nomad/docs/networking/consul/service-mesh#network-mode). ~> **Warning:** To fully isolate your workloads make sure to bind them only to the `loopback` interface. 
@@ -167,4 +170,3 @@ The types of gateways provided by Consul Service Mesh are: [`upstreams`]: /nomad/docs/job-specification/upstreams [consul_cli_envoy]: /consul/commands/connect/envoy [runtime_network]: /nomad/docs/reference/runtime-environment-settings#network-related-variables - diff --git a/website/content/docs/networking/consul/service-mesh.mdx b/website/content/docs/networking/consul/service-mesh.mdx index b4c8b587d..7f863e660 100644 --- a/website/content/docs/networking/consul/service-mesh.mdx +++ b/website/content/docs/networking/consul/service-mesh.mdx @@ -59,6 +59,35 @@ For using the Consul service mesh integration with Consul ACLs enabled, see the [Secure Nomad Jobs with Consul Service Mesh](/nomad/tutorials/integrate-consul/consul-service-mesh) guide. +### Network mode + +Consul service mesh requires network isolation to function, so you must set +the job group's `network` [`mode`](/nomad/docs/job-specification/network#mode) +to `bridge`, or to an appropriately configured `cni/*` network. + +
+ + Using a custom cni/* network with Consul service mesh requires + extra care. + + +Given the variety of network configurations, the Nomad team and enterprise +support are limited in our ability to support custom network configurations. +Use custom CNI networks with Consul service mesh at your own risk. That said, +you may model your network configuration on Nomad's [`bridge` +network](/nomad/docs/networking/cni#create-a-custom-bridge-mode-configuration-with-cni-plugins). + +Consider these qualities when configuring your network: + + * Nomad provides an isolated network namespace, but your CNI configuration + should not expose the main task(s) to the host network. + * Incoming traffic needs to be able to reach the sidecar at the IP:port + which will be advertised on the sidecar service. + * Traffic needs to be able to flow from different allocs' sidecars to + one another. + +
+ ## Nomad Consul service mesh example The following section walks through an example to enable secure communication