api: acl bootstrap errors aren't 500

Noticed that ACL endpoints return 500 status code for user errors. This is confusing and can lead to false monitoring alerts. Here, I introduce a concept of RPCCoded errors to be returned by RPC that signal a code in addition to error message. Codes for now match HTTP codes to ease reasoning. ``` $ nomad acl bootstrap Error bootstrapping: Unexpected response code: 500 (ACL bootstrap already done (reset index: 9)) $ nomad acl bootstrap Error bootstrapping: Unexpected response code: 400 (ACL bootstrap already done (reset index: 9)) ```
2026-01-05 09:55:44 +03:00 · 2019-10-04 10:08:03 -04:00
parent 62751321bf
commit fd66b9c9ab
5 changed files with 102 additions and 19 deletions
--- a/command/agent/http.go
+++ b/command/agent/http.go
@@ -304,6 +304,9 @@ func (s *HTTPServer) wrap(handler func(resp http.ResponseWriter, req *http.Reque
 			errMsg := err.Error()
 			if http, ok := err.(HTTPCodedError); ok {
 				code = http.Code()
+			} else if ecode, emsg, ok := structs.CodeFromRPCCodedErr(err); ok {
+				code = ecode
+				errMsg = emsg
 			} else {
 				// RPC errors get wrapped, so manually unwrap by only looking at their suffix
 				if strings.HasSuffix(errMsg, structs.ErrPermissionDenied.Error()) {
--- a/nomad/acl_endpoint.go
+++ b/nomad/acl_endpoint.go
@@ -18,7 +18,7 @@ import (

 var (
 	// aclDisabled is returned when an ACL endpoint is hit but ACLs are not enabled
-	aclDisabled = fmt.Errorf("ACL support disabled")
+	aclDisabled = structs.NewErrRPCCoded(400, "ACL support disabled")
 )

 const (
@@ -55,13 +55,13 @@ func (a *ACL) UpsertPolicies(args *structs.ACLPolicyUpsertRequest, reply *struct

 	// Validate non-zero set of policies
 	if len(args.Policies) == 0 {
-		return fmt.Errorf("must specify as least one policy")
+		return structs.NewErrRPCCoded(400, "must specify as least one policy")
 	}

 	// Validate each policy, compute hash
 	for idx, policy := range args.Policies {
 		if err := policy.Validate(); err != nil {
-			return fmt.Errorf("policy %d invalid: %v", idx, err)
+			return structs.NewErrRPCCodedf(404, "policy %d invalid: %v", idx, err)
 		}
 		policy.SetHash()
 	}
@@ -99,7 +99,7 @@ func (a *ACL) DeletePolicies(args *structs.ACLPolicyDeleteRequest, reply *struct

 	// Validate non-zero set of policies
 	if len(args.Names) == 0 {
-		return fmt.Errorf("must specify as least one policy")
+		return structs.NewErrRPCCoded(400, "must specify as least one policy")
 	}

 	// Update via Raft
@@ -370,9 +370,9 @@ func (a *ACL) Bootstrap(args *structs.ACLTokenBootstrapRequest, reply *structs.A
 		// Check if there is a reset index specified
 		specifiedIndex := a.fileBootstrapResetIndex()
 		if specifiedIndex == 0 {
-			return fmt.Errorf("ACL bootstrap already done (reset index: %d)", resetIdx)
+			return structs.NewErrRPCCodedf(400, "ACL bootstrap already done (reset index: %d)", resetIdx)
 		} else if specifiedIndex != resetIdx {
-			return fmt.Errorf("Invalid bootstrap reset index (specified %d, reset index: %d)", specifiedIndex, resetIdx)
+			return structs.NewErrRPCCodedf(400, "Invalid bootstrap reset index (specified %d, reset index: %d)", specifiedIndex, resetIdx)
 		}

 		// Setup the reset index to allow bootstrapping again
@@ -404,7 +404,7 @@ func (a *ACL) Bootstrap(args *structs.ACLTokenBootstrapRequest, reply *structs.A
 	}
 	out, err := state.ACLTokenByAccessorID(nil, args.Token.AccessorID)
 	if err != nil {
-		return fmt.Errorf("token lookup failed: %v", err)
+		return structs.NewErrRPCCodedf(400, "token lookup failed: %v", err)
 	}
 	reply.Tokens = append(reply.Tokens, out)

@@ -448,7 +448,7 @@ func (a *ACL) UpsertTokens(args *structs.ACLTokenUpsertRequest, reply *structs.A

 	// Validate non-zero set of tokens
 	if len(args.Tokens) == 0 {
-		return fmt.Errorf("must specify as least one token")
+		return structs.NewErrRPCCoded(400, "must specify as least one token")
 	}

 	// Force the request to the authoritative region if we are creating global tokens
@@ -466,7 +466,7 @@ func (a *ACL) UpsertTokens(args *structs.ACLTokenUpsertRequest, reply *structs.A
 	// the entire request as a single batch.
 	if hasGlobal {
 		if !allGlobal {
-			return fmt.Errorf("cannot upsert mixed global and non-global tokens")
+			return structs.NewErrRPCCoded(400, "cannot upsert mixed global and non-global tokens")
 		}

 		// Force the request to the authoritative region if it has global
@@ -494,7 +494,7 @@ func (a *ACL) UpsertTokens(args *structs.ACLTokenUpsertRequest, reply *structs.A
 	// Validate each token
 	for idx, token := range args.Tokens {
 		if err := token.Validate(); err != nil {
-			return fmt.Errorf("token %d invalid: %v", idx, err)
+			return structs.NewErrRPCCodedf(400, "token %d invalid: %v", idx, err)
 		}

 		// Generate an accessor and secret ID if new
@@ -507,15 +507,15 @@ func (a *ACL) UpsertTokens(args *structs.ACLTokenUpsertRequest, reply *structs.A
 			// Verify the token exists
 			out, err := state.ACLTokenByAccessorID(nil, token.AccessorID)
 			if err != nil {
-				return fmt.Errorf("token lookup failed: %v", err)
+				return structs.NewErrRPCCodedf(400, "token lookup failed: %v", err)
 			}
 			if out == nil {
-				return fmt.Errorf("cannot find token %s", token.AccessorID)
+				return structs.NewErrRPCCodedf(400, "cannot find token %s", token.AccessorID)
 			}

 			// Cannot toggle the "Global" mode
 			if token.Global != out.Global {
-				return fmt.Errorf("cannot toggle global mode of %s", token.AccessorID)
+				return structs.NewErrRPCCodedf(400, "cannot toggle global mode of %s", token.AccessorID)
 			}
 		}

@@ -538,7 +538,7 @@ func (a *ACL) UpsertTokens(args *structs.ACLTokenUpsertRequest, reply *structs.A
 	for _, token := range args.Tokens {
 		out, err := state.ACLTokenByAccessorID(nil, token.AccessorID)
 		if err != nil {
-			return fmt.Errorf("token lookup failed: %v", err)
+			return structs.NewErrRPCCodedf(400, "token lookup failed: %v", err)
 		}
 		reply.Tokens = append(reply.Tokens, out)
 	}
@@ -557,7 +557,7 @@ func (a *ACL) DeleteTokens(args *structs.ACLTokenDeleteRequest, reply *structs.G

 	// Validate non-zero set of tokens
 	if len(args.AccessorIDs) == 0 {
-		return fmt.Errorf("must specify as least one token")
+		return structs.NewErrRPCCoded(400, "must specify as least one token")
 	}

 	if done, err := a.srv.forward("ACL.DeleteTokens", args, args, reply); done {
@@ -585,7 +585,7 @@ func (a *ACL) DeleteTokens(args *structs.ACLTokenDeleteRequest, reply *structs.G
 	for _, accessor := range args.AccessorIDs {
 		token, err := state.ACLTokenByAccessorID(nil, accessor)
 		if err != nil {
-			return fmt.Errorf("token lookup failed: %v", err)
+			return structs.NewErrRPCCodedf(400, "token lookup failed: %v", err)
 		}
 		if token == nil {
 			nonexistentTokens = append(nonexistentTokens, accessor)
@@ -599,14 +599,14 @@ func (a *ACL) DeleteTokens(args *structs.ACLTokenDeleteRequest, reply *structs.G
 	}

 	if len(nonexistentTokens) != 0 {
-		return fmt.Errorf("Cannot delete nonexistent tokens: %v", strings.Join(nonexistentTokens, ", "))
+		return structs.NewErrRPCCodedf(400, "Cannot delete nonexistent tokens: %v", strings.Join(nonexistentTokens, ", "))
 	}

 	// Disallow mixed requests with global and non-global tokens since we forward
 	// the entire request as a single batch.
 	if hasGlobal {
 		if !allGlobal {
-			return fmt.Errorf("cannot delete mixed global and non-global tokens")
+			return structs.NewErrRPCCoded(400, "cannot delete mixed global and non-global tokens")
 		}

 		// Force the request to the authoritative region if it has global
--- a/nomad/acl_endpoint_test.go
+++ b/nomad/acl_endpoint_test.go
@@ -931,7 +931,7 @@ func TestACLEndpoint_DeleteTokens_WithNonexistentToken(t *testing.T) {

 	assert.NotNil(err)
 	expectedError := fmt.Sprintf("Cannot delete nonexistent tokens: %s", nonexistentToken.AccessorID)
-	assert.Contains(expectedError, err.Error())
+	assert.Contains(err.Error(), expectedError)
 }

 func TestACLEndpoint_Bootstrap(t *testing.T) {
--- a/nomad/structs/errors.go
+++ b/nomad/structs/errors.go
@@ -3,6 +3,7 @@ package structs
 import (
 	"errors"
 	"fmt"
+	"strconv"
 	"strings"
 )

@@ -25,6 +26,8 @@ const (
 	ErrUnknownJobPrefix        = "Unknown job"
 	ErrUnknownEvaluationPrefix = "Unknown evaluation"
 	ErrUnknownDeploymentPrefix = "Unknown deployment"
+
+	errRPCCodedErrorPrefix = "RPC_ERROR::"
 )

 var (
@@ -144,3 +147,31 @@ func IsErrUnknownNomadVersion(err error) bool {
 func IsErrNodeLacksRpc(err error) bool {
 	return err != nil && strings.Contains(err.Error(), errNodeLacksRpc)
 }
+
+func NewErrRPCCoded(code int, msg string) error {
+	return fmt.Errorf("%s%d,%s", errRPCCodedErrorPrefix, code, msg)
+}
+
+func NewErrRPCCodedf(code int, format string, args ...interface{}) error {
+	msg := fmt.Sprintf(format, args...)
+	return fmt.Errorf("%s%d,%s", errRPCCodedErrorPrefix, code, msg)
+}
+
+func CodeFromRPCCodedErr(err error) (code int, msg string, ok bool) {
+	if err == nil || !strings.HasPrefix(err.Error(), errRPCCodedErrorPrefix) {
+		return 0, "", false
+	}
+
+	headerLen := len(errRPCCodedErrorPrefix)
+	parts := strings.SplitN(err.Error()[headerLen:], ",", 2)
+	if len(parts) != 2 {
+		return 0, "", false
+	}
+
+	code, err = strconv.Atoi(parts[0])
+	if err != nil {
+		return 0, "", false
+	}
+
+	return code, parts[1], true
+}
--- a/nomad/structs/errors_test.go
+++ b/nomad/structs/errors_test.go
@@ -0,0 +1,49 @@
+package structs
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestRPCCodedErrors(t *testing.T) {
+	cases := []struct {
+		err     error
+		code    int
+		message string
+	}{
+		{
+			NewErrRPCCoded(400, "a test message,here"),
+			400,
+			"a test message,here",
+		},
+		{
+			NewErrRPCCodedf(500, "a test message,here %s %s", "and,here%s", "second"),
+			500,
+			"a test message,here and,here%s second",
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.err.Error(), func(t *testing.T) {
+			code, msg, ok := CodeFromRPCCodedErr(c.err)
+			assert.True(t, ok)
+			assert.Equal(t, c.code, code)
+			assert.Equal(t, c.message, msg)
+		})
+	}
+
+	negativeCases := []string{
+		"random error",
+		errRPCCodedErrorPrefix,
+		errRPCCodedErrorPrefix + "123",
+		errRPCCodedErrorPrefix + "qwer,asdf",
+	}
+	for _, c := range negativeCases {
+		t.Run(c, func(t *testing.T) {
+			_, _, ok := CodeFromRPCCodedErr(errors.New(c))
+			assert.False(t, ok)
+		})
+	}
+}