test: fix flaky RPC TLS enforcement test (#18155)

The RPC TLS enforcment test creates network connections to a server and these
are occassionally failing in testing with `write: broken pipe` errors. This has
been an ongoing issue where it'll appear to get fixed, then reoccur, and no one
seems to be able to reproduce outside of CI. The test assertion itself is
reliable, which is why it's been hard to spend effort to hunt this down.

The failing test cases are ones that are never supposed to work b/c they fail
our TLS cert role validation. The error message is coming from the TLS handshake
error. The RPC connection handler closes the connection immediately on getting
the error from the TLS handshake. The stdlib's TLS library flushes the
connection's buffer before returning the error. So the theory is that in the
failing case we don't get the error message before the connection is closed, but
do get the error return that allows the client to move on to a write, which
tries to write on the closed pipe.

I've been unable to reproduce this exactly, as the race is effectively between
the OS and the runtime. The equivalent test of the Raft TLS enforcement includes
handling of a EOF intead of the certificate error, so it appears this actually
expected (or at least known) behavior. Because the code under test is operating
as expected, this changeset updates the assertion to accept the error.
This commit is contained in:
Tim Gross
2023-08-07 11:17:06 -04:00
committed by GitHub
parent 388198abef
commit 5d2c1d1f03

View File

@@ -15,10 +15,10 @@ import (
"os"
"path"
"path/filepath"
"regexp"
"testing"
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-msgpack/codec"
"github.com/hashicorp/go-sockaddr"
msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
@@ -1145,19 +1145,14 @@ func TestRPC_TLS_Enforcement_Raft(t *testing.T) {
func TestRPC_TLS_Enforcement_RPC(t *testing.T) {
ci.Parallel(t)
defer func() {
//TODO Avoid panics from logging during shutdown
time.Sleep(1 * time.Second)
}()
tlsHelper := newTLSTestHelper(t)
defer tlsHelper.cleanup()
t.Cleanup(tlsHelper.cleanup)
standardRPCs := map[string]interface{}{
standardRPCs := map[string]any{
"Status.Ping": &structs.GenericRequest{},
}
localServersOnlyRPCs := map[string]interface{}{
localServersOnlyRPCs := map[string]any{
"Eval.Update": &structs.EvalUpdateRequest{
WriteRequest: structs.WriteRequest{Region: "global"},
},
@@ -1187,7 +1182,7 @@ func TestRPC_TLS_Enforcement_RPC(t *testing.T) {
},
}
localClientsOnlyRPCs := map[string]interface{}{
localClientsOnlyRPCs := map[string]any{
"Alloc.GetAllocs": &structs.AllocsGetRequest{
QueryOptions: structs.QueryOptions{Region: "global"},
},
@@ -1210,7 +1205,7 @@ func TestRPC_TLS_Enforcement_RPC(t *testing.T) {
cases := []struct {
name string
cn string
rpcs map[string]interface{}
rpcs map[string]any
canRPC bool
}{
// Local server.
@@ -1325,11 +1320,20 @@ func TestRPC_TLS_Enforcement_RPC(t *testing.T) {
if tc.canRPC {
if err != nil {
require.NotContains(t, err, "certificate")
// note: lots of these RPCs will return
// validation errors after connection b/c we're
// focusing on testing TLS here
must.StrNotContains(t, err.Error(), "certificate")
}
} else {
require.Error(t, err)
require.Contains(t, err.Error(), "certificate")
// We expect "bad certificate" for these failures,
// but locally the error can return before the error
// message bytes have been received, in which case
// we immediately write on the pipe that was just
// closed by the client
must.Error(t, err)
must.RegexMatch(t,
regexp.MustCompile("(certificate|broken pipe)"), err.Error())
}
})
}
@@ -1337,7 +1341,7 @@ func TestRPC_TLS_Enforcement_RPC(t *testing.T) {
t.Run(fmt.Sprintf("nomad RPC: rpc=%s verify_hostname=false", method), func(t *testing.T) {
err := tlsHelper.nomadRPC(t, tlsHelper.nonVerifyServer, cfg, method, arg)
if err != nil {
require.NotContains(t, err, "certificate")
must.StrNotContains(t, "certificate", err.Error())
}
})
}
@@ -1387,7 +1391,7 @@ func newTLSTestHelper(t *testing.T) tlsTestHelper {
makeServer := func(bootstrapExpect int, verifyServerHostname bool) (*Server, func()) {
return TestServer(t, func(c *Config) {
c.Logger.SetLevel(hclog.Off)
c.NumSchedulers = 0
c.BootstrapExpect = bootstrapExpect
c.TLSConfig = &config.TLSConfig{
EnableRPC: true,
@@ -1441,32 +1445,34 @@ func (h tlsTestHelper) newCert(t *testing.T, name string) string {
}
func (h tlsTestHelper) connect(t *testing.T, s *Server, c *config.TLSConfig) net.Conn {
t.Helper()
conn, err := net.DialTimeout("tcp", s.config.RPCAddr.String(), time.Second)
require.NoError(t, err)
must.NoError(t, err)
// configure TLS
_, err = conn.Write([]byte{byte(pool.RpcTLS)})
require.NoError(t, err)
must.NoError(t, err)
// Client TLS verification isn't necessary for
// our assertions
tlsConf, err := tlsutil.NewTLSConfiguration(c, true, true)
require.NoError(t, err)
must.NoError(t, err)
outTLSConf, err := tlsConf.OutgoingTLSConfig()
require.NoError(t, err)
must.NoError(t, err)
outTLSConf.InsecureSkipVerify = true
tlsConn := tls.Client(conn, outTLSConf)
require.NoError(t, tlsConn.Handshake())
must.NoError(t, tlsConn.Handshake())
return tlsConn
}
func (h tlsTestHelper) nomadRPC(t *testing.T, s *Server, c *config.TLSConfig, method string, arg interface{}) error {
t.Helper()
conn := h.connect(t, s, c)
defer conn.Close()
_, err := conn.Write([]byte{byte(pool.RpcNomad)})
require.NoError(t, err)
must.NoError(t, err)
codec := pool.NewClientCodec(conn)