diff --git a/.changelog/13621.txt b/.changelog/13621.txt new file mode 100644 index 000000000..acdffdf07 --- /dev/null +++ b/.changelog/13621.txt @@ -0,0 +1,3 @@ +```release-note:improvement +api: HTTP server now returns a 429 error code when hitting the connection limit +``` diff --git a/command/agent/http.go b/command/agent/http.go index 2df2a6ac4..b0676ec82 100644 --- a/command/agent/http.go +++ b/command/agent/http.go @@ -14,6 +14,7 @@ import ( "strings" "time" + "github.com/armon/go-metrics" assetfs "github.com/elazarl/go-bindata-assetfs" "github.com/gorilla/handlers" "github.com/gorilla/websocket" @@ -22,6 +23,7 @@ import ( "github.com/hashicorp/go-msgpack/codec" multierror "github.com/hashicorp/go-multierror" "github.com/rs/cors" + "golang.org/x/time/rate" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/helper/noxssrw" @@ -156,7 +158,7 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) { httpServer := http.Server{ Addr: srv.Addr, Handler: handlers.CompressHandler(srv.mux), - ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns), + ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, srv.logger), ErrorLog: newHTTPServerLogger(srv.logger), } @@ -213,13 +215,12 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) { // // If limit > 0, a per-address connection limit will be enabled regardless of // TLS. If connLimit == 0 there is no connection limit. -func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) func(conn net.Conn, state http.ConnState) { +func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) { + connLimiter := connLimiter(connLimit, logger) if !isTLS || handshakeTimeout == 0 { if connLimit > 0 { // Still return the connection limiter - return connlimit.NewLimiter(connlimit.Config{ - MaxConnsPerClientIP: connLimit, - }).HTTPConnStateFunc() + return connLimiter } return nil } @@ -227,9 +228,6 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) fu if connLimit > 0 { // Return conn state callback with connection limiting and a // handshake timeout. - connLimiter := connlimit.NewLimiter(connlimit.Config{ - MaxConnsPerClientIP: connLimit, - }).HTTPConnStateFunc() return func(conn net.Conn, state http.ConnState) { switch state { @@ -268,6 +266,35 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) fu } } +// connLimiter returns a connection-limiter function with a rate-limited 429-response error handler. +// The rate-limit prevents the TLS handshake necessary to write the HTTP response +// from consuming too many server resources. +func connLimiter(connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) { + // Global rate-limit of 10 responses per second with a 100-response burst. + limiter := rate.NewLimiter(10, 100) + + tooManyConnsMsg := "Your IP is issuing too many concurrent connections, please rate limit your calls\n" + tooManyRequestsResponse := []byte(fmt.Sprintf("HTTP/1.1 429 Too Many Requests\r\n"+ + "Content-Type: text/plain\r\n"+ + "Content-Length: %d\r\n"+ + "Connection: close\r\n\r\n%s", len(tooManyConnsMsg), tooManyConnsMsg)) + return connlimit.NewLimiter(connlimit.Config{ + MaxConnsPerClientIP: connLimit, + }).HTTPConnStateFuncWithErrorHandler(func(err error, conn net.Conn) { + if err == connlimit.ErrPerClientIPLimitReached { + metrics.IncrCounter([]string{"nomad", "agent", "http", "exceeded"}, 1) + if n := limiter.Reserve(); n.Delay() == 0 { + logger.Warn("Too many concurrent connections", "address", conn.RemoteAddr().String(), "limit", connLimit) + conn.SetDeadline(time.Now().Add(10 * time.Millisecond)) + conn.Write(tooManyRequestsResponse) + } else { + n.Cancel() + } + } + conn.Close() + }) +} + // tcpKeepAliveListener sets TCP keep-alive timeouts on accepted // connections. It's used by NewHttpServer so // dead TCP connections eventually go away. diff --git a/command/agent/http_test.go b/command/agent/http_test.go index 5c817c308..7beb8f080 100644 --- a/command/agent/http_test.go +++ b/command/agent/http_test.go @@ -1254,41 +1254,15 @@ func TestHTTPServer_Limits_OK(t *testing.T) { // Create a new connection that will go over the connection limit. limitConn, err := dial(t, addr, useTLS) - // At this point, the state of the connection + handshake are up in the - // air. - // - // 1) dial failed, handshake never started - // => Conn is nil + io.EOF - // 2) dial completed, handshake failed - // => Conn is malformed + (net.OpError OR io.EOF) - // 3) dial completed, handshake succeeded - // => Conn is not nil + no error, however using the Conn should - // result in io.EOF - // - // At no point should Nomad actually write through the limited Conn. - - if limitConn == nil || err != nil { - // Case 1 or Case 2 - returned Conn is useless and the error should - // be one of: - // "EOF" - // "closed network connection" - // "connection reset by peer" - msg := err.Error() - acceptable := strings.Contains(msg, "EOF") || - strings.Contains(msg, "closed network connection") || - strings.Contains(msg, "connection reset by peer") - require.True(t, acceptable) - } else { - // Case 3 - returned Conn is usable, but Nomad should not write - // anything before closing it. - buf := make([]byte, bufSize) - deadline := time.Now().Add(10 * time.Second) - require.NoError(t, limitConn.SetReadDeadline(deadline)) - n, err := limitConn.Read(buf) - require.Equal(t, io.EOF, err) - require.Zero(t, n) - require.NoError(t, limitConn.Close()) - } + response := "HTTP/1.1 429" + buf := make([]byte, len(response)) + deadline := time.Now().Add(10 * time.Second) + require.NoError(t, limitConn.SetReadDeadline(deadline)) + n, err := limitConn.Read(buf) + require.Equal(t, response, string(buf)) + require.Nil(t, err) + require.Equal(t, len(response), n) + require.NoError(t, limitConn.Close()) // Assert existing connections are ok require.Len(t, errCh, 0) diff --git a/website/content/docs/operations/metrics-reference.mdx b/website/content/docs/operations/metrics-reference.mdx index 434d5690f..125ba4484 100644 --- a/website/content/docs/operations/metrics-reference.mdx +++ b/website/content/docs/operations/metrics-reference.mdx @@ -480,6 +480,14 @@ Raft database metrics are emitted by the `raft-boltdb` library. | `nomad.raft.boltdb.txstats.write` | Count of total write operations | Integer | Counter | | `nomad.raft.boltdb.txstats.writeTime` | Sample of write operation times | Nanoseconds | Summary | +## Agent Metrics + +Agent metrics are emitted by all Nomad agents running in either client or server mode. + +| Metric | Description | Unit | Type | +| ----------------------------------------- | ----------------------------------------------------- | ----------- | ------- | +| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter | + [tagged-metrics]: /docs/telemetry/metrics#tagged-metrics [sticky]: /docs/job-specification/ephemeral_disk#sticky [s_port_plan_failure]: /s/port-plan-failure