Files
goldpinger/pkg/restapi/server.go
Cooper Ry Lees 832bc7b598 Add UDP probe metrics: packet loss, hop count, and RTT
Add an opt-in UDP echo probe that runs alongside the existing HTTP
ping. Each goldpinger pod listens on a configurable UDP port (default
6969). During each ping cycle, the prober sends N sequenced packets
to the peer's listener, which echoes them back. From the replies we
compute packet loss percentage, path hop count (from IPv4 TTL / IPv6
HopLimit), and average round-trip time.

New Prometheus metrics:
  - goldpinger_peers_loss_pct      (gauge)     — per-peer UDP loss %
  - goldpinger_peers_path_length   (gauge)     — estimated hop count
  - goldpinger_peers_udp_rtt_ms    (histogram) — UDP RTT in milliseconds

The graph UI shows yellow edges for links with partial loss, and
displays sub-millisecond UDP RTT instead of HTTP latency when UDP
is enabled. Stale metric labels are cleaned up when a pinger is
destroyed so rolled pods don't leave ghost entries.

Configuration (all via env vars, disabled by default):
  UDP_ENABLED=true      enable UDP probing and listener
  UDP_PORT=6969         listener port
  UDP_PACKET_COUNT=10   packets per probe
  UDP_PACKET_SIZE=64    bytes per packet
  UDP_TIMEOUT=1s        probe timeout

New files:
  pkg/goldpinger/udp_probe.go       — echo listener + probe client
  pkg/goldpinger/udp_probe_test.go  — unit tests

Unit tests:
```
=== RUN   TestProbeUDP_NoLoss
    udp_probe_test.go:51: avg UDP RTT: 0.0823 ms
--- PASS: TestProbeUDP_NoLoss (0.00s)
=== RUN   TestProbeUDP_FullLoss
--- PASS: TestProbeUDP_FullLoss (0.00s)
=== RUN   TestProbeUDP_PacketFormat
--- PASS: TestProbeUDP_PacketFormat (0.00s)
=== RUN   TestEstimateHops
--- PASS: TestEstimateHops (0.00s)
PASS
```

Cluster test (6-node IPv6 k8s, UDP_ENABLED=true):
```
Prometheus metrics (healthy cluster, 0% loss):
  goldpinger_peers_loss_pct{...,pod_ip="fd00:4:69:3::3746"} 0
  goldpinger_peers_path_length{...,pod_ip="fd00:4:69:3::3746"} 0

Simulated 50% loss via ip6tables DROP in pod netns on node-0:
  goldpinger_peers_loss_pct{instance="server",...} 60
  goldpinger_peers_loss_pct{instance="node-1",...} 30
  goldpinger_peers_loss_pct{instance="server2",...} 30

UDP RTT vs HTTP RTT (check_all API):
  node-0 -> server:  udp=2.18ms  http=2ms
  node-2 -> node-2:  udp=0.40ms  http=1ms
  server -> node-0:  udp=0.55ms  http=2ms

Post-rollout stale metrics cleanup verified:
  All 36 edges show 0% loss, no stale pod IPs.
```

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Cooper Ry Lees <me@cooperlees.com>
2026-03-27 16:05:32 +00:00

508 lines
15 KiB
Go

// Code generated by go-swagger; DO NOT EDIT.
package restapi
import (
"context"
"crypto/tls"
"crypto/x509"
"errors"
"log"
"net"
"net/http"
"os"
"os/signal"
"strconv"
"sync"
"sync/atomic"
"syscall"
"time"
flags "github.com/jessevdk/go-flags"
"golang.org/x/net/netutil"
"github.com/go-openapi/runtime/flagext"
"github.com/go-openapi/swag"
"github.com/bloomberg/goldpinger/v3/pkg/restapi/operations"
)
const (
schemeHTTP = "http"
schemeHTTPS = "https"
schemeUnix = "unix"
)
var defaultSchemes []string
func init() {
defaultSchemes = []string{
schemeHTTP,
}
}
// NewServer creates a new api goldpinger server but does not configure it
func NewServer(api *operations.GoldpingerAPI) *Server {
s := new(Server)
s.shutdown = make(chan struct{})
s.api = api
s.interrupt = make(chan os.Signal, 1)
return s
}
// ConfigureAPI configures the API and handlers.
func (s *Server) ConfigureAPI() {
if s.api != nil {
s.handler = configureAPI(s.api)
}
}
// ConfigureFlags configures the additional flags defined by the handlers. Needs to be called before the parser.Parse
func (s *Server) ConfigureFlags() {
if s.api != nil {
configureFlags(s.api)
}
}
// Server for the goldpinger API
type Server struct {
EnabledListeners []string `long:"scheme" description:"the listeners to enable, this can be repeated and defaults to the schemes in the swagger spec"`
CleanupTimeout time.Duration `long:"cleanup-timeout" description:"grace period for which to wait before killing idle connections" default:"10s"`
GracefulTimeout time.Duration `long:"graceful-timeout" description:"grace period for which to wait before shutting down the server" default:"15s"`
MaxHeaderSize flagext.ByteSize `long:"max-header-size" description:"controls the maximum number of bytes the server will read parsing the request header's keys and values, including the request line. It does not limit the size of the request body." default:"1MiB"`
SocketPath flags.Filename `long:"socket-path" description:"the unix socket to listen on" default:"/var/run/goldpinger.sock"`
domainSocketL net.Listener
Host string `long:"host" description:"the IP to listen on" default:"localhost" env:"HOST"`
Port int `long:"port" description:"the port to listen on for insecure connections, defaults to a random value" env:"PORT"`
ListenLimit int `long:"listen-limit" description:"limit the number of outstanding requests"`
KeepAlive time.Duration `long:"keep-alive" description:"sets the TCP keep-alive timeouts on accepted connections. It prunes dead TCP connections ( e.g. closing laptop mid-download)" default:"3m"`
ReadTimeout time.Duration `long:"read-timeout" description:"maximum duration before timing out read of the request" default:"30s"`
WriteTimeout time.Duration `long:"write-timeout" description:"maximum duration before timing out write of the response" default:"30s"`
httpServerL net.Listener
TLSHost string `long:"tls-host" description:"the IP to listen on for tls, when not specified it's the same as --host" env:"TLS_HOST"`
TLSPort int `long:"tls-port" description:"the port to listen on for secure connections, defaults to a random value" env:"TLS_PORT"`
TLSCertificate flags.Filename `long:"tls-certificate" description:"the certificate to use for secure connections" env:"TLS_CERTIFICATE"`
TLSCertificateKey flags.Filename `long:"tls-key" description:"the private key to use for secure connections" env:"TLS_PRIVATE_KEY"`
TLSCACertificate flags.Filename `long:"tls-ca" description:"the certificate authority file to be used with mutual tls auth" env:"TLS_CA_CERTIFICATE"`
TLSListenLimit int `long:"tls-listen-limit" description:"limit the number of outstanding requests"`
TLSKeepAlive time.Duration `long:"tls-keep-alive" description:"sets the TCP keep-alive timeouts on accepted connections. It prunes dead TCP connections ( e.g. closing laptop mid-download)"`
TLSReadTimeout time.Duration `long:"tls-read-timeout" description:"maximum duration before timing out read of the request"`
TLSWriteTimeout time.Duration `long:"tls-write-timeout" description:"maximum duration before timing out write of the response"`
httpsServerL net.Listener
api *operations.GoldpingerAPI
handler http.Handler
hasListeners bool
shutdown chan struct{}
shuttingDown int32
interrupted bool
interrupt chan os.Signal
}
// Logf logs message either via defined user logger or via system one if no user logger is defined.
func (s *Server) Logf(f string, args ...any) {
if s.api != nil && s.api.Logger != nil {
s.api.Logger(f, args...)
} else {
log.Printf(f, args...)
}
}
// Fatalf logs message either via defined user logger or via system one if no user logger is defined.
// Exits with non-zero status after printing
func (s *Server) Fatalf(f string, args ...any) {
if s.api != nil && s.api.Logger != nil {
s.api.Logger(f, args...)
os.Exit(1)
} else {
log.Fatalf(f, args...)
}
}
// SetAPI configures the server with the specified API. Needs to be called before Serve
func (s *Server) SetAPI(api *operations.GoldpingerAPI) {
if api == nil {
s.api = nil
s.handler = nil
return
}
s.api = api
s.handler = configureAPI(api)
}
func (s *Server) hasScheme(scheme string) bool {
schemes := s.EnabledListeners
if len(schemes) == 0 {
schemes = defaultSchemes
}
for _, v := range schemes {
if v == scheme {
return true
}
}
return false
}
// Serve the api
func (s *Server) Serve() (err error) {
if !s.hasListeners {
if err = s.Listen(); err != nil {
return err
}
}
// set default handler, if none is set
if s.handler == nil {
if s.api == nil {
return errors.New("can't create the default handler, as no api is set")
}
s.SetHandler(s.api.Serve(nil))
}
wg := new(sync.WaitGroup)
once := new(sync.Once)
signalNotify(s.interrupt)
go handleInterrupt(once, s)
servers := []*http.Server{}
if s.hasScheme(schemeUnix) {
domainSocket := new(http.Server)
domainSocket.MaxHeaderBytes = int(s.MaxHeaderSize)
domainSocket.Handler = s.handler
if int64(s.CleanupTimeout) > 0 {
domainSocket.IdleTimeout = s.CleanupTimeout
}
configureServer(domainSocket, "unix", string(s.SocketPath))
servers = append(servers, domainSocket)
wg.Add(1)
s.Logf("Serving goldpinger at unix://%s", s.SocketPath)
go func(l net.Listener) {
defer wg.Done()
if errServe := domainSocket.Serve(l); errServe != nil && !errors.Is(errServe, http.ErrServerClosed) {
s.Fatalf("%v", errServe)
}
s.Logf("Stopped serving goldpinger at unix://%s", s.SocketPath)
}(s.domainSocketL)
}
if s.hasScheme(schemeHTTP) {
httpServer := new(http.Server)
httpServer.MaxHeaderBytes = int(s.MaxHeaderSize)
httpServer.ReadTimeout = s.ReadTimeout
httpServer.WriteTimeout = s.WriteTimeout
httpServer.SetKeepAlivesEnabled(int64(s.KeepAlive) > 0)
if s.ListenLimit > 0 {
s.httpServerL = netutil.LimitListener(s.httpServerL, s.ListenLimit)
}
if int64(s.CleanupTimeout) > 0 {
httpServer.IdleTimeout = s.CleanupTimeout
}
httpServer.Handler = s.handler
configureServer(httpServer, "http", s.httpServerL.Addr().String())
servers = append(servers, httpServer)
wg.Add(1)
s.Logf("Serving goldpinger at http://%s", s.httpServerL.Addr())
go func(l net.Listener) {
defer wg.Done()
if errServe := httpServer.Serve(l); errServe != nil && !errors.Is(errServe, http.ErrServerClosed) {
s.Fatalf("%v", errServe)
}
s.Logf("Stopped serving goldpinger at http://%s", l.Addr())
}(s.httpServerL)
}
if s.hasScheme(schemeHTTPS) {
httpsServer := new(http.Server)
httpsServer.MaxHeaderBytes = int(s.MaxHeaderSize)
httpsServer.ReadTimeout = s.TLSReadTimeout
httpsServer.WriteTimeout = s.TLSWriteTimeout
httpsServer.SetKeepAlivesEnabled(int64(s.TLSKeepAlive) > 0)
if s.TLSListenLimit > 0 {
s.httpsServerL = netutil.LimitListener(s.httpsServerL, s.TLSListenLimit)
}
if int64(s.CleanupTimeout) > 0 {
httpsServer.IdleTimeout = s.CleanupTimeout
}
httpsServer.Handler = s.handler
// Inspired by https://blog.bracebin.com/achieving-perfect-ssl-labs-score-with-go
httpsServer.TLSConfig = &tls.Config{
// Causes servers to use Go's default ciphersuite preferences,
// which are tuned to avoid attacks. Does nothing on clients.
PreferServerCipherSuites: true,
// Only use curves which have assembly implementations
// https://github.com/golang/go/tree/master/src/crypto/elliptic
CurvePreferences: []tls.CurveID{tls.CurveP256},
// Use modern tls mode https://wiki.mozilla.org/Security/Server_Side_TLS#Modern_compatibility
NextProtos: []string{"h2", "http/1.1"},
// https://www.owasp.org/index.php/Transport_Layer_Protection_Cheat_Sheet#Rule_-_Only_Support_Strong_Protocols
MinVersion: tls.VersionTLS12,
// These ciphersuites support Forward Secrecy: https://en.wikipedia.org/wiki/Forward_secrecy
CipherSuites: []uint16{
tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,
},
}
// build standard config from server options
if s.TLSCertificate != "" && s.TLSCertificateKey != "" {
httpsServer.TLSConfig.Certificates = make([]tls.Certificate, 1)
httpsServer.TLSConfig.Certificates[0], err = tls.LoadX509KeyPair(string(s.TLSCertificate), string(s.TLSCertificateKey))
if err != nil {
return err
}
}
if s.TLSCACertificate != "" {
// include specified CA certificate
caCert, caCertErr := os.ReadFile(string(s.TLSCACertificate))
if caCertErr != nil {
return caCertErr
}
caCertPool := x509.NewCertPool()
ok := caCertPool.AppendCertsFromPEM(caCert)
if !ok {
return errors.New("cannot parse CA certificate")
}
httpsServer.TLSConfig.ClientCAs = caCertPool
httpsServer.TLSConfig.ClientAuth = tls.RequireAndVerifyClientCert
}
// call custom TLS configurator
configureTLS(httpsServer.TLSConfig)
if len(httpsServer.TLSConfig.Certificates) == 0 && httpsServer.TLSConfig.GetCertificate == nil {
// after standard and custom config are passed, this ends up with no certificate
if s.TLSCertificate == "" {
if s.TLSCertificateKey == "" {
s.Fatalf("the required flags `--tls-certificate` and `--tls-key` were not specified")
}
s.Fatalf("the required flag `--tls-certificate` was not specified")
}
if s.TLSCertificateKey == "" {
s.Fatalf("the required flag `--tls-key` was not specified")
}
// this happens with a wrong custom TLS configurator
s.Fatalf("no certificate was configured for TLS")
}
configureServer(httpsServer, "https", s.httpsServerL.Addr().String())
servers = append(servers, httpsServer)
wg.Add(1)
s.Logf("Serving goldpinger at https://%s", s.httpsServerL.Addr())
go func(l net.Listener) {
defer wg.Done()
if errServe := httpsServer.Serve(l); errServe != nil && !errors.Is(errServe, http.ErrServerClosed) {
s.Fatalf("%v", errServe)
}
s.Logf("Stopped serving goldpinger at https://%s", l.Addr())
}(tls.NewListener(s.httpsServerL, httpsServer.TLSConfig))
}
wg.Add(1)
go s.handleShutdown(wg, &servers)
wg.Wait()
return nil
}
// Listen creates the listeners for the server
func (s *Server) Listen() error {
if s.hasListeners { // already done this
return nil
}
if s.hasScheme(schemeHTTPS) {
// Use http host if https host wasn't defined
if s.TLSHost == "" {
s.TLSHost = s.Host
}
// Use http listen limit if https listen limit wasn't defined
if s.TLSListenLimit == 0 {
s.TLSListenLimit = s.ListenLimit
}
// Use http tcp keep alive if https tcp keep alive wasn't defined
if int64(s.TLSKeepAlive) == 0 {
s.TLSKeepAlive = s.KeepAlive
}
// Use http read timeout if https read timeout wasn't defined
if int64(s.TLSReadTimeout) == 0 {
s.TLSReadTimeout = s.ReadTimeout
}
// Use http write timeout if https write timeout wasn't defined
if int64(s.TLSWriteTimeout) == 0 {
s.TLSWriteTimeout = s.WriteTimeout
}
}
if s.hasScheme(schemeUnix) {
domSockListener, err := net.Listen("unix", string(s.SocketPath))
if err != nil {
return err
}
s.domainSocketL = domSockListener
}
if s.hasScheme(schemeHTTP) {
listener, err := net.Listen("tcp", net.JoinHostPort(s.Host, strconv.Itoa(s.Port)))
if err != nil {
return err
}
h, p, err := swag.SplitHostPort(listener.Addr().String())
if err != nil {
return err
}
s.Host = h
s.Port = p
s.httpServerL = listener
}
if s.hasScheme(schemeHTTPS) {
tlsListener, err := net.Listen("tcp", net.JoinHostPort(s.TLSHost, strconv.Itoa(s.TLSPort)))
if err != nil {
return err
}
sh, sp, err := swag.SplitHostPort(tlsListener.Addr().String())
if err != nil {
return err
}
s.TLSHost = sh
s.TLSPort = sp
s.httpsServerL = tlsListener
}
s.hasListeners = true
return nil
}
// Shutdown server and clean up resources
func (s *Server) Shutdown() error {
if atomic.CompareAndSwapInt32(&s.shuttingDown, 0, 1) {
close(s.shutdown)
}
return nil
}
func (s *Server) handleShutdown(wg *sync.WaitGroup, serversPtr *[]*http.Server) {
// wg.Done must occur last, after s.api.ServerShutdown()
// (to preserve old behaviour)
defer wg.Done()
<-s.shutdown
servers := *serversPtr
ctx, cancel := context.WithTimeout(context.TODO(), s.GracefulTimeout)
defer cancel()
// first execute the pre-shutdown hook
s.api.PreServerShutdown()
shutdownChan := make(chan bool)
for i := range servers {
server := servers[i]
go func() {
var success bool
defer func() {
shutdownChan <- success
}()
if err := server.Shutdown(ctx); err != nil {
// Error from closing listeners, or context timeout:
s.Logf("HTTP server Shutdown: %v", err)
} else {
success = true
}
}()
}
// Wait until all listeners have successfully shut down before calling ServerShutdown
success := true
for range servers {
success = success && <-shutdownChan
}
if success {
s.api.ServerShutdown()
}
}
// GetHandler returns a handler useful for testing
func (s *Server) GetHandler() http.Handler {
return s.handler
}
// SetHandler allows for setting a http handler on this server
func (s *Server) SetHandler(handler http.Handler) {
s.handler = handler
}
// UnixListener returns the domain socket listener
func (s *Server) UnixListener() (net.Listener, error) {
if !s.hasListeners {
if err := s.Listen(); err != nil {
return nil, err
}
}
return s.domainSocketL, nil
}
// HTTPListener returns the http listener
func (s *Server) HTTPListener() (net.Listener, error) {
if !s.hasListeners {
if err := s.Listen(); err != nil {
return nil, err
}
}
return s.httpServerL, nil
}
// TLSListener returns the https listener
func (s *Server) TLSListener() (net.Listener, error) {
if !s.hasListeners {
if err := s.Listen(); err != nil {
return nil, err
}
}
return s.httpsServerL, nil
}
func handleInterrupt(once *sync.Once, s *Server) {
once.Do(func() {
for range s.interrupt {
if s.interrupted {
s.Logf("Server already shutting down")
continue
}
s.interrupted = true
s.Logf("Shutting down... ")
if err := s.Shutdown(); err != nil {
s.Logf("HTTP server Shutdown: %v", err)
}
}
})
}
func signalNotify(interrupt chan<- os.Signal) {
signal.Notify(interrupt, syscall.SIGINT, syscall.SIGTERM)
}