mirror of
https://github.com/bloomberg/goldpinger.git
synced 2026-04-22 09:56:38 +00:00
Add an opt-in UDP echo probe that runs alongside the existing HTTP
ping. Each goldpinger pod listens on a configurable UDP port (default
6969). During each ping cycle, the prober sends N sequenced packets
to the peer's listener, which echoes them back. From the replies we
compute packet loss percentage, path hop count (from IPv4 TTL / IPv6
HopLimit), and average round-trip time.
New Prometheus metrics:
- goldpinger_peers_loss_pct (gauge) — per-peer UDP loss %
- goldpinger_peers_path_length (gauge) — estimated hop count
- goldpinger_peers_udp_rtt_ms (histogram) — UDP RTT in milliseconds
The graph UI shows yellow edges for links with partial loss, and
displays sub-millisecond UDP RTT instead of HTTP latency when UDP
is enabled. Stale metric labels are cleaned up when a pinger is
destroyed so rolled pods don't leave ghost entries.
Configuration (all via env vars, disabled by default):
UDP_ENABLED=true — enable UDP probing and listener
UDP_PORT=6969 — listener port
UDP_PACKET_COUNT=10 — packets per probe
UDP_PACKET_SIZE=64 — bytes per packet
UDP_TIMEOUT=1s — probe timeout
New files:
pkg/goldpinger/udp_probe.go — echo listener + probe client
pkg/goldpinger/udp_probe_test.go — unit tests
Unit tests:
```
=== RUN TestProbeUDP_NoLoss
udp_probe_test.go:51: avg UDP RTT: 0.0823 ms
--- PASS: TestProbeUDP_NoLoss (0.00s)
=== RUN TestProbeUDP_FullLoss
--- PASS: TestProbeUDP_FullLoss (0.00s)
=== RUN TestProbeUDP_PacketFormat
--- PASS: TestProbeUDP_PacketFormat (0.00s)
=== RUN TestEstimateHops
--- PASS: TestEstimateHops (0.00s)
PASS
```
Cluster test (6-node IPv6 k8s, UDP_ENABLED=true):
```
Prometheus metrics (healthy cluster, 0% loss):
goldpinger_peers_loss_pct{...,pod_ip="fd00:4:69:3::3746"} 0
goldpinger_peers_path_length{...,pod_ip="fd00:4:69:3::3746"} 0
Simulated 50% loss via ip6tables DROP in pod netns on node-0:
goldpinger_peers_loss_pct{instance="server",...} 60
goldpinger_peers_loss_pct{instance="node-1",...} 30
goldpinger_peers_loss_pct{instance="server2",...} 30
UDP RTT vs HTTP RTT (check_all API):
node-0 -> server: udp=2.18ms http=2ms
node-2 -> node-2: udp=0.40ms http=1ms
server -> node-0: udp=0.55ms http=2ms
Post-rollout stale metrics cleanup verified:
All 36 edges show 0% loss, no stale pod IPs.
```
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Cooper Ry Lees <me@cooperlees.com>
105 lines
2.4 KiB
Go
105 lines
2.4 KiB
Go
// Code generated by go-swagger; DO NOT EDIT.
|
|
|
|
package operations
|
|
|
|
import (
|
|
"encoding/json"
|
|
stderrors "errors"
|
|
"fmt"
|
|
"io"
|
|
|
|
"github.com/go-openapi/runtime"
|
|
"github.com/go-openapi/strfmt"
|
|
|
|
"github.com/bloomberg/goldpinger/v3/pkg/models"
|
|
)
|
|
|
|
// PingReader is a Reader for the Ping structure.
|
|
type PingReader struct {
|
|
formats strfmt.Registry
|
|
}
|
|
|
|
// ReadResponse reads a server response into the received o.
|
|
func (o *PingReader) ReadResponse(response runtime.ClientResponse, consumer runtime.Consumer) (any, error) {
|
|
switch response.Code() {
|
|
case 200:
|
|
result := NewPingOK()
|
|
if err := result.readResponse(response, consumer, o.formats); err != nil {
|
|
return nil, err
|
|
}
|
|
return result, nil
|
|
default:
|
|
return nil, runtime.NewAPIError("[GET /ping] ping", response, response.Code())
|
|
}
|
|
}
|
|
|
|
// NewPingOK creates a PingOK with default headers values
|
|
func NewPingOK() *PingOK {
|
|
return &PingOK{}
|
|
}
|
|
|
|
/*
|
|
PingOK describes a response with status code 200, with default header values.
|
|
|
|
return success
|
|
*/
|
|
type PingOK struct {
|
|
Payload *models.PingResults
|
|
}
|
|
|
|
// IsSuccess returns true when this ping o k response has a 2xx status code
|
|
func (o *PingOK) IsSuccess() bool {
|
|
return true
|
|
}
|
|
|
|
// IsRedirect returns true when this ping o k response has a 3xx status code
|
|
func (o *PingOK) IsRedirect() bool {
|
|
return false
|
|
}
|
|
|
|
// IsClientError returns true when this ping o k response has a 4xx status code
|
|
func (o *PingOK) IsClientError() bool {
|
|
return false
|
|
}
|
|
|
|
// IsServerError returns true when this ping o k response has a 5xx status code
|
|
func (o *PingOK) IsServerError() bool {
|
|
return false
|
|
}
|
|
|
|
// IsCode returns true when this ping o k response a status code equal to that given
|
|
func (o *PingOK) IsCode(code int) bool {
|
|
return code == 200
|
|
}
|
|
|
|
// Code gets the status code for the ping o k response
|
|
func (o *PingOK) Code() int {
|
|
return 200
|
|
}
|
|
|
|
func (o *PingOK) Error() string {
|
|
payload, _ := json.Marshal(o.Payload)
|
|
return fmt.Sprintf("[GET /ping][%d] pingOK %s", 200, payload)
|
|
}
|
|
|
|
func (o *PingOK) String() string {
|
|
payload, _ := json.Marshal(o.Payload)
|
|
return fmt.Sprintf("[GET /ping][%d] pingOK %s", 200, payload)
|
|
}
|
|
|
|
func (o *PingOK) GetPayload() *models.PingResults {
|
|
return o.Payload
|
|
}
|
|
|
|
func (o *PingOK) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
|
|
|
|
o.Payload = new(models.PingResults)
|
|
|
|
// response payload
|
|
if err := consumer.Consume(response.Body(), o.Payload); err != nil && !stderrors.Is(err, io.EOF) {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|