5 Commits

Author SHA1 Message Date
skamboj
6f2fec6074 Merge pull request #169 from cooperlees/stale_pod_ips
Delete stale pod IPs
2026-04-23 11:37:08 -04:00
Sachin Kamboj
4036ef6a9a chore(version): bump to v3.11.2
Signed-off-by: Sachin Kamboj <skamboj1@bloomberg.net>
2026-04-23 11:20:18 -04:00
Cooper Ry Lees
4308eb1fa5 Also prune response-time histogram "check" call_type on peer removal
Address PR #169 review: DeletePeerMetrics only removed the "ping"
call_type label set, leaving stale "check" samples from CheckAllPods
in /metrics after a peer rolls. goldpingerResponseTimePeersHistogram
is observed at two sites — Pinger (pinger.go:56, "ping") and
CheckAllPods via GetLabeledPeersCallsTimer (client.go:209, "check")
— so both must be pruned.

Missed on the first pass because I traced the Pinger observation
(the hot path) and did not separately check for the "check" label
value emitted from the /check_all handler. Tests now seed both
call_types before teardown so either gap regresses.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Cooper Ry Lees <me@cooperlees.com>
2026-04-22 13:54:01 +00:00
Cooper Ry Lees
e08e21f15c Use table-driven subtests with IPv4, IPv6, and mixed address families
Converts the three DeletePeer* tests to table-driven subtests covering
IPv4, IPv6 (RFC 3849 2001:db8::/32 documentation prefix), and mixed
address-family scenarios. This matches the IPv6-only cluster where the
stale metrics bug was originally observed (#167).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Cooper Ry Lees <me@cooperlees.com>
2026-04-20 14:02:40 -05:00
Cooper Ry Lees
5e625bbd40 Prune stale Prometheus metrics for defunct peer pod IPs on teardown
After a DaemonSet rolling update, goldpinger retained response-time
histogram and error counter series for old pod IPs that no longer exist.
These stale single-sample series skewed P95/P99 latency calculations and
made transient rollout errors appear permanent. (Fixes #167)

The existing destroyPingers path only cleaned UDP-specific per-peer
metrics (and only when UDP was enabled). This adds:

- DeletePeerMetrics(): removes goldpinger_peers_response_time_s histogram
  label sets for destroyed peers, called unconditionally on pinger teardown
- goldpinger_udp_errors_total cleanup in DeletePeerUDPMetrics(), which was
  previously missed

Testing:
- TestDeletePeerMetrics_CleansResponseTimeHistogram: verifies the
  response-time histogram label set is removed after DeletePeerMetrics()
- TestDeletePeerMetrics_LeavesOtherPeersIntact: verifies pruning one
  peer does not affect another peer's metric series
- TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics: extended to also
  verify goldpinger_udp_errors_total cleanup
- All 11 tests pass (go test ./pkg/goldpinger/ -v)

Validated on a 6-node IPv6 kubeadm cluster by upgrading goldpinger with
a rolling update and confirming /metrics only contains current pod IPs
after the rollout completes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Cooper Ry Lees <me@cooperlees.com>
2026-04-20 14:02:40 -05:00
5 changed files with 187 additions and 38 deletions

View File

@@ -1,5 +1,5 @@
name ?= goldpinger
version ?= v3.11.1
version ?= v3.11.2
bin ?= goldpinger
pkg ?= "github.com/bloomberg/goldpinger"
tag = $(name):$(version)

View File

@@ -1,7 +1,7 @@
apiVersion: v1
name: goldpinger
appVersion: "3.11.1"
version: 1.1.1
appVersion: "3.11.2"
version: 1.1.2
description: Goldpinger is a tool to help debug, troubleshoot and visualize network connectivity and slowness issues.
home: https://github.com/bloomberg/goldpinger
sources:

View File

@@ -300,6 +300,15 @@ func SetPeerHopCount(hostIP, podIP string, hopCount int32) {
).Set(float64(hopCount))
}
// DeletePeerMetrics removes stale metric labels for a destroyed peer.
// The response-time histogram is observed with two call_type values:
// "ping" (continuous from Pinger) and "check" (from CheckAllPods), so both
// must be pruned. Must be called unconditionally when a peer is removed.
func DeletePeerMetrics(hostIP, podIP string) {
goldpingerResponseTimePeersHistogram.DeleteLabelValues(GoldpingerConfig.Hostname, "ping", hostIP, podIP)
goldpingerResponseTimePeersHistogram.DeleteLabelValues(GoldpingerConfig.Hostname, "check", hostIP, podIP)
}
// DeletePeerUDPMetrics removes stale UDP metric labels for a destroyed peer.
// This must be kept in sync with all per-peer UDP metrics to avoid stale
// label sets lingering in /metrics after a pod rolls.
@@ -309,6 +318,7 @@ func DeletePeerUDPMetrics(hostIP, podIP string) {
goldpingerPeersUDPRtt.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
goldpingerUDPDuplicatesCounter.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
goldpingerUDPOutOfOrderCounter.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
goldpingerUDPErrorsCounter.DeleteLabelValues(GoldpingerConfig.Hostname, pickPodHostIP(podIP, hostIP))
}
// ObservePeerUDPRtt records a UDP RTT observation in seconds

View File

@@ -7,49 +7,186 @@ import (
dto "github.com/prometheus/client_model/go"
)
// TestDeletePeerMetrics_CleansResponseTimeHistogram verifies that
// DeletePeerMetrics removes the response-time histogram label set for
// a destroyed peer. This prevents stale pod IPs from lingering in
// /metrics after rolling updates (see #167).
func TestDeletePeerMetrics_CleansResponseTimeHistogram(t *testing.T) {
tests := []struct {
name string
hostIP string
podIP string
}{
{"IPv4", "10.0.0.1", "10.0.0.2"},
{"IPv6", "2001:db8::1", "2001:db8:1::2"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
origHostname := GoldpingerConfig.Hostname
GoldpingerConfig.Hostname = "test-instance"
defer func() { GoldpingerConfig.Hostname = origHostname }()
// Simulate observations for both call_type values the histogram uses
goldpingerResponseTimePeersHistogram.WithLabelValues(
GoldpingerConfig.Hostname, "ping", tt.hostIP, tt.podIP,
).Observe(0.005)
goldpingerResponseTimePeersHistogram.WithLabelValues(
GoldpingerConfig.Hostname, "check", tt.hostIP, tt.podIP,
).Observe(0.010)
if n := countMetrics(goldpingerResponseTimePeersHistogram); n != 2 {
t.Fatalf("response time histogram has %d label sets before cleanup, want 2 — test setup is broken", n)
}
DeletePeerMetrics(tt.hostIP, tt.podIP)
if n := countMetrics(goldpingerResponseTimePeersHistogram); n != 0 {
t.Errorf("response time histogram still has %d label set(s) after DeletePeerMetrics", n)
}
})
}
}
// TestDeletePeerMetrics_LeavesOtherPeersIntact verifies that pruning
// metrics for one peer does not affect a different peer's label set.
func TestDeletePeerMetrics_LeavesOtherPeersIntact(t *testing.T) {
tests := []struct {
name string
peerA [2]string // {hostIP, podIP}
peerB [2]string
}{
{
"IPv4",
[2]string{"10.0.0.1", "10.0.0.2"},
[2]string{"10.0.0.3", "10.0.0.4"},
},
{
"IPv6",
[2]string{"2001:db8::1", "2001:db8:1::2"},
[2]string{"2001:db8::3", "2001:db8:2::4"},
},
{
"MixedV4DeleteV6Survives",
[2]string{"10.0.0.1", "10.0.0.2"},
[2]string{"2001:db8::3", "2001:db8:2::4"},
},
{
"MixedV6DeleteV4Survives",
[2]string{"2001:db8::1", "2001:db8:1::2"},
[2]string{"10.0.0.3", "10.0.0.4"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
origHostname := GoldpingerConfig.Hostname
GoldpingerConfig.Hostname = "test-instance"
defer func() { GoldpingerConfig.Hostname = origHostname }()
// Peer A — observe both call types
goldpingerResponseTimePeersHistogram.WithLabelValues(
GoldpingerConfig.Hostname, "ping", tt.peerA[0], tt.peerA[1],
).Observe(0.005)
goldpingerResponseTimePeersHistogram.WithLabelValues(
GoldpingerConfig.Hostname, "check", tt.peerA[0], tt.peerA[1],
).Observe(0.006)
SetPeerLossPct(tt.peerA[0], tt.peerA[1], 0)
// Peer B — observe both call types
goldpingerResponseTimePeersHistogram.WithLabelValues(
GoldpingerConfig.Hostname, "ping", tt.peerB[0], tt.peerB[1],
).Observe(0.010)
goldpingerResponseTimePeersHistogram.WithLabelValues(
GoldpingerConfig.Hostname, "check", tt.peerB[0], tt.peerB[1],
).Observe(0.011)
SetPeerLossPct(tt.peerB[0], tt.peerB[1], 1.5)
// Delete peer A only
DeletePeerMetrics(tt.peerA[0], tt.peerA[1])
DeletePeerUDPMetrics(tt.peerA[0], tt.peerA[1])
// Peer B's ping and check histogram entries should both survive
if n := countMetrics(goldpingerResponseTimePeersHistogram); n != 2 {
t.Errorf("response time histogram has %d label set(s), want 2 for peer B (ping+check)", n)
}
if countMetrics(goldpingerPeersLossPct) == 0 {
t.Error("loss pct gauge lost all label sets — peer B should still exist")
}
// Clean up peer B so it doesn't leak into other tests
goldpingerResponseTimePeersHistogram.DeleteLabelValues(
GoldpingerConfig.Hostname, "ping", tt.peerB[0], tt.peerB[1],
)
goldpingerResponseTimePeersHistogram.DeleteLabelValues(
GoldpingerConfig.Hostname, "check", tt.peerB[0], tt.peerB[1],
)
goldpingerPeersLossPct.DeleteLabelValues(
GoldpingerConfig.Hostname, tt.peerB[0], tt.peerB[1],
)
})
}
}
// TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics verifies that
// DeletePeerUDPMetrics removes label sets from every per-peer UDP metric.
// If a new per-peer metric is added but not cleaned up in
// DeletePeerUDPMetrics, this test will fail.
func TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics(t *testing.T) {
// Save and restore hostname since we set it for the test
origHostname := GoldpingerConfig.Hostname
GoldpingerConfig.Hostname = "test-instance"
defer func() { GoldpingerConfig.Hostname = origHostname }()
hostIP := "10.0.0.1"
podIP := "10.0.0.2"
// Populate all per-peer UDP metrics so they have label values
SetPeerLossPct(hostIP, podIP, 5.0)
SetPeerHopCount(hostIP, podIP, 2)
ObservePeerUDPRtt(hostIP, podIP, 0.001)
CountUDPDuplicates(hostIP, podIP, 1)
CountUDPOutOfOrder(hostIP, podIP, 1)
// Verify they exist before cleanup
perPeerCollectors := map[string]prometheus.Collector{
"goldpinger_peers_loss_pct": goldpingerPeersLossPct,
"goldpinger_peers_hop_count": goldpingerPeersHopCount,
"goldpinger_peers_udp_rtt_s": goldpingerPeersUDPRtt,
"goldpinger_udp_duplicates_total": goldpingerUDPDuplicatesCounter,
"goldpinger_udp_out_of_order_total": goldpingerUDPOutOfOrderCounter,
tests := []struct {
name string
hostIP string
podIP string
}{
{"IPv4", "10.0.0.1", "10.0.0.2"},
{"IPv6", "2001:db8::1", "2001:db8:1::2"},
}
for name, collector := range perPeerCollectors {
if countMetrics(collector) == 0 {
t.Fatalf("metric %s has no label values before cleanup — test setup is broken", name)
}
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
origHostname := GoldpingerConfig.Hostname
origUseHostIP := GoldpingerConfig.UseHostIP
GoldpingerConfig.Hostname = "test-instance"
GoldpingerConfig.UseHostIP = false
defer func() {
GoldpingerConfig.Hostname = origHostname
GoldpingerConfig.UseHostIP = origUseHostIP
}()
// Run cleanup
DeletePeerUDPMetrics(hostIP, podIP)
// Populate all per-peer UDP metrics so they have label values
SetPeerLossPct(tt.hostIP, tt.podIP, 5.0)
SetPeerHopCount(tt.hostIP, tt.podIP, 2)
ObservePeerUDPRtt(tt.hostIP, tt.podIP, 0.001)
CountUDPDuplicates(tt.hostIP, tt.podIP, 1)
CountUDPOutOfOrder(tt.hostIP, tt.podIP, 1)
CountUDPError(tt.podIP) // UseHostIP=false so target=podIP
// Verify all per-peer metrics are cleaned up
for name, collector := range perPeerCollectors {
if n := countMetrics(collector); n != 0 {
t.Errorf("metric %s still has %d label set(s) after DeletePeerUDPMetrics — add it to the cleanup function", name, n)
}
// Verify they exist before cleanup
perPeerCollectors := map[string]prometheus.Collector{
"goldpinger_peers_loss_pct": goldpingerPeersLossPct,
"goldpinger_peers_hop_count": goldpingerPeersHopCount,
"goldpinger_peers_udp_rtt_s": goldpingerPeersUDPRtt,
"goldpinger_udp_duplicates_total": goldpingerUDPDuplicatesCounter,
"goldpinger_udp_out_of_order_total": goldpingerUDPOutOfOrderCounter,
"goldpinger_udp_errors_total": goldpingerUDPErrorsCounter,
}
for name, collector := range perPeerCollectors {
if countMetrics(collector) == 0 {
t.Fatalf("metric %s has no label values before cleanup — test setup is broken", name)
}
}
// Run cleanup
DeletePeerUDPMetrics(tt.hostIP, tt.podIP)
// Verify all per-peer metrics are cleaned up
for name, collector := range perPeerCollectors {
if n := countMetrics(collector); n != 0 {
t.Errorf("metric %s still has %d label set(s) after DeletePeerUDPMetrics — add it to the cleanup function", name, n)
}
}
})
}
}

View File

@@ -137,7 +137,9 @@ func destroyPingers(pingers map[string]*Pinger, deletedPods map[string]*Goldping
// Close the channel to stop pinging
close(pinger.stopChan)
// Clean up stale UDP metric labels for this peer
// Clean up stale metric labels for this peer so defunct pod IPs
// don't linger in /metrics after rolling updates (see #167)
DeletePeerMetrics(pod.HostIP, pod.PodIP)
if GoldpingerConfig.UDPEnabled {
DeletePeerUDPMetrics(pod.HostIP, pod.PodIP)
}