mirror of
https://github.com/bloomberg/goldpinger.git
synced 2026-05-28 11:33:03 +00:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6f2fec6074 | ||
|
|
4036ef6a9a | ||
|
|
4308eb1fa5 | ||
|
|
e08e21f15c | ||
|
|
5e625bbd40 |
2
Makefile
2
Makefile
@@ -1,5 +1,5 @@
|
|||||||
name ?= goldpinger
|
name ?= goldpinger
|
||||||
version ?= v3.11.1
|
version ?= v3.11.2
|
||||||
bin ?= goldpinger
|
bin ?= goldpinger
|
||||||
pkg ?= "github.com/bloomberg/goldpinger"
|
pkg ?= "github.com/bloomberg/goldpinger"
|
||||||
tag = $(name):$(version)
|
tag = $(name):$(version)
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
name: goldpinger
|
name: goldpinger
|
||||||
appVersion: "3.11.1"
|
appVersion: "3.11.2"
|
||||||
version: 1.1.1
|
version: 1.1.2
|
||||||
description: Goldpinger is a tool to help debug, troubleshoot and visualize network connectivity and slowness issues.
|
description: Goldpinger is a tool to help debug, troubleshoot and visualize network connectivity and slowness issues.
|
||||||
home: https://github.com/bloomberg/goldpinger
|
home: https://github.com/bloomberg/goldpinger
|
||||||
sources:
|
sources:
|
||||||
|
|||||||
@@ -300,6 +300,15 @@ func SetPeerHopCount(hostIP, podIP string, hopCount int32) {
|
|||||||
).Set(float64(hopCount))
|
).Set(float64(hopCount))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DeletePeerMetrics removes stale metric labels for a destroyed peer.
|
||||||
|
// The response-time histogram is observed with two call_type values:
|
||||||
|
// "ping" (continuous from Pinger) and "check" (from CheckAllPods), so both
|
||||||
|
// must be pruned. Must be called unconditionally when a peer is removed.
|
||||||
|
func DeletePeerMetrics(hostIP, podIP string) {
|
||||||
|
goldpingerResponseTimePeersHistogram.DeleteLabelValues(GoldpingerConfig.Hostname, "ping", hostIP, podIP)
|
||||||
|
goldpingerResponseTimePeersHistogram.DeleteLabelValues(GoldpingerConfig.Hostname, "check", hostIP, podIP)
|
||||||
|
}
|
||||||
|
|
||||||
// DeletePeerUDPMetrics removes stale UDP metric labels for a destroyed peer.
|
// DeletePeerUDPMetrics removes stale UDP metric labels for a destroyed peer.
|
||||||
// This must be kept in sync with all per-peer UDP metrics to avoid stale
|
// This must be kept in sync with all per-peer UDP metrics to avoid stale
|
||||||
// label sets lingering in /metrics after a pod rolls.
|
// label sets lingering in /metrics after a pod rolls.
|
||||||
@@ -309,6 +318,7 @@ func DeletePeerUDPMetrics(hostIP, podIP string) {
|
|||||||
goldpingerPeersUDPRtt.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
|
goldpingerPeersUDPRtt.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
|
||||||
goldpingerUDPDuplicatesCounter.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
|
goldpingerUDPDuplicatesCounter.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
|
||||||
goldpingerUDPOutOfOrderCounter.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
|
goldpingerUDPOutOfOrderCounter.DeleteLabelValues(GoldpingerConfig.Hostname, hostIP, podIP)
|
||||||
|
goldpingerUDPErrorsCounter.DeleteLabelValues(GoldpingerConfig.Hostname, pickPodHostIP(podIP, hostIP))
|
||||||
}
|
}
|
||||||
|
|
||||||
// ObservePeerUDPRtt records a UDP RTT observation in seconds
|
// ObservePeerUDPRtt records a UDP RTT observation in seconds
|
||||||
|
|||||||
@@ -7,49 +7,186 @@ import (
|
|||||||
dto "github.com/prometheus/client_model/go"
|
dto "github.com/prometheus/client_model/go"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TestDeletePeerMetrics_CleansResponseTimeHistogram verifies that
|
||||||
|
// DeletePeerMetrics removes the response-time histogram label set for
|
||||||
|
// a destroyed peer. This prevents stale pod IPs from lingering in
|
||||||
|
// /metrics after rolling updates (see #167).
|
||||||
|
func TestDeletePeerMetrics_CleansResponseTimeHistogram(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
hostIP string
|
||||||
|
podIP string
|
||||||
|
}{
|
||||||
|
{"IPv4", "10.0.0.1", "10.0.0.2"},
|
||||||
|
{"IPv6", "2001:db8::1", "2001:db8:1::2"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
origHostname := GoldpingerConfig.Hostname
|
||||||
|
GoldpingerConfig.Hostname = "test-instance"
|
||||||
|
defer func() { GoldpingerConfig.Hostname = origHostname }()
|
||||||
|
|
||||||
|
// Simulate observations for both call_type values the histogram uses
|
||||||
|
goldpingerResponseTimePeersHistogram.WithLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "ping", tt.hostIP, tt.podIP,
|
||||||
|
).Observe(0.005)
|
||||||
|
goldpingerResponseTimePeersHistogram.WithLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "check", tt.hostIP, tt.podIP,
|
||||||
|
).Observe(0.010)
|
||||||
|
|
||||||
|
if n := countMetrics(goldpingerResponseTimePeersHistogram); n != 2 {
|
||||||
|
t.Fatalf("response time histogram has %d label sets before cleanup, want 2 — test setup is broken", n)
|
||||||
|
}
|
||||||
|
|
||||||
|
DeletePeerMetrics(tt.hostIP, tt.podIP)
|
||||||
|
|
||||||
|
if n := countMetrics(goldpingerResponseTimePeersHistogram); n != 0 {
|
||||||
|
t.Errorf("response time histogram still has %d label set(s) after DeletePeerMetrics", n)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestDeletePeerMetrics_LeavesOtherPeersIntact verifies that pruning
|
||||||
|
// metrics for one peer does not affect a different peer's label set.
|
||||||
|
func TestDeletePeerMetrics_LeavesOtherPeersIntact(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
peerA [2]string // {hostIP, podIP}
|
||||||
|
peerB [2]string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
"IPv4",
|
||||||
|
[2]string{"10.0.0.1", "10.0.0.2"},
|
||||||
|
[2]string{"10.0.0.3", "10.0.0.4"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"IPv6",
|
||||||
|
[2]string{"2001:db8::1", "2001:db8:1::2"},
|
||||||
|
[2]string{"2001:db8::3", "2001:db8:2::4"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"MixedV4DeleteV6Survives",
|
||||||
|
[2]string{"10.0.0.1", "10.0.0.2"},
|
||||||
|
[2]string{"2001:db8::3", "2001:db8:2::4"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"MixedV6DeleteV4Survives",
|
||||||
|
[2]string{"2001:db8::1", "2001:db8:1::2"},
|
||||||
|
[2]string{"10.0.0.3", "10.0.0.4"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
origHostname := GoldpingerConfig.Hostname
|
||||||
|
GoldpingerConfig.Hostname = "test-instance"
|
||||||
|
defer func() { GoldpingerConfig.Hostname = origHostname }()
|
||||||
|
|
||||||
|
// Peer A — observe both call types
|
||||||
|
goldpingerResponseTimePeersHistogram.WithLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "ping", tt.peerA[0], tt.peerA[1],
|
||||||
|
).Observe(0.005)
|
||||||
|
goldpingerResponseTimePeersHistogram.WithLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "check", tt.peerA[0], tt.peerA[1],
|
||||||
|
).Observe(0.006)
|
||||||
|
SetPeerLossPct(tt.peerA[0], tt.peerA[1], 0)
|
||||||
|
|
||||||
|
// Peer B — observe both call types
|
||||||
|
goldpingerResponseTimePeersHistogram.WithLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "ping", tt.peerB[0], tt.peerB[1],
|
||||||
|
).Observe(0.010)
|
||||||
|
goldpingerResponseTimePeersHistogram.WithLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "check", tt.peerB[0], tt.peerB[1],
|
||||||
|
).Observe(0.011)
|
||||||
|
SetPeerLossPct(tt.peerB[0], tt.peerB[1], 1.5)
|
||||||
|
|
||||||
|
// Delete peer A only
|
||||||
|
DeletePeerMetrics(tt.peerA[0], tt.peerA[1])
|
||||||
|
DeletePeerUDPMetrics(tt.peerA[0], tt.peerA[1])
|
||||||
|
|
||||||
|
// Peer B's ping and check histogram entries should both survive
|
||||||
|
if n := countMetrics(goldpingerResponseTimePeersHistogram); n != 2 {
|
||||||
|
t.Errorf("response time histogram has %d label set(s), want 2 for peer B (ping+check)", n)
|
||||||
|
}
|
||||||
|
if countMetrics(goldpingerPeersLossPct) == 0 {
|
||||||
|
t.Error("loss pct gauge lost all label sets — peer B should still exist")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up peer B so it doesn't leak into other tests
|
||||||
|
goldpingerResponseTimePeersHistogram.DeleteLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "ping", tt.peerB[0], tt.peerB[1],
|
||||||
|
)
|
||||||
|
goldpingerResponseTimePeersHistogram.DeleteLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, "check", tt.peerB[0], tt.peerB[1],
|
||||||
|
)
|
||||||
|
goldpingerPeersLossPct.DeleteLabelValues(
|
||||||
|
GoldpingerConfig.Hostname, tt.peerB[0], tt.peerB[1],
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics verifies that
|
// TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics verifies that
|
||||||
// DeletePeerUDPMetrics removes label sets from every per-peer UDP metric.
|
// DeletePeerUDPMetrics removes label sets from every per-peer UDP metric.
|
||||||
// If a new per-peer metric is added but not cleaned up in
|
// If a new per-peer metric is added but not cleaned up in
|
||||||
// DeletePeerUDPMetrics, this test will fail.
|
// DeletePeerUDPMetrics, this test will fail.
|
||||||
func TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics(t *testing.T) {
|
func TestDeletePeerUDPMetrics_CleansAllPerPeerMetrics(t *testing.T) {
|
||||||
// Save and restore hostname since we set it for the test
|
tests := []struct {
|
||||||
origHostname := GoldpingerConfig.Hostname
|
name string
|
||||||
GoldpingerConfig.Hostname = "test-instance"
|
hostIP string
|
||||||
defer func() { GoldpingerConfig.Hostname = origHostname }()
|
podIP string
|
||||||
|
}{
|
||||||
hostIP := "10.0.0.1"
|
{"IPv4", "10.0.0.1", "10.0.0.2"},
|
||||||
podIP := "10.0.0.2"
|
{"IPv6", "2001:db8::1", "2001:db8:1::2"},
|
||||||
|
|
||||||
// Populate all per-peer UDP metrics so they have label values
|
|
||||||
SetPeerLossPct(hostIP, podIP, 5.0)
|
|
||||||
SetPeerHopCount(hostIP, podIP, 2)
|
|
||||||
ObservePeerUDPRtt(hostIP, podIP, 0.001)
|
|
||||||
CountUDPDuplicates(hostIP, podIP, 1)
|
|
||||||
CountUDPOutOfOrder(hostIP, podIP, 1)
|
|
||||||
|
|
||||||
// Verify they exist before cleanup
|
|
||||||
perPeerCollectors := map[string]prometheus.Collector{
|
|
||||||
"goldpinger_peers_loss_pct": goldpingerPeersLossPct,
|
|
||||||
"goldpinger_peers_hop_count": goldpingerPeersHopCount,
|
|
||||||
"goldpinger_peers_udp_rtt_s": goldpingerPeersUDPRtt,
|
|
||||||
"goldpinger_udp_duplicates_total": goldpingerUDPDuplicatesCounter,
|
|
||||||
"goldpinger_udp_out_of_order_total": goldpingerUDPOutOfOrderCounter,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for name, collector := range perPeerCollectors {
|
for _, tt := range tests {
|
||||||
if countMetrics(collector) == 0 {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
t.Fatalf("metric %s has no label values before cleanup — test setup is broken", name)
|
origHostname := GoldpingerConfig.Hostname
|
||||||
}
|
origUseHostIP := GoldpingerConfig.UseHostIP
|
||||||
}
|
GoldpingerConfig.Hostname = "test-instance"
|
||||||
|
GoldpingerConfig.UseHostIP = false
|
||||||
|
defer func() {
|
||||||
|
GoldpingerConfig.Hostname = origHostname
|
||||||
|
GoldpingerConfig.UseHostIP = origUseHostIP
|
||||||
|
}()
|
||||||
|
|
||||||
// Run cleanup
|
// Populate all per-peer UDP metrics so they have label values
|
||||||
DeletePeerUDPMetrics(hostIP, podIP)
|
SetPeerLossPct(tt.hostIP, tt.podIP, 5.0)
|
||||||
|
SetPeerHopCount(tt.hostIP, tt.podIP, 2)
|
||||||
|
ObservePeerUDPRtt(tt.hostIP, tt.podIP, 0.001)
|
||||||
|
CountUDPDuplicates(tt.hostIP, tt.podIP, 1)
|
||||||
|
CountUDPOutOfOrder(tt.hostIP, tt.podIP, 1)
|
||||||
|
CountUDPError(tt.podIP) // UseHostIP=false so target=podIP
|
||||||
|
|
||||||
// Verify all per-peer metrics are cleaned up
|
// Verify they exist before cleanup
|
||||||
for name, collector := range perPeerCollectors {
|
perPeerCollectors := map[string]prometheus.Collector{
|
||||||
if n := countMetrics(collector); n != 0 {
|
"goldpinger_peers_loss_pct": goldpingerPeersLossPct,
|
||||||
t.Errorf("metric %s still has %d label set(s) after DeletePeerUDPMetrics — add it to the cleanup function", name, n)
|
"goldpinger_peers_hop_count": goldpingerPeersHopCount,
|
||||||
}
|
"goldpinger_peers_udp_rtt_s": goldpingerPeersUDPRtt,
|
||||||
|
"goldpinger_udp_duplicates_total": goldpingerUDPDuplicatesCounter,
|
||||||
|
"goldpinger_udp_out_of_order_total": goldpingerUDPOutOfOrderCounter,
|
||||||
|
"goldpinger_udp_errors_total": goldpingerUDPErrorsCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, collector := range perPeerCollectors {
|
||||||
|
if countMetrics(collector) == 0 {
|
||||||
|
t.Fatalf("metric %s has no label values before cleanup — test setup is broken", name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run cleanup
|
||||||
|
DeletePeerUDPMetrics(tt.hostIP, tt.podIP)
|
||||||
|
|
||||||
|
// Verify all per-peer metrics are cleaned up
|
||||||
|
for name, collector := range perPeerCollectors {
|
||||||
|
if n := countMetrics(collector); n != 0 {
|
||||||
|
t.Errorf("metric %s still has %d label set(s) after DeletePeerUDPMetrics — add it to the cleanup function", name, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -137,7 +137,9 @@ func destroyPingers(pingers map[string]*Pinger, deletedPods map[string]*Goldping
|
|||||||
// Close the channel to stop pinging
|
// Close the channel to stop pinging
|
||||||
close(pinger.stopChan)
|
close(pinger.stopChan)
|
||||||
|
|
||||||
// Clean up stale UDP metric labels for this peer
|
// Clean up stale metric labels for this peer so defunct pod IPs
|
||||||
|
// don't linger in /metrics after rolling updates (see #167)
|
||||||
|
DeletePeerMetrics(pod.HostIP, pod.PodIP)
|
||||||
if GoldpingerConfig.UDPEnabled {
|
if GoldpingerConfig.UDPEnabled {
|
||||||
DeletePeerUDPMetrics(pod.HostIP, pod.PodIP)
|
DeletePeerUDPMetrics(pod.HostIP, pod.PodIP)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user