Files
Reloader/internal/pkg/metrics/prometheus.go
TheiLLeniumStudios 1945a740d0 chore: Format files#
2026-01-09 01:22:24 +01:00

400 lines
12 KiB
Go

package metrics
import (
"context"
"net/http"
"net/url"
"os"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/client-go/tools/metrics"
)
// clientGoRequestMetrics implements metrics.LatencyMetric and metrics.ResultMetric
// to expose client-go's rest_client_requests_total metric
type clientGoRequestMetrics struct {
requestCounter *prometheus.CounterVec
requestLatency *prometheus.HistogramVec
}
func (m *clientGoRequestMetrics) Increment(ctx context.Context, code string, method string, host string) {
m.requestCounter.WithLabelValues(code, method, host).Inc()
}
func (m *clientGoRequestMetrics) Observe(ctx context.Context, verb string, u url.URL, latency time.Duration) {
m.requestLatency.WithLabelValues(verb, u.Host).Observe(latency.Seconds())
}
var clientGoMetrics = &clientGoRequestMetrics{
requestCounter: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "rest_client_requests_total",
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
},
[]string{"code", "method", "host"},
),
requestLatency: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_request_duration_seconds",
Help: "Request latency in seconds. Broken down by verb and host.",
Buckets: []float64{0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30},
},
[]string{"verb", "host"},
),
}
func init() {
// Register the metrics collectors
prometheus.MustRegister(clientGoMetrics.requestCounter)
prometheus.MustRegister(clientGoMetrics.requestLatency)
// Register our metrics implementation with client-go
metrics.RequestResult = clientGoMetrics
metrics.RequestLatency = clientGoMetrics
}
// Collectors holds all Prometheus metrics collectors for Reloader.
type Collectors struct {
Reloaded *prometheus.CounterVec
ReloadedByNamespace *prometheus.CounterVec
countByNamespace bool
ReconcileTotal *prometheus.CounterVec // Total reconcile calls by result
ReconcileDuration *prometheus.HistogramVec // Time spent in reconcile/handler
ActionTotal *prometheus.CounterVec // Total actions by workload kind and result
ActionLatency *prometheus.HistogramVec // Time from event to action applied
SkippedTotal *prometheus.CounterVec // Skipped operations by reason
QueueDepth prometheus.Gauge // Current queue depth
QueueAdds prometheus.Counter // Total items added to queue
QueueLatency *prometheus.HistogramVec // Time spent in queue
ErrorsTotal *prometheus.CounterVec // Errors by type
RetriesTotal prometheus.Counter // Total retries
EventsReceived *prometheus.CounterVec // Events received by type (add/update/delete)
EventsProcessed *prometheus.CounterVec // Events processed by type and result
WorkloadsScanned *prometheus.CounterVec // Workloads scanned by kind
WorkloadsMatched *prometheus.CounterVec // Workloads matched for reload by kind
}
// RecordReload records a reload event with the given success status and namespace.
// Preserved for backward compatibility.
func (c *Collectors) RecordReload(success bool, namespace string) {
if c == nil {
return
}
successLabel := "false"
if success {
successLabel = "true"
}
c.Reloaded.With(prometheus.Labels{"success": successLabel}).Inc()
if c.countByNamespace {
c.ReloadedByNamespace.With(prometheus.Labels{
"success": successLabel,
"namespace": namespace,
}).Inc()
}
}
// RecordReconcile records a reconcile/handler invocation.
func (c *Collectors) RecordReconcile(result string, duration time.Duration) {
if c == nil {
return
}
c.ReconcileTotal.With(prometheus.Labels{"result": result}).Inc()
c.ReconcileDuration.With(prometheus.Labels{"result": result}).Observe(duration.Seconds())
}
// RecordAction records a reload action on a workload.
func (c *Collectors) RecordAction(workloadKind string, result string, latency time.Duration) {
if c == nil {
return
}
c.ActionTotal.With(prometheus.Labels{"workload_kind": workloadKind, "result": result}).Inc()
c.ActionLatency.With(prometheus.Labels{"workload_kind": workloadKind}).Observe(latency.Seconds())
}
// RecordSkipped records a skipped operation with reason.
func (c *Collectors) RecordSkipped(reason string) {
if c == nil {
return
}
c.SkippedTotal.With(prometheus.Labels{"reason": reason}).Inc()
}
// RecordQueueAdd records an item being added to the queue.
func (c *Collectors) RecordQueueAdd() {
if c == nil {
return
}
c.QueueAdds.Inc()
}
// SetQueueDepth sets the current queue depth.
func (c *Collectors) SetQueueDepth(depth int) {
if c == nil {
return
}
c.QueueDepth.Set(float64(depth))
}
// RecordQueueLatency records how long an item spent in the queue.
func (c *Collectors) RecordQueueLatency(latency time.Duration) {
if c == nil {
return
}
c.QueueLatency.With(prometheus.Labels{}).Observe(latency.Seconds())
}
// RecordError records an error by type.
func (c *Collectors) RecordError(errorType string) {
if c == nil {
return
}
c.ErrorsTotal.With(prometheus.Labels{"type": errorType}).Inc()
}
// RecordRetry records a retry attempt.
func (c *Collectors) RecordRetry() {
if c == nil {
return
}
c.RetriesTotal.Inc()
}
// RecordEventReceived records an event being received.
func (c *Collectors) RecordEventReceived(eventType string, resourceType string) {
if c == nil {
return
}
c.EventsReceived.With(prometheus.Labels{"event_type": eventType, "resource_type": resourceType}).Inc()
}
// RecordEventProcessed records an event being processed.
func (c *Collectors) RecordEventProcessed(eventType string, resourceType string, result string) {
if c == nil {
return
}
c.EventsProcessed.With(prometheus.Labels{"event_type": eventType, "resource_type": resourceType, "result": result}).Inc()
}
// RecordWorkloadsScanned records workloads scanned during a reconcile.
func (c *Collectors) RecordWorkloadsScanned(kind string, count int) {
if c == nil {
return
}
c.WorkloadsScanned.With(prometheus.Labels{"kind": kind}).Add(float64(count))
}
// RecordWorkloadsMatched records workloads matched for reload.
func (c *Collectors) RecordWorkloadsMatched(kind string, count int) {
if c == nil {
return
}
c.WorkloadsMatched.With(prometheus.Labels{"kind": kind}).Add(float64(count))
}
func NewCollectors() Collectors {
// Existing metrics (preserved)
reloaded := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "reload_executed_total",
Help: "Counter of reloads executed by Reloader.",
},
[]string{"success"},
)
reloaded.With(prometheus.Labels{"success": "true"}).Add(0)
reloaded.With(prometheus.Labels{"success": "false"}).Add(0)
reloadedByNamespace := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "reload_executed_total_by_namespace",
Help: "Counter of reloads executed by Reloader by namespace.",
},
[]string{"success", "namespace"},
)
// === NEW: Comprehensive metrics ===
reconcileTotal := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "reconcile_total",
Help: "Total number of reconcile/handler invocations by result.",
},
[]string{"result"},
)
reconcileDuration := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "reloader",
Name: "reconcile_duration_seconds",
Help: "Time spent in reconcile/handler in seconds.",
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
},
[]string{"result"},
)
actionTotal := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "action_total",
Help: "Total number of reload actions by workload kind and result.",
},
[]string{"workload_kind", "result"},
)
actionLatency := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "reloader",
Name: "action_latency_seconds",
Help: "Time from event received to action applied in seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
},
[]string{"workload_kind"},
)
skippedTotal := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "skipped_total",
Help: "Total number of skipped operations by reason.",
},
[]string{"reason"},
)
queueDepth := prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "reloader",
Name: "workqueue_depth",
Help: "Current depth of the work queue.",
},
)
queueAdds := prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "workqueue_adds_total",
Help: "Total number of items added to the work queue.",
},
)
queueLatency := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "reloader",
Name: "workqueue_latency_seconds",
Help: "Time spent in the work queue in seconds.",
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
},
[]string{},
)
errorsTotal := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "errors_total",
Help: "Total number of errors by type.",
},
[]string{"type"},
)
retriesTotal := prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "retries_total",
Help: "Total number of retry attempts.",
},
)
eventsReceived := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "events_received_total",
Help: "Total number of events received by type and resource.",
},
[]string{"event_type", "resource_type"},
)
eventsProcessed := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "events_processed_total",
Help: "Total number of events processed by type, resource, and result.",
},
[]string{"event_type", "resource_type", "result"},
)
workloadsScanned := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "workloads_scanned_total",
Help: "Total number of workloads scanned by kind.",
},
[]string{"kind"},
)
workloadsMatched := prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "reloader",
Name: "workloads_matched_total",
Help: "Total number of workloads matched for reload by kind.",
},
[]string{"kind"},
)
return Collectors{
Reloaded: reloaded,
ReloadedByNamespace: reloadedByNamespace,
countByNamespace: os.Getenv("METRICS_COUNT_BY_NAMESPACE") == "enabled",
ReconcileTotal: reconcileTotal,
ReconcileDuration: reconcileDuration,
ActionTotal: actionTotal,
ActionLatency: actionLatency,
SkippedTotal: skippedTotal,
QueueDepth: queueDepth,
QueueAdds: queueAdds,
QueueLatency: queueLatency,
ErrorsTotal: errorsTotal,
RetriesTotal: retriesTotal,
EventsReceived: eventsReceived,
EventsProcessed: eventsProcessed,
WorkloadsScanned: workloadsScanned,
WorkloadsMatched: workloadsMatched,
}
}
func SetupPrometheusEndpoint() Collectors {
collectors := NewCollectors()
// Register all metrics
prometheus.MustRegister(collectors.Reloaded)
prometheus.MustRegister(collectors.ReconcileTotal)
prometheus.MustRegister(collectors.ReconcileDuration)
prometheus.MustRegister(collectors.ActionTotal)
prometheus.MustRegister(collectors.ActionLatency)
prometheus.MustRegister(collectors.SkippedTotal)
prometheus.MustRegister(collectors.QueueDepth)
prometheus.MustRegister(collectors.QueueAdds)
prometheus.MustRegister(collectors.QueueLatency)
prometheus.MustRegister(collectors.ErrorsTotal)
prometheus.MustRegister(collectors.RetriesTotal)
prometheus.MustRegister(collectors.EventsReceived)
prometheus.MustRegister(collectors.EventsProcessed)
prometheus.MustRegister(collectors.WorkloadsScanned)
prometheus.MustRegister(collectors.WorkloadsMatched)
if os.Getenv("METRICS_COUNT_BY_NAMESPACE") == "enabled" {
prometheus.MustRegister(collectors.ReloadedByNamespace)
}
http.Handle("/metrics", promhttp.Handler())
return collectors
}