From 9b5155e68cf3b9352732869c776fc5b64a1d6d92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Mierzwa?=
Date: Sun, 9 Jul 2017 08:28:51 -0700
Subject: [PATCH] Re-implement metrics calculation as a collector

Split the metrics code into a dedicated collector; this way it is
self-contained and metric calculation is no longer mixed into the
main logic.

Fixes #130
---
 alertmanager/metrics.go  | 148 ++++++++++++++++++++++++++++++---------
 alertmanager/models.go   |  65 +++++------------
 alertmanager/upstream.go |  17 ++---
 3 files changed, 136 insertions(+), 94 deletions(-)

diff --git a/alertmanager/metrics.go b/alertmanager/metrics.go
index f014888ee..c180e01fb 100644
--- a/alertmanager/metrics.go
+++ b/alertmanager/metrics.go
@@ -2,40 +2,120 @@ package alertmanager
 
 import "github.com/prometheus/client_golang/prometheus"
 
-var (
-	metricAlerts = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "unsee_collected_alerts",
-			Help: "Total number of alerts collected from Alertmanager API",
-		},
-		[]string{"alertmanager", "state"},
-	)
-	metricAlertGroups = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "unsee_collected_groups",
-			Help: "Total number of alert groups collected from Alertmanager API",
-		},
-		[]string{"alertmanager"},
-	)
-	metricAlertmanagerErrors = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "unsee_alertmanager_errors_total",
-			Help: "Total number of errors encounter when requesting data from Alertmanager API",
-		},
-		[]string{"alertmanager", "endpoint"},
-	)
-	metricCollectRuns = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "unsee_collect_cycles_total",
-			Help: "Total number of alert collection cycles run",
-		},
-		[]string{"alertmanager"},
-	)
-)
+type unseeCollector struct {
+	collectedAlerts *prometheus.Desc
+	collectedGroups *prometheus.Desc
+	cyclesTotal     *prometheus.Desc
+	errorsTotal     *prometheus.Desc
+}
+
+func newUnseeCollector() *unseeCollector {
+	return &unseeCollector{
+		collectedAlerts: prometheus.NewDesc(
+			"unsee_collected_alerts_count",
+			"Total number of alerts collected from Alertmanager API",
+			[]string{"alertmanager", "state", "receiver"},
+			prometheus.Labels{},
+		),
+		collectedGroups: prometheus.NewDesc(
+			"unsee_collected_groups_count",
+			"Total number of alert groups collected from Alertmanager API",
+			[]string{"alertmanager", "receiver"},
+			prometheus.Labels{},
+		),
+		cyclesTotal: prometheus.NewDesc(
+			"unsee_collect_cycles_total",
+			"Total number of alert collection cycles run",
+			[]string{"alertmanager"},
+			prometheus.Labels{},
+		),
+		errorsTotal: prometheus.NewDesc(
+			"unsee_alertmanager_errors_total",
+			"Total number of errors encountered when requesting data from Alertmanager API",
+			[]string{"alertmanager", "endpoint"},
+			prometheus.Labels{},
+		),
+	}
+}
+
+func (c *unseeCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.collectedAlerts
+	ch <- c.collectedGroups
+	ch <- c.cyclesTotal
+	ch <- c.errorsTotal
+}
+
+func (c *unseeCollector) Collect(ch chan<- prometheus.Metric) {
+	upstreams := GetAlertmanagers()
+
+	for _, am := range upstreams {
+
+		ch <- prometheus.MustNewConstMetric(
+			c.cyclesTotal,
+			prometheus.CounterValue,
+			am.metrics.cycles,
+			am.Name,
+		)
+		for key, val := range am.metrics.errors {
+			ch <- prometheus.MustNewConstMetric(
+				c.errorsTotal,
+				prometheus.CounterValue,
+				val,
+				am.Name,
+				key,
+			)
+		}
+
+		// receiver name -> count
+		groupsByReceiver := map[string]float64{}
+		// receiver name -> state -> count
+		alertsByReceiverByState := map[string]map[string]float64{}
+
+		// iterate all alert groups this instance stores
+		for _, group := range am.Alerts() {
+			// count all groups per receiver
+			if _, found := groupsByReceiver[group.Receiver]; !found {
+				groupsByReceiver[group.Receiver] = 0
+			}
+			groupsByReceiver[group.Receiver]++
+
+			// count all alerts per receiver & state
+			for _, alert := range group.Alerts {
+				if _, found := alertsByReceiverByState[alert.Receiver]; !found {
+					alertsByReceiverByState[alert.Receiver] = map[string]float64{}
+				}
+				if _, found := alertsByReceiverByState[alert.Receiver][alert.State]; !found {
+					alertsByReceiverByState[alert.Receiver][alert.State] = 0
+				}
+				alertsByReceiverByState[alert.Receiver][alert.State]++
+			}
+		}
+
+		// publish metrics using calculated values
+		for receiver, count := range groupsByReceiver {
+			ch <- prometheus.MustNewConstMetric(
+				c.collectedGroups,
+				prometheus.GaugeValue,
+				count,
+				am.Name,
+				receiver,
+			)
+		}
+		for receiver, byState := range alertsByReceiverByState {
+			for state, count := range byState {
+				ch <- prometheus.MustNewConstMetric(
+					c.collectedAlerts,
+					prometheus.GaugeValue,
+					count,
+					am.Name,
+					receiver,
+					state,
+				)
+			}
+		}
+	}
+}
 
 func init() {
-	prometheus.MustRegister(metricAlerts)
-	prometheus.MustRegister(metricAlertGroups)
-	prometheus.MustRegister(metricAlertmanagerErrors)
-	prometheus.MustRegister(metricCollectRuns)
+	prometheus.MustRegister(newUnseeCollector())
 }
diff --git a/alertmanager/models.go b/alertmanager/models.go
index 6ccae8364..546c639bf 100644
--- a/alertmanager/models.go
+++ b/alertmanager/models.go
@@ -11,11 +11,20 @@ import (
 	"github.com/cloudflare/unsee/models"
 	"github.com/cloudflare/unsee/transform"
 	"github.com/cloudflare/unsee/transport"
-	"github.com/prometheus/client_golang/prometheus"
 	log "github.com/sirupsen/logrus"
 )
 
+const (
+	labelValueErrorsAlerts   = "alerts"
+	labelValueErrorsSilences = "silences"
+)
+
+type alertmanagerMetrics struct {
+	cycles float64
+	errors map[string]float64
+}
+
 // Alertmanager represents Alertmanager upstream instance
 type Alertmanager struct {
 	URI string `json:"uri"`
@@ -29,6 +38,8 @@ type Alertmanager struct {
 	colors       models.LabelsColorMap
 	autocomplete []models.Autocomplete
 	lastError    string
+	// metrics tracked per alertmanager instance
+	metrics alertmanagerMetrics
 }
 
 func (am *Alertmanager) detectVersion() string {
@@ -68,22 +79,6 @@ func (am *Alertmanager) clearData() {
 	am.colors = models.LabelsColorMap{}
 	am.autocomplete = []models.Autocomplete{}
 	am.lock.Unlock()
-	// reset metrics to 0 since we don't store anything anymore
-	am.resetMetrics()
-}
-
-func (am *Alertmanager) resetMetrics() {
-	// reset alert state/instance counters
-	for _, state := range models.AlertStateList {
-		metricAlerts.With(prometheus.Labels{
-			"alertmanager": am.Name,
-			"state":        state,
-		}).Set(0)
-	}
-	// reset alert group counters
-	metricAlertGroups.With(prometheus.Labels{
-		"alertmanager": am.Name,
-	}).Set(0)
 }
 
 func (am *Alertmanager) pullSilences(version string) error {
@@ -154,12 +149,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
 	colors := models.LabelsColorMap{}
 	autocompleteMap := map[string]models.Autocomplete{}
 
-	// we'll use this to update alert counter metrics (per state/instance)
-	alertMetrics := map[string]float64{}
-	for _, state := range models.AlertStateList {
-		alertMetrics[state] = 0
-	}
-
 	log.Infof("[%s] Processing unique alert groups (%d)", am.Name, len(uniqueGroups))
 	for _, ag := range uniqueGroups {
 		alerts := models.AlertList{}
@@ -194,8 +183,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
 			alert.UpdateFingerprints()
 			alerts = append(alerts, alert)
-
-			alertMetrics[alert.State]++
 		}
 
 		for _, hint := range transform.BuildAutocomplete(alerts) {
@@ -211,14 +198,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
 		dedupedGroups = append(dedupedGroups, ag)
 	}
 
-	// update internal metrics with new computed values
-	for state, val := range alertMetrics {
-		metricAlerts.With(prometheus.Labels{
-			"alertmanager": am.Name,
-			"state":        state,
-		}).Set(val)
-	}
-
 	log.Infof("[%s] Merging autocomplete data (%d)", am.Name, len(autocompleteMap))
 	autocomplete := []models.Autocomplete{}
 	for _, hint := range autocompleteMap {
@@ -231,25 +210,20 @@ func (am *Alertmanager) pullAlerts(version string) error {
 	am.autocomplete = autocomplete
 	am.lock.Unlock()
 
-	metricAlertGroups.With(prometheus.Labels{
-		"alertmanager": am.Name,
-	}).Set(float64(len(dedupedGroups)))
-
 	return nil
 }
 
 // Pull data from upstream Alertmanager instance
 func (am *Alertmanager) Pull() error {
+	am.metrics.cycles++
+
 	version := am.detectVersion()
 
 	err := am.pullSilences(version)
 	if err != nil {
 		am.clearData()
 		am.setError(err.Error())
-		metricAlertmanagerErrors.With(prometheus.Labels{
-			"alertmanager": am.Name,
-			"endpoint":     "silences",
-		}).Inc()
+		am.metrics.errors[labelValueErrorsSilences]++
 		return err
 	}
 
@@ -257,17 +231,10 @@ func (am *Alertmanager) Pull() error {
 	err = am.pullAlerts(version)
 	if err != nil {
 		am.clearData()
 		am.setError(err.Error())
-		metricAlertmanagerErrors.With(prometheus.Labels{
-			"alertmanager": am.Name,
-			"endpoint":     "alerts",
-		}).Inc()
+		am.metrics.errors[labelValueErrorsAlerts]++
 		return err
 	}
-	metricCollectRuns.With(prometheus.Labels{
-		"alertmanager": am.Name,
-	}).Inc()
-
 	am.lastError = ""
 	return nil
 }
diff --git a/alertmanager/upstream.go b/alertmanager/upstream.go
index 11f90108c..762fd5457 100644
--- a/alertmanager/upstream.go
+++ b/alertmanager/upstream.go
@@ -6,7 +6,6 @@ import (
 	"time"
 
 	"github.com/cloudflare/unsee/models"
-	"github.com/prometheus/client_golang/prometheus"
 	log "github.com/sirupsen/logrus"
 )
 
@@ -27,16 +26,6 @@ func NewAlertmanager(name, uri string, timeout time.Duration) error {
 		}
 	}
 
-	// initialize metrics
-	metricAlertmanagerErrors.With(prometheus.Labels{
-		"alertmanager": name,
-		"endpoint":     "alerts",
-	}).Set(0)
-	metricAlertmanagerErrors.With(prometheus.Labels{
-		"alertmanager": name,
-		"endpoint":     "silences",
-	}).Set(0)
-
 	upstreams[name] = &Alertmanager{
 		URI:     uri,
 		Timeout: timeout,
@@ -46,6 +35,12 @@ func NewAlertmanager(name, uri string, timeout time.Duration) error {
 		silences:     map[string]models.Silence{},
 		colors:       models.LabelsColorMap{},
 		autocomplete: []models.Autocomplete{},
+		metrics: alertmanagerMetrics{
+			errors: map[string]float64{
+				labelValueErrorsAlerts:   0,
+				labelValueErrorsSilences: 0,
+			},
+		},
 	}
 
 	log.Infof("[%s] Configured Alertmanager source at %s", name, uri)
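
As a side reference (not part of the patch itself): the collector pattern adopted above can be exercised in isolation. The sketch below is illustrative only, with hypothetical names (exampleCollector, example_items_count); it shows how a type implementing Describe and Collect emits const metrics computed at scrape time, which is what lets this patch drop resetMetrics() and all the Set()/Inc() bookkeeping.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// exampleCollector mirrors the shape of unseeCollector: descriptors
// are created once, values are materialized fresh on every scrape.
type exampleCollector struct {
	itemsDesc *prometheus.Desc
}

// Describe advertises every Desc this collector can emit.
func (c *exampleCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.itemsDesc
}

// Collect computes current values at scrape time; unsee consults
// GetAlertmanagers() here, this sketch uses a fixed value instead.
func (c *exampleCollector) Collect(ch chan<- prometheus.Metric) {
	ch <- prometheus.MustNewConstMetric(
		c.itemsDesc, prometheus.GaugeValue, 42, "example",
	)
}

func main() {
	c := &exampleCollector{
		itemsDesc: prometheus.NewDesc(
			"example_items_count",
			"Number of items currently stored",
			[]string{"source"}, nil,
		),
	}
	reg := prometheus.NewRegistry()
	reg.MustRegister(c)

	// Gather runs Collect and returns the resulting metric families.
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Printf("%s = %v\n", mf.GetName(), mf.GetMetric()[0].GetGauge().GetValue())
	}
}

Because values only exist for the duration of a scrape, a collector that no longer sees an upstream simply stops emitting its series; no state has to be reset to zero, which is why clearData() no longer needs a resetMetrics() call.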