mirror of
https://github.com/prymitive/karma
synced 2026-05-05 03:16:51 +00:00
Re-implement metrics calculation as a collector
Split metrics code into a collector, this way it's self contained and doesn't require mixing metric calculation in the main logic. Fixes #130
This commit is contained in:
@@ -2,40 +2,120 @@ package alertmanager
|
||||
|
||||
import "github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
var (
|
||||
metricAlerts = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "unsee_collected_alerts",
|
||||
Help: "Total number of alerts collected from Alertmanager API",
|
||||
},
|
||||
[]string{"alertmanager", "state"},
|
||||
)
|
||||
metricAlertGroups = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "unsee_collected_groups",
|
||||
Help: "Total number of alert groups collected from Alertmanager API",
|
||||
},
|
||||
[]string{"alertmanager"},
|
||||
)
|
||||
metricAlertmanagerErrors = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "unsee_alertmanager_errors_total",
|
||||
Help: "Total number of errors encounter when requesting data from Alertmanager API",
|
||||
},
|
||||
[]string{"alertmanager", "endpoint"},
|
||||
)
|
||||
metricCollectRuns = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "unsee_collect_cycles_total",
|
||||
Help: "Total number of alert collection cycles run",
|
||||
},
|
||||
[]string{"alertmanager"},
|
||||
)
|
||||
)
|
||||
// unseeCollector implements the prometheus.Collector interface and exports
// metrics computed on each scrape from the data stored for every upstream
// Alertmanager instance (see Describe and Collect below).
type unseeCollector struct {
	collectedAlerts *prometheus.Desc // alert count per alertmanager/state/receiver
	collectedGroups *prometheus.Desc // alert group count per alertmanager/receiver
	cyclesTotal     *prometheus.Desc // collection cycles run per alertmanager
	errorsTotal     *prometheus.Desc // API errors per alertmanager/endpoint
}
|
||||
|
||||
func newUnseeCollector() *unseeCollector {
|
||||
return &unseeCollector{
|
||||
collectedAlerts: prometheus.NewDesc(
|
||||
"unsee_collected_alerts_count",
|
||||
"Total number of alerts collected from Alertmanager API",
|
||||
[]string{"alertmanager", "state", "receiver"},
|
||||
prometheus.Labels{},
|
||||
),
|
||||
collectedGroups: prometheus.NewDesc(
|
||||
"unsee_collected_groups_count",
|
||||
"Total number of alert groups collected from Alertmanager API",
|
||||
[]string{"alertmanager", "receiver"},
|
||||
prometheus.Labels{},
|
||||
),
|
||||
cyclesTotal: prometheus.NewDesc(
|
||||
"unsee_collect_cycles_total",
|
||||
"Total number of alert collection cycles run",
|
||||
[]string{"alertmanager"},
|
||||
prometheus.Labels{},
|
||||
),
|
||||
errorsTotal: prometheus.NewDesc(
|
||||
"unsee_alertmanager_errors_total",
|
||||
"Total number of errors encounter when requesting data from Alertmanager API",
|
||||
[]string{"alertmanager", "endpoint"},
|
||||
prometheus.Labels{},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *unseeCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- c.collectedAlerts
|
||||
ch <- c.collectedGroups
|
||||
ch <- c.cyclesTotal
|
||||
ch <- c.errorsTotal
|
||||
}
|
||||
|
||||
func (c *unseeCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
upstreams := GetAlertmanagers()
|
||||
|
||||
for _, am := range upstreams {
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.cyclesTotal,
|
||||
prometheus.CounterValue,
|
||||
am.metrics.cycles,
|
||||
am.Name,
|
||||
)
|
||||
for key, val := range am.metrics.errors {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.errorsTotal,
|
||||
prometheus.CounterValue,
|
||||
val,
|
||||
am.Name,
|
||||
key,
|
||||
)
|
||||
}
|
||||
|
||||
// receiver name -> count
|
||||
groupsByReceiver := map[string]float64{}
|
||||
// receiver name -> state -> count
|
||||
alertsByReceiverByState := map[string]map[string]float64{}
|
||||
|
||||
// iterate all alert groups this instance stores
|
||||
for _, group := range am.Alerts() {
|
||||
// count all groups per receiver
|
||||
if _, found := groupsByReceiver[group.Receiver]; !found {
|
||||
groupsByReceiver[group.Receiver] = 0
|
||||
}
|
||||
groupsByReceiver[group.Receiver]++
|
||||
|
||||
// count all alerts per receiver & state
|
||||
for _, alert := range group.Alerts {
|
||||
if _, found := alertsByReceiverByState[alert.Receiver]; !found {
|
||||
alertsByReceiverByState[alert.Receiver] = map[string]float64{}
|
||||
}
|
||||
if _, found := alertsByReceiverByState[alert.Receiver][alert.State]; !found {
|
||||
alertsByReceiverByState[alert.Receiver][alert.State] = 0
|
||||
}
|
||||
alertsByReceiverByState[alert.Receiver][alert.State]++
|
||||
}
|
||||
}
|
||||
|
||||
// publish metrics using calculated values
|
||||
for reciver, count := range groupsByReceiver {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.collectedGroups,
|
||||
prometheus.GaugeValue,
|
||||
count,
|
||||
am.Name,
|
||||
reciver,
|
||||
)
|
||||
}
|
||||
for reciver, byState := range alertsByReceiverByState {
|
||||
for state, count := range byState {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.collectedAlerts,
|
||||
prometheus.GaugeValue,
|
||||
count,
|
||||
am.Name,
|
||||
reciver,
|
||||
state,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(metricAlerts)
|
||||
prometheus.MustRegister(metricAlertGroups)
|
||||
prometheus.MustRegister(metricAlertmanagerErrors)
|
||||
prometheus.MustRegister(metricCollectRuns)
|
||||
prometheus.MustRegister(newUnseeCollector())
|
||||
}
|
||||
|
||||
@@ -11,11 +11,20 @@ import (
|
||||
"github.com/cloudflare/unsee/models"
|
||||
"github.com/cloudflare/unsee/transform"
|
||||
"github.com/cloudflare/unsee/transport"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Values used for the "endpoint" label when tracking per-endpoint
// Alertmanager API errors.
const (
	labelValueErrorsAlerts = "alerts"
	labelValueErrorsSilences = "silences"
)
|
||||
|
||||
// alertmanagerMetrics holds per-instance counters that are read by the
// metrics collector on every scrape.
type alertmanagerMetrics struct {
	// cycles is incremented at the start of each Pull() run.
	cycles float64
	// errors counts API errors keyed by endpoint label value
	// (labelValueErrorsAlerts / labelValueErrorsSilences).
	errors map[string]float64
}
|
||||
|
||||
// Alertmanager represents Alertmanager upstream instance
|
||||
type Alertmanager struct {
|
||||
URI string `json:"uri"`
|
||||
@@ -29,6 +38,8 @@ type Alertmanager struct {
|
||||
colors models.LabelsColorMap
|
||||
autocomplete []models.Autocomplete
|
||||
lastError string
|
||||
// metrics tracked per alertmanager instance
|
||||
metrics alertmanagerMetrics
|
||||
}
|
||||
|
||||
func (am *Alertmanager) detectVersion() string {
|
||||
@@ -68,22 +79,6 @@ func (am *Alertmanager) clearData() {
|
||||
am.colors = models.LabelsColorMap{}
|
||||
am.autocomplete = []models.Autocomplete{}
|
||||
am.lock.Unlock()
|
||||
// reset metrics to 0 since we don't store anything anymore
|
||||
am.resetMetrics()
|
||||
}
|
||||
|
||||
func (am *Alertmanager) resetMetrics() {
|
||||
// reset alert state/instance counters
|
||||
for _, state := range models.AlertStateList {
|
||||
metricAlerts.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
"state": state,
|
||||
}).Set(0)
|
||||
}
|
||||
// reset alert group counters
|
||||
metricAlertGroups.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
}).Set(0)
|
||||
}
|
||||
|
||||
func (am *Alertmanager) pullSilences(version string) error {
|
||||
@@ -154,12 +149,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
colors := models.LabelsColorMap{}
|
||||
autocompleteMap := map[string]models.Autocomplete{}
|
||||
|
||||
// we'll use this to update alert counter metrics (per state/instance)
|
||||
alertMetrics := map[string]float64{}
|
||||
for _, state := range models.AlertStateList {
|
||||
alertMetrics[state] = 0
|
||||
}
|
||||
|
||||
log.Infof("[%s] Processing unique alert groups (%d)", am.Name, len(uniqueGroups))
|
||||
for _, ag := range uniqueGroups {
|
||||
alerts := models.AlertList{}
|
||||
@@ -194,8 +183,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
|
||||
alert.UpdateFingerprints()
|
||||
alerts = append(alerts, alert)
|
||||
|
||||
alertMetrics[alert.State]++
|
||||
}
|
||||
|
||||
for _, hint := range transform.BuildAutocomplete(alerts) {
|
||||
@@ -211,14 +198,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
dedupedGroups = append(dedupedGroups, ag)
|
||||
}
|
||||
|
||||
// update internal metrics with new computed values
|
||||
for state, val := range alertMetrics {
|
||||
metricAlerts.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
"state": state,
|
||||
}).Set(val)
|
||||
}
|
||||
|
||||
log.Infof("[%s] Merging autocomplete data (%d)", am.Name, len(autocompleteMap))
|
||||
autocomplete := []models.Autocomplete{}
|
||||
for _, hint := range autocompleteMap {
|
||||
@@ -231,25 +210,20 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
am.autocomplete = autocomplete
|
||||
am.lock.Unlock()
|
||||
|
||||
metricAlertGroups.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
}).Set(float64(len(dedupedGroups)))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Pull data from upstream Alertmanager instance
|
||||
func (am *Alertmanager) Pull() error {
|
||||
am.metrics.cycles++
|
||||
|
||||
version := am.detectVersion()
|
||||
|
||||
err := am.pullSilences(version)
|
||||
if err != nil {
|
||||
am.clearData()
|
||||
am.setError(err.Error())
|
||||
metricAlertmanagerErrors.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
"endpoint": "silences",
|
||||
}).Inc()
|
||||
am.metrics.errors[labelValueErrorsSilences]++
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -257,17 +231,10 @@ func (am *Alertmanager) Pull() error {
|
||||
if err != nil {
|
||||
am.clearData()
|
||||
am.setError(err.Error())
|
||||
metricAlertmanagerErrors.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
"endpoint": "alerts",
|
||||
}).Inc()
|
||||
am.metrics.errors[labelValueErrorsAlerts]++
|
||||
return err
|
||||
}
|
||||
|
||||
metricCollectRuns.With(prometheus.Labels{
|
||||
"alertmanager": am.Name,
|
||||
}).Inc()
|
||||
|
||||
am.lastError = ""
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/cloudflare/unsee/models"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
@@ -27,16 +26,6 @@ func NewAlertmanager(name, uri string, timeout time.Duration) error {
|
||||
}
|
||||
}
|
||||
|
||||
// initialize metrics
|
||||
metricAlertmanagerErrors.With(prometheus.Labels{
|
||||
"alertmanager": name,
|
||||
"endpoint": "alerts",
|
||||
}).Set(0)
|
||||
metricAlertmanagerErrors.With(prometheus.Labels{
|
||||
"alertmanager": name,
|
||||
"endpoint": "silences",
|
||||
}).Set(0)
|
||||
|
||||
upstreams[name] = &Alertmanager{
|
||||
URI: uri,
|
||||
Timeout: timeout,
|
||||
@@ -46,6 +35,12 @@ func NewAlertmanager(name, uri string, timeout time.Duration) error {
|
||||
silences: map[string]models.Silence{},
|
||||
colors: models.LabelsColorMap{},
|
||||
autocomplete: []models.Autocomplete{},
|
||||
metrics: alertmanagerMetrics{
|
||||
errors: map[string]float64{
|
||||
labelValueErrorsAlerts: 0,
|
||||
labelValueErrorsSilences: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
log.Infof("[%s] Configured Alertmanager source at %s", name, uri)
|
||||
|
||||
Reference in New Issue
Block a user