Re-implement metrics calculation as a collector

Split the metrics code into a collector so that it is self-contained and metric calculation no longer has to be mixed into the main logic.
Fixes #130
Łukasz Mierzwa
2017-07-09 08:28:51 -07:00
parent 6c74a7b7e9
commit 9b5155e68c
3 changed files with 136 additions and 94 deletions
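For context, the new code follows the custom collector pattern from the Prometheus Go client library: instead of mutating pre-registered GaugeVec instances inline wherever the application changes state, a type implements the prometheus.Collector interface (Describe and Collect) and computes metric values on demand, emitting them as constant metrics each time the registry is scraped. A minimal, self-contained sketch of the pattern (the type, metric, and label names here are illustrative, not part of this commit):

package main

import "github.com/prometheus/client_golang/prometheus"

// exampleCollector computes its metric values at scrape time instead
// of requiring application code to keep gauges up to date.
type exampleCollector struct {
	itemsStored *prometheus.Desc
}

// Describe sends every metric descriptor to the channel.
func (c *exampleCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.itemsStored
}

// Collect is called on every scrape; it reads current state and emits
// one constant metric per label combination.
func (c *exampleCollector) Collect(ch chan<- prometheus.Metric) {
	ch <- prometheus.MustNewConstMetric(
		c.itemsStored, prometheus.GaugeValue, 42, "some-source",
	)
}

func main() {
	c := &exampleCollector{
		itemsStored: prometheus.NewDesc(
			"example_items_stored",
			"Number of items currently stored",
			[]string{"source"},
			nil,
		),
	}
	prometheus.MustRegister(c)
}

The diff below applies exactly this shape: unseeCollector owns the Desc objects, and Collect derives counts from each Alertmanager's stored alert groups on demand.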

View File

@@ -2,40 +2,120 @@ package alertmanager
import "github.com/prometheus/client_golang/prometheus"
var (
	metricAlerts = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "unsee_collected_alerts",
			Help: "Total number of alerts collected from Alertmanager API",
		},
		[]string{"alertmanager", "state"},
	)
	metricAlertGroups = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "unsee_collected_groups",
			Help: "Total number of alert groups collected from Alertmanager API",
		},
		[]string{"alertmanager"},
	)
	metricAlertmanagerErrors = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "unsee_alertmanager_errors_total",
			Help: "Total number of errors encountered when requesting data from Alertmanager API",
		},
		[]string{"alertmanager", "endpoint"},
	)
	metricCollectRuns = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "unsee_collect_cycles_total",
			Help: "Total number of alert collection cycles run",
		},
		[]string{"alertmanager"},
	)
)
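// unseeCollector implements the prometheus.Collector interface; metric
// values are computed on demand during each scrape instead of being
// updated inline by the polling code.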
type unseeCollector struct {
	collectedAlerts *prometheus.Desc
	collectedGroups *prometheus.Desc
	cyclesTotal     *prometheus.Desc
	errorsTotal     *prometheus.Desc
}
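// newUnseeCollector creates a collector with descriptors for every
// metric it exposes.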
func newUnseeCollector() *unseeCollector {
	return &unseeCollector{
		collectedAlerts: prometheus.NewDesc(
			"unsee_collected_alerts_count",
			"Total number of alerts collected from Alertmanager API",
			[]string{"alertmanager", "state", "receiver"},
			prometheus.Labels{},
		),
		collectedGroups: prometheus.NewDesc(
			"unsee_collected_groups_count",
			"Total number of alert groups collected from Alertmanager API",
			[]string{"alertmanager", "receiver"},
			prometheus.Labels{},
		),
		cyclesTotal: prometheus.NewDesc(
			"unsee_collect_cycles_total",
			"Total number of alert collection cycles run",
			[]string{"alertmanager"},
			prometheus.Labels{},
		),
		errorsTotal: prometheus.NewDesc(
			"unsee_alertmanager_errors_total",
			"Total number of errors encountered when requesting data from Alertmanager API",
			[]string{"alertmanager", "endpoint"},
			prometheus.Labels{},
		),
	}
}
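// Describe sends the descriptor of every metric to the channel, as
// required by the prometheus.Collector interface.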
func (c *unseeCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.collectedAlerts
	ch <- c.collectedGroups
	ch <- c.cyclesTotal
	ch <- c.errorsTotal
}
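// Collect runs on every scrape; it reads the current state of each
// Alertmanager upstream and emits constant metrics computed from it.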
func (c *unseeCollector) Collect(ch chan<- prometheus.Metric) {
	upstreams := GetAlertmanagers()
	for _, am := range upstreams {
		ch <- prometheus.MustNewConstMetric(
			c.cyclesTotal,
			prometheus.CounterValue,
			am.metrics.cycles,
			am.Name,
		)
		for key, val := range am.metrics.errors {
			ch <- prometheus.MustNewConstMetric(
				c.errorsTotal,
				prometheus.CounterValue,
				val,
				am.Name,
				key,
			)
		}
		// receiver name -> count
		groupsByReceiver := map[string]float64{}
		// receiver name -> state -> count
		alertsByReceiverByState := map[string]map[string]float64{}
		// iterate over all alert groups stored by this instance
		for _, group := range am.Alerts() {
			// count all groups per receiver
			if _, found := groupsByReceiver[group.Receiver]; !found {
				groupsByReceiver[group.Receiver] = 0
			}
			groupsByReceiver[group.Receiver]++
			// count all alerts per receiver & state
			for _, alert := range group.Alerts {
				if _, found := alertsByReceiverByState[alert.Receiver]; !found {
					alertsByReceiverByState[alert.Receiver] = map[string]float64{}
				}
				if _, found := alertsByReceiverByState[alert.Receiver][alert.State]; !found {
					alertsByReceiverByState[alert.Receiver][alert.State] = 0
				}
				alertsByReceiverByState[alert.Receiver][alert.State]++
			}
		}
		// publish metrics using the calculated values
		for receiver, count := range groupsByReceiver {
			ch <- prometheus.MustNewConstMetric(
				c.collectedGroups,
				prometheus.GaugeValue,
				count,
				am.Name,
				receiver,
			)
		}
		for receiver, byState := range alertsByReceiverByState {
			for state, count := range byState {
				// label values must be passed in the same order as the
				// Desc's variable labels: alertmanager, state, receiver
				ch <- prometheus.MustNewConstMetric(
					c.collectedAlerts,
					prometheus.GaugeValue,
					count,
					am.Name,
					state,
					receiver,
				)
			}
		}
	}
}
func init() {
	prometheus.MustRegister(metricAlerts)
	prometheus.MustRegister(metricAlertGroups)
	prometheus.MustRegister(metricAlertmanagerErrors)
	prometheus.MustRegister(metricCollectRuns)
	prometheus.MustRegister(newUnseeCollector())
}

View File

@@ -11,11 +11,20 @@ import (
"github.com/cloudflare/unsee/models"
"github.com/cloudflare/unsee/transform"
"github.com/cloudflare/unsee/transport"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
)
const (
	labelValueErrorsAlerts   = "alerts"
	labelValueErrorsSilences = "silences"
)
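// alertmanagerMetrics holds per-instance counters that the collector
// reads during each scrape.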
type alertmanagerMetrics struct {
	cycles float64
	errors map[string]float64
}
// Alertmanager represents an upstream Alertmanager instance
type Alertmanager struct {
	URI     string        `json:"uri"`
@@ -29,6 +38,8 @@ type Alertmanager struct {
	colors       models.LabelsColorMap
	autocomplete []models.Autocomplete
	lastError    string
	// metrics tracked per alertmanager instance
	metrics alertmanagerMetrics
}
func (am *Alertmanager) detectVersion() string {
@@ -68,22 +79,6 @@ func (am *Alertmanager) clearData() {
	am.colors = models.LabelsColorMap{}
	am.autocomplete = []models.Autocomplete{}
	am.lock.Unlock()
	// reset metrics to 0 since we don't store anything anymore
	am.resetMetrics()
}
func (am *Alertmanager) resetMetrics() {
	// reset alert state/instance counters
	for _, state := range models.AlertStateList {
		metricAlerts.With(prometheus.Labels{
			"alertmanager": am.Name,
			"state":        state,
		}).Set(0)
	}
	// reset alert group counters
	metricAlertGroups.With(prometheus.Labels{
		"alertmanager": am.Name,
	}).Set(0)
}
func (am *Alertmanager) pullSilences(version string) error {
@@ -154,12 +149,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
	colors := models.LabelsColorMap{}
	autocompleteMap := map[string]models.Autocomplete{}
	// we'll use this to update alert counter metrics (per state/instance)
	alertMetrics := map[string]float64{}
	for _, state := range models.AlertStateList {
		alertMetrics[state] = 0
	}
	log.Infof("[%s] Processing unique alert groups (%d)", am.Name, len(uniqueGroups))
	for _, ag := range uniqueGroups {
		alerts := models.AlertList{}
@@ -194,8 +183,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
			alert.UpdateFingerprints()
			alerts = append(alerts, alert)
			alertMetrics[alert.State]++
		}
		for _, hint := range transform.BuildAutocomplete(alerts) {
@@ -211,14 +198,6 @@ func (am *Alertmanager) pullAlerts(version string) error {
		dedupedGroups = append(dedupedGroups, ag)
	}
	// update internal metrics with new computed values
	for state, val := range alertMetrics {
		metricAlerts.With(prometheus.Labels{
			"alertmanager": am.Name,
			"state":        state,
		}).Set(val)
	}
	log.Infof("[%s] Merging autocomplete data (%d)", am.Name, len(autocompleteMap))
	autocomplete := []models.Autocomplete{}
	for _, hint := range autocompleteMap {
@@ -231,25 +210,20 @@ func (am *Alertmanager) pullAlerts(version string) error {
	am.autocomplete = autocomplete
	am.lock.Unlock()
	metricAlertGroups.With(prometheus.Labels{
		"alertmanager": am.Name,
	}).Set(float64(len(dedupedGroups)))
	return nil
}
// Pull fetches data from the upstream Alertmanager instance
func (am *Alertmanager) Pull() error {
	am.metrics.cycles++
	version := am.detectVersion()
	err := am.pullSilences(version)
	if err != nil {
		am.clearData()
		am.setError(err.Error())
		metricAlertmanagerErrors.With(prometheus.Labels{
			"alertmanager": am.Name,
			"endpoint":     "silences",
		}).Inc()
		am.metrics.errors[labelValueErrorsSilences]++
		return err
	}
@@ -257,17 +231,10 @@ func (am *Alertmanager) Pull() error {
	if err != nil {
		am.clearData()
		am.setError(err.Error())
		metricAlertmanagerErrors.With(prometheus.Labels{
			"alertmanager": am.Name,
			"endpoint":     "alerts",
		}).Inc()
		am.metrics.errors[labelValueErrorsAlerts]++
		return err
	}
	metricCollectRuns.With(prometheus.Labels{
		"alertmanager": am.Name,
	}).Inc()
	am.lastError = ""
	return nil
}

View File

@@ -6,7 +6,6 @@ import (
"time"
"github.com/cloudflare/unsee/models"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
)
@@ -27,16 +26,6 @@ func NewAlertmanager(name, uri string, timeout time.Duration) error {
		}
	}
	// initialize metrics
	metricAlertmanagerErrors.With(prometheus.Labels{
		"alertmanager": name,
		"endpoint":     "alerts",
	}).Set(0)
	metricAlertmanagerErrors.With(prometheus.Labels{
		"alertmanager": name,
		"endpoint":     "silences",
	}).Set(0)
	upstreams[name] = &Alertmanager{
		URI:     uri,
		Timeout: timeout,
@@ -46,6 +35,12 @@ func NewAlertmanager(name, uri string, timeout time.Duration) error {
		silences:     map[string]models.Silence{},
		colors:       models.LabelsColorMap{},
		autocomplete: []models.Autocomplete{},
		metrics: alertmanagerMetrics{
			errors: map[string]float64{
				labelValueErrorsAlerts:   0,
				labelValueErrorsSilences: 0,
			},
		},
	}
	log.Infof("[%s] Configured Alertmanager source at %s", name, uri)