mirror of
https://github.com/prymitive/karma
synced 2026-05-05 03:16:51 +00:00
507 lines
13 KiB
Go
507 lines
13 KiB
Go
package alertmanager
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"path"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prymitive/karma/internal/config"
|
|
"github.com/prymitive/karma/internal/filters"
|
|
"github.com/prymitive/karma/internal/mapper"
|
|
"github.com/prymitive/karma/internal/models"
|
|
"github.com/prymitive/karma/internal/slices"
|
|
"github.com/prymitive/karma/internal/transform"
|
|
"github.com/prymitive/karma/internal/uri"
|
|
"github.com/prymitive/karma/internal/verprobe"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// Keys used in alertmanagerMetrics.Errors to tell apart which stage of a
// pull cycle failed.
const (
	// labelValueErrorsAlerts is the key incremented when pulling alerts fails.
	labelValueErrorsAlerts = "alerts"
	// labelValueErrorsSilences is the key incremented when pulling silences
	// fails.
	labelValueErrorsSilences = "silences"
)
|
|
|
|
// alertmanagerMetrics holds counters tracked for a single Alertmanager
// instance.
type alertmanagerMetrics struct {
	// Cycles is incremented once at the start of every Pull call.
	Cycles float64
	// Errors counts failed pulls, keyed by the labelValueErrors* constants.
	Errors map[string]float64
}
|
|
|
|
// Alertmanager represents Alertmanager upstream instance
type Alertmanager struct {
	// URI is the address data is collected from
	URI string `json:"uri"`
	// ExternalURI, when set, is what PublicURI returns instead of URI
	ExternalURI string `json:"-"`
	// RequestTimeout is passed to every mapper Collect call
	RequestTimeout time.Duration `json:"timeout"`
	// Cluster is the configured cluster name, see ClusterName for how it's used
	Cluster string `json:"cluster"`
	Name    string `json:"name"`
	// whether this instance should be proxied
	ProxyRequests bool `json:"proxyRequests"`
	ReadOnly      bool `json:"readonly"`
	// reader instances are specific to URI scheme we collect from
	reader uri.Reader
	// implements how we fetch requests from the Alertmanager, we don't set it
	// by default so it's nil and http.DefaultTransport is used
	HTTPTransport http.RoundTripper `json:"-"`
	// lock protects data access while updating
	lock sync.RWMutex
	// fields for storing pulled data
	alertGroups  []models.AlertGroup
	silences     map[string]models.Silence
	colors       models.LabelsColorMap
	autocomplete []models.Autocomplete
	knownLabels  []string
	// lastError is the message of the most recent failed Pull, empty after
	// a successful one
	lastError string
	status    models.AlertmanagerStatus
	// clusterName caches the value computed by ClusterName, Pull resets it
	// after every successful cycle
	clusterName string
	// metrics tracked per alertmanager instance
	Metrics alertmanagerMetrics
	// headers to send with each AlertManager request
	HTTPHeaders map[string]string
	// CORS credentials
	CORSCredentials string `json:"corsCredentials"`
}
|
|
|
|
func (am *Alertmanager) probeVersion() string {
|
|
const fakeVersion = "999.0.0"
|
|
|
|
url, err := uri.JoinURL(am.URI, "metrics")
|
|
if err != nil {
|
|
log.Errorf("Failed to join url '%s' and path 'metrics': %s", am.SanitizedURI(), err)
|
|
return fakeVersion
|
|
}
|
|
|
|
source, err := am.reader.Read(url, am.HTTPHeaders)
|
|
if err != nil {
|
|
log.Errorf("[%s] %s request failed: %s", am.Name, uri.SanitizeURI(url), err)
|
|
return fakeVersion
|
|
}
|
|
defer source.Close()
|
|
|
|
version, err := verprobe.Detect(source)
|
|
if err != nil {
|
|
return fakeVersion
|
|
}
|
|
log.Infof("[%s] Upstream version: %s", am.Name, version)
|
|
|
|
if version == "0.17.0" || version == "0.18.0" {
|
|
log.Warningf("Alertmanager %s might return incomplete list of alert groups in the API, please upgrade to >=0.19.0, see https://github.com/prymitive/karma/issues/812", version)
|
|
}
|
|
|
|
return version
|
|
}
|
|
|
|
func (am *Alertmanager) fetchStatus(version string) (*models.AlertmanagerStatus, error) {
|
|
mapper, err := mapper.GetStatusMapper(version)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var status models.AlertmanagerStatus
|
|
|
|
status, err = mapper.Collect(am.URI, am.HTTPHeaders, am.RequestTimeout, am.HTTPTransport)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &status, nil
|
|
}
|
|
|
|
func (am *Alertmanager) clearData() {
|
|
am.lock.Lock()
|
|
am.alertGroups = []models.AlertGroup{}
|
|
am.silences = map[string]models.Silence{}
|
|
am.colors = models.LabelsColorMap{}
|
|
am.autocomplete = []models.Autocomplete{}
|
|
am.knownLabels = []string{}
|
|
am.status = models.AlertmanagerStatus{
|
|
Version: "",
|
|
ID: "",
|
|
PeerIDs: []string{},
|
|
}
|
|
am.lock.Unlock()
|
|
}
|
|
|
|
func (am *Alertmanager) pullSilences(version string) error {
|
|
mapper, err := mapper.GetSilenceMapper(version)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var silences []models.Silence
|
|
|
|
start := time.Now()
|
|
silences, err = mapper.Collect(am.URI, am.HTTPHeaders, am.RequestTimeout, am.HTTPTransport)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
log.Infof("[%s] Got %d silences(s) in %s", am.Name, len(silences), time.Since(start))
|
|
|
|
log.Infof("[%s] Detecting ticket links in silences (%d)", am.Name, len(silences))
|
|
silenceMap := make(map[string]models.Silence, len(silences))
|
|
for _, silence := range silences {
|
|
silence := silence // scopelint pin
|
|
silence.TicketID, silence.TicketURL = transform.DetectLinks(&silence)
|
|
silenceMap[silence.ID] = silence
|
|
}
|
|
|
|
am.lock.Lock()
|
|
am.silences = silenceMap
|
|
am.lock.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// InternalURI is the URI of this Alertmanager that will be used for all request made by the UI
|
|
func (am *Alertmanager) InternalURI() string {
|
|
if am.ProxyRequests {
|
|
sub := fmt.Sprintf("/proxy/alertmanager/%s", am.Name)
|
|
uri := path.Join(config.Config.Listen.Prefix, sub)
|
|
if strings.HasSuffix(sub, "/") {
|
|
// if sub path had trailing slash then add it here, since path.Join will
|
|
// skip it
|
|
return uri + "/"
|
|
}
|
|
|
|
return uri
|
|
}
|
|
|
|
// strip all user/pass information, fetch() doesn't support it anyway
|
|
return uri.WithoutUserinfo(am.PublicURI())
|
|
}
|
|
|
|
// PublicURI is the URI of this Alertmanager that will be used for browser links
|
|
func (am *Alertmanager) PublicURI() string {
|
|
// external_uri is always the first setting to check for browser links
|
|
if am.ExternalURI != "" {
|
|
return am.ExternalURI
|
|
}
|
|
|
|
return am.URI
|
|
}
|
|
|
|
func (am *Alertmanager) pullAlerts(version string) error {
|
|
mapper, err := mapper.GetAlertMapper(version)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var groups []models.AlertGroup
|
|
|
|
start := time.Now()
|
|
|
|
groups, err = mapper.Collect(am.URI, am.HTTPHeaders, am.RequestTimeout, am.HTTPTransport)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
log.Infof("[%s] Got %d alert group(s) in %s", am.Name, len(groups), time.Since(start))
|
|
|
|
log.Infof("[%s] Deduplicating alert groups (%d)", am.Name, len(groups))
|
|
uniqueGroups := map[string]models.AlertGroup{}
|
|
uniqueAlerts := map[string]map[string]models.Alert{}
|
|
knownLabelsMap := map[string]bool{}
|
|
for _, ag := range groups {
|
|
agID := ag.LabelsFingerprint()
|
|
if _, found := uniqueGroups[agID]; !found {
|
|
uniqueGroups[agID] = models.AlertGroup{
|
|
Receiver: ag.Receiver,
|
|
Labels: ag.Labels,
|
|
ID: agID,
|
|
}
|
|
}
|
|
for _, alert := range ag.Alerts {
|
|
if _, found := uniqueAlerts[agID]; !found {
|
|
uniqueAlerts[agID] = map[string]models.Alert{}
|
|
}
|
|
alertCFP := alert.ContentFingerprint()
|
|
if _, found := uniqueAlerts[agID][alertCFP]; !found {
|
|
uniqueAlerts[agID][alertCFP] = alert
|
|
}
|
|
for key := range alert.Labels {
|
|
knownLabelsMap[key] = true
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
dedupedGroups := make([]models.AlertGroup, 0, len(uniqueGroups))
|
|
colors := models.LabelsColorMap{}
|
|
autocompleteMap := map[string]models.Autocomplete{}
|
|
|
|
log.Infof("[%s] Processing unique alert groups (%d)", am.Name, len(uniqueGroups))
|
|
for _, ag := range uniqueGroups {
|
|
alerts := make(models.AlertList, 0, len(uniqueAlerts[ag.ID]))
|
|
for _, alert := range uniqueAlerts[ag.ID] {
|
|
|
|
silences := map[string]*models.Silence{}
|
|
for _, silenceID := range alert.SilencedBy {
|
|
silence, err := am.SilenceByID(silenceID)
|
|
if err == nil {
|
|
silences[silenceID] = &silence
|
|
}
|
|
}
|
|
|
|
alert.Alertmanager = []models.AlertmanagerInstance{
|
|
{
|
|
Name: am.Name,
|
|
Cluster: am.ClusterName(),
|
|
State: alert.State,
|
|
StartsAt: alert.StartsAt,
|
|
Source: alert.GeneratorURL,
|
|
Silences: silences,
|
|
SilencedBy: alert.SilencedBy,
|
|
InhibitedBy: alert.InhibitedBy,
|
|
},
|
|
}
|
|
|
|
transform.ColorLabel(colors, "@receiver", alert.Receiver)
|
|
for _, am := range alert.Alertmanager {
|
|
transform.ColorLabel(colors, "@alertmanager", am.Name)
|
|
transform.ColorLabel(colors, "@cluster", am.Cluster)
|
|
}
|
|
for k, v := range alert.Labels {
|
|
transform.ColorLabel(colors, k, v)
|
|
}
|
|
|
|
alert.UpdateFingerprints()
|
|
alerts = append(alerts, alert)
|
|
}
|
|
|
|
for _, hint := range filters.BuildAutocomplete(alerts) {
|
|
autocompleteMap[hint.Value] = hint
|
|
}
|
|
|
|
sort.Sort(&alerts)
|
|
ag.Alerts = alerts
|
|
|
|
// Hash is a checksum of all alerts, used to tell when any alert in the group changed
|
|
ag.Hash = ag.ContentFingerprint()
|
|
|
|
dedupedGroups = append(dedupedGroups, ag)
|
|
}
|
|
|
|
log.Infof("[%s] Merging autocomplete data (%d)", am.Name, len(autocompleteMap))
|
|
autocomplete := make([]models.Autocomplete, 0, len(autocompleteMap))
|
|
for _, hint := range autocompleteMap {
|
|
autocomplete = append(autocomplete, hint)
|
|
}
|
|
|
|
knownLabels := make([]string, 0, len(knownLabelsMap))
|
|
for key := range knownLabelsMap {
|
|
knownLabels = append(knownLabels, key)
|
|
}
|
|
|
|
am.lock.Lock()
|
|
am.alertGroups = dedupedGroups
|
|
am.colors = colors
|
|
am.autocomplete = autocomplete
|
|
am.knownLabels = knownLabels
|
|
am.lock.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// Pull data from upstream Alertmanager instance
|
|
func (am *Alertmanager) Pull() error {
|
|
am.Metrics.Cycles++
|
|
|
|
version := am.probeVersion()
|
|
|
|
status, err := am.fetchStatus(version)
|
|
if err != nil {
|
|
am.clearData()
|
|
am.setError(err.Error())
|
|
am.Metrics.Errors[labelValueErrorsSilences]++
|
|
return err
|
|
}
|
|
|
|
err = am.pullSilences(version)
|
|
if err != nil {
|
|
am.clearData()
|
|
am.setError(err.Error())
|
|
am.Metrics.Errors[labelValueErrorsSilences]++
|
|
return err
|
|
}
|
|
|
|
err = am.pullAlerts(version)
|
|
if err != nil {
|
|
am.clearData()
|
|
am.setError(err.Error())
|
|
am.Metrics.Errors[labelValueErrorsAlerts]++
|
|
return err
|
|
}
|
|
|
|
am.lock.Lock()
|
|
am.status = *status
|
|
am.lastError = ""
|
|
am.clusterName = ""
|
|
am.lock.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// Alerts returns a copy of all alert groups
|
|
func (am *Alertmanager) Alerts() []models.AlertGroup {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
alerts := make([]models.AlertGroup, len(am.alertGroups))
|
|
copy(alerts, am.alertGroups)
|
|
return alerts
|
|
}
|
|
|
|
// Silences returns a copy of all silences
|
|
func (am *Alertmanager) Silences() map[string]models.Silence {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
silences := make(map[string]models.Silence, len(am.silences))
|
|
for id, silence := range am.silences {
|
|
silences[id] = silence
|
|
}
|
|
return silences
|
|
}
|
|
|
|
// SilenceByID allows to query for a silence by it's ID, returns error if not found
|
|
func (am *Alertmanager) SilenceByID(id string) (models.Silence, error) {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
s, found := am.silences[id]
|
|
if !found {
|
|
return models.Silence{}, fmt.Errorf("silence '%s' not found", id)
|
|
}
|
|
return s, nil
|
|
}
|
|
|
|
// Colors returns a copy of all color maps
|
|
func (am *Alertmanager) Colors() models.LabelsColorMap {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
colors := make(models.LabelsColorMap, len(am.colors))
|
|
for k, v := range am.colors {
|
|
colors[k] = make(map[string]models.LabelColors, len(v))
|
|
for nk, nv := range v {
|
|
colors[k][nk] = nv
|
|
}
|
|
}
|
|
return colors
|
|
}
|
|
|
|
// Autocomplete returns a copy of all autocomplete data
|
|
func (am *Alertmanager) Autocomplete() []models.Autocomplete {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
autocomplete := make([]models.Autocomplete, len(am.autocomplete))
|
|
copy(autocomplete, am.autocomplete)
|
|
return autocomplete
|
|
}
|
|
|
|
// KnownLabels returns a copy of a map with known labels
|
|
func (am *Alertmanager) KnownLabels() []string {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
knownLabels := make([]string, len(am.knownLabels))
|
|
copy(knownLabels, am.knownLabels)
|
|
|
|
return knownLabels
|
|
}
|
|
|
|
func (am *Alertmanager) setError(err string) {
|
|
am.lock.Lock()
|
|
defer am.lock.Unlock()
|
|
|
|
am.lastError = err
|
|
}
|
|
|
|
func (am *Alertmanager) Error() string {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
return am.lastError
|
|
}
|
|
|
|
// SanitizedURI returns a copy of Alertmanager.URI with password replaced by
|
|
// "xxx"
|
|
func (am *Alertmanager) SanitizedURI() string {
|
|
return uri.SanitizeURI(am.URI)
|
|
}
|
|
|
|
// Version returns last known version of this Alertmanager instance
|
|
func (am *Alertmanager) Version() string {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
return am.status.Version
|
|
}
|
|
|
|
// ClusterPeers returns a list of IDs of all peers this instance
|
|
// is connected to.
|
|
// IDs are the same as in Alertmanager API.
|
|
func (am *Alertmanager) ClusterPeers() []string {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
return am.status.PeerIDs
|
|
}
|
|
|
|
// ClusterMemberNames returns a list of names of all Alertmanager instances
|
|
// that are in the same cluster as this instance (including self).
|
|
// Names are the same as in karma configuration.
|
|
func (am *Alertmanager) ClusterMemberNames() []string {
|
|
am.lock.RLock()
|
|
defer am.lock.RUnlock()
|
|
|
|
members := []string{am.Name}
|
|
|
|
upstreams := GetAlertmanagers()
|
|
for _, upstream := range upstreams {
|
|
if upstream.Name == am.Name {
|
|
continue
|
|
}
|
|
for _, peerID := range upstream.ClusterPeers() {
|
|
if slices.StringInSlice(am.status.PeerIDs, peerID) {
|
|
if !slices.StringInSlice(members, upstream.Name) {
|
|
members = append(members, upstream.Name)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
sort.Strings(members)
|
|
return members
|
|
}
|
|
|
|
func (am *Alertmanager) ClusterName() string {
|
|
am.lock.RLock()
|
|
if am.clusterName != "" {
|
|
am.lock.RUnlock()
|
|
return am.clusterName
|
|
}
|
|
am.lock.RUnlock()
|
|
|
|
var clusterName string
|
|
if am.Cluster != "" {
|
|
configPeers := clusterMembersFromConfig(am)
|
|
apiPeers := clusterMembersFromAPI(am)
|
|
missing, extra := slices.StringSliceDiff(configPeers, apiPeers)
|
|
|
|
if len(missing) == 0 && len(extra) == 0 {
|
|
clusterName = am.Cluster
|
|
} else {
|
|
clusterName = strings.Join(am.ClusterMemberNames(), " | ")
|
|
}
|
|
} else {
|
|
clusterName = strings.Join(am.ClusterMemberNames(), " | ")
|
|
}
|
|
am.clusterName = clusterName
|
|
return clusterName
|
|
}
|