feat(backend): add healthcheck:filters option

This commit is contained in:
Łukasz Mierzwa
2020-12-12 17:38:44 +00:00
committed by Łukasz Mierzwa
parent 886a59d97d
commit 17b4e943b8
19 changed files with 329 additions and 4 deletions

View File

@@ -6,6 +6,39 @@
- Don't reset regex toggle when adding new silence labels #2520
### Added
- Added support for DeadMansSwitch alerts via `healthcheck:filters` option
on alertmanager upstream configuration #2512.
Example:
- Setup always on alert in each Prometheus server (prom1 and prom2):
```YAML
- alert: DeadMansSwitch
expr: vector(1)
```
- Add healthcheck configuration to karma:
```YAML
alertmanager:
servers:
- name: am
uri: https://alertmanager.example.com
healthcheck:
filters:
prom1:
- alertname=DeadMansSwitch
- instance=prom1
prom2:
- alertname=DeadMansSwitch
- instance=prom2
```
If any of these alerts is missing from alertmanager karma will show a warning
in the UI.
## v0.77
### Fixed

View File

@@ -110,6 +110,15 @@ only after all alerts are resolved you can use
See [configuration docs](/docs/CONFIGURATION.md#alert-acknowledgement) for
details.
### Dead Mans Switch support
Starting with `v0.78` karma can be configured to check for
[Dead Mans Switch](https://en.wikipedia.org/wiki/Dead_man%27s_switch)
style alerts (an alert that is always firing). If no matching alert is found
in a given alertmanager, karma will show an error in the UI.
See the `healthcheck:filters` option in the [configuration docs](/docs/CONFIGURATION.md#alertmanagers)
for details.
### Dark mode
Starting with `v0.52` release karma includes both light and dark themes.

View File

@@ -192,6 +192,7 @@ func setupUpstreams() error {
alertmanager.WithHTTPTransport(httpTransport), // we will pass a nil unless TLS.CA or TLS.Cert is set
alertmanager.WithHTTPHeaders(s.Headers),
alertmanager.WithCORSCredentials(s.CORS.Credentials),
alertmanager.WithHealthchecks(s.Healthcheck.Filters),
)
if err != nil {
return fmt.Errorf("failed to create Alertmanager '%s' with URI '%s': %s", s.Name, uri.SanitizeURI(s.URI), err)

View File

@@ -102,6 +102,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: true"
level=info msg=" duration: 5m0s"

View File

@@ -43,6 +43,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg=" - cluster: HA"
level=info msg=" name: ha2"
level=info msg=" uri: http://127.0.0.1:9094"
@@ -58,6 +60,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: omit"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg=" - cluster: \"\""
level=info msg=" name: local"
level=info msg=" uri: http://foo:xxx@127.0.0.1:9095"
@@ -74,6 +78,14 @@ level=info msg=" headers:"
level=info msg=" X-Auth-Test: some-token-or-other-string"
level=info msg=" cors:"
level=info msg=" credentials: same-origin"
level=info msg=" healthcheck:"
level=info msg=" filters:"
level=info msg=" prom1:"
level=info msg=" - alertname=DeadMansSwitch"
level=info msg=" - instance=prom1"
level=info msg=" prom2:"
level=info msg=" - alertname=DeadMansSwitch"
level=info msg=" - instance=prom2"
level=info msg=" - cluster: \"\""
level=info msg=" name: client-auth"
level=info msg=" uri: https://127.0.0.1:9096"
@@ -89,6 +101,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: true"
level=info msg=" duration: 7m0s"
@@ -250,6 +264,14 @@ alertmanager:
X-Auth-Test: some-token-or-other-string
cors:
credentials: same-origin
healthcheck:
filters:
prom1:
- alertname=DeadMansSwitch
- instance=prom1
prom2:
- alertname=DeadMansSwitch
- instance=prom2
- name: client-auth
uri: https://127.0.0.1:9096
timeout: 10s

View File

@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: false"
level=info msg=" duration: 15m0s"

View File

@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: false"
level=info msg=" duration: 15m0s"

View File

@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: false"
level=info msg=" duration: 15m0s"

View File

@@ -37,6 +37,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: false"
level=info msg=" duration: 15m0s"

View File

@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
level=info msg=" headers: {}"
level=info msg=" cors:"
level=info msg=" credentials: include"
level=info msg=" healthcheck:"
level=info msg=" filters: {}"
level=info msg="alertAcknowledgement:"
level=info msg=" enabled: false"
level=info msg=" duration: 15m0s"

View File

@@ -0,0 +1,18 @@
# Raises an error if healthcheck uses invalid filter
karma.bin-should-fail --check-config
! stdout .
cmp stderr stderr.txt
-- stderr.txt --
level=info msg="Reading configuration file" path=karma.yaml
level=info msg="Version: dev"
level=error msg="Execution failed" error="failed to create Alertmanager 'default' with URI 'https://127.0.0.1:9093': \"alertname==\" is not a valid filter"
-- karma.yaml --
alertmanager:
servers:
- name: default
uri: https://127.0.0.1:9093
healthcheck:
filters:
prom1:
- alertname==

View File

@@ -2330,3 +2330,78 @@ func TestGetUserFromContextPresent(t *testing.T) {
t.Errorf("getUserFromContext() returned user=%q", user)
}
}
// TestHealthcheckAlerts pulls alerts from every mocked Alertmanager version
// with a given set of healthcheck filters configured and verifies whether
// the instance ends up reporting an error afterwards.
func TestHealthcheckAlerts(t *testing.T) {
	type scenario struct {
		checks    map[string][]string
		wantError bool
	}

	scenarios := []scenario{
		// no healthchecks configured at all
		{checks: map[string][]string{}, wantError: false},
		// single filter matching a mocked alert
		{
			checks:    map[string][]string{"active": {"alertname=Host_Down"}},
			wantError: false,
		},
		// two filters that both match the same mocked alert
		{
			checks: map[string][]string{
				"active": {"alertname=Host_Down", "cluster=staging"},
			},
			wantError: false,
		},
		// filter that matches none of the mocked alerts
		{
			checks:    map[string][]string{"active": {"alertname=FooBar"}},
			wantError: true,
		},
		// one filter matches but the other one never does
		{
			checks: map[string][]string{
				"active": {"alertname=Host_Down", "cluster=unknown"},
			},
			wantError: true,
		},
	}

	// endpoints that need a mocked response, each one is both the URL path
	// and the name of the mock file
	endpoints := []string{
		"metrics",
		"api/v2/status",
		"api/v2/silences",
		"api/v2/alerts/groups",
	}

	zerolog.SetGlobalLevel(zerolog.FatalLevel)
	for idx, sc := range scenarios {
		for _, version := range mock.ListAllMocks() {
			t.Run(fmt.Sprintf("%d/%s", idx, version), func(t *testing.T) {
				httpmock.Activate()
				defer httpmock.DeactivateAndReset()

				mockCache()
				for _, endpoint := range endpoints {
					mock.RegisterURL("http://localhost/"+endpoint, version, endpoint)
				}

				am, err := alertmanager.NewAlertmanager(
					"cluster",
					"healthchecks",
					"http://localhost",
					alertmanager.WithHealthchecks(sc.checks),
				)
				if err != nil {
					t.Error(err)
					return
				}

				// the result of Pull() is ignored on purpose, only the error
				// string stored on the instance is inspected below
				_ = am.Pull()

				if gotError := am.Error() != ""; gotError != sc.wantError {
					t.Errorf("error=%q expected=%v", am.Error(), sc.wantError)
				}
			})
		}
	}
}

View File

@@ -176,6 +176,8 @@ alertmanager:
any: string
cors:
credentials: string
healthcheck:
filters: map (string: list of strings)
```
- `interval` - how often alerts should be refreshed, a string in
@@ -244,6 +246,41 @@ alertmanager:
`omit` or `same-origin` if Alertmanager is configured to respond with
`Access-Control-Allow-Origin: *`,
[see docs](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS/Errors/CORSNotSupportingCredentials).
- `healthcheck:filters` - define healthchecks using alert filters. When set, karma
will search for alerts matching the defined filters and show an error if none of
the alerts match. This can be used with a [Dead man's switch](https://en.wikipedia.org/wiki/Dead_man%27s_switch)
style alert to notify karma users that there's a problem with the alerting pipeline.
Syntax for this option is a map where key is the name of the filter set (used in
the UI when showing errors) and the value is a list of filters.
Example:
- Setup always on alert in each Prometheus server (prom1 and prom2):
```YAML
- alert: DeadMansSwitch
expr: vector(1)
```
- Add healthcheck configuration to karma:
```YAML
alertmanager:
servers:
- name: am
uri: https://alertmanager.example.com
healthcheck:
filters:
prom1:
- alertname=DeadMansSwitch
- instance=prom1
prom2:
- alertname=DeadMansSwitch
- instance=prom2
```
If any of these alerts is missing from alertmanager karma will show a warning
in the UI.
Note: there are multiple supported combinations of URI settings which result in
a slightly different behavior. Settings that control it are:

View File

@@ -32,6 +32,11 @@ type alertmanagerMetrics struct {
Errors map[string]float64
}
// healthCheck is a set of alert filters plus the outcome of the most recent
// search for an alert matching them.
type healthCheck struct {
// filters is the list of parsed filters; a single alert has to match every
// one of them for the healthcheck to be marked as found
filters []filters.FilterT
// wasFound is reset to false at the start of each alert pull and set to
// true once an alert matching all filters is seen
wasFound bool
}
// Alertmanager represents Alertmanager upstream instance
type Alertmanager struct {
URI string `json:"uri"`
@@ -65,6 +70,7 @@ type Alertmanager struct {
HTTPHeaders map[string]string
// CORS credentials
CORSCredentials string `json:"corsCredentials"`
healthchecks map[string]healthCheck
}
func (am *Alertmanager) probeVersion() string {
@@ -203,6 +209,16 @@ func (am *Alertmanager) pullAlerts(version string) error {
return err
}
healthchecks := map[string]healthCheck{}
am.lock.RLock()
for name, hc := range am.healthchecks {
healthchecks[name] = healthCheck{
filters: hc.filters,
wasFound: false,
}
}
am.lock.RUnlock()
var groups []models.AlertGroup
start := time.Now()
@@ -233,6 +249,8 @@ func (am *Alertmanager) pullAlerts(version string) error {
}
}
for _, alert := range ag.Alerts {
alert := alert
if _, found := uniqueAlerts[agID]; !found {
uniqueAlerts[agID] = map[string]models.Alert{}
}
@@ -243,8 +261,33 @@ func (am *Alertmanager) pullAlerts(version string) error {
for key := range alert.Labels {
knownLabelsMap[key] = true
}
}
for name, hc := range am.healthchecks {
positiveMatch := false
negativeMatch := false
for _, hcFilter := range hc.filters {
if hcFilter.Match(&alert, 0) {
log.Debug().
Str("alertmanager", am.Name).
Str("healthcheck", name).
Msg("Healthcheck alert matched")
positiveMatch = true
} else {
negativeMatch = true
}
}
if positiveMatch && !negativeMatch {
log.Debug().
Str("alertmanager", am.Name).
Str("healthcheck", name).
Msg("Marking healthcheck alert as found")
healthchecks[name] = healthCheck{
filters: hc.filters,
wasFound: true,
}
}
}
}
}
dedupedGroups := make([]models.AlertGroup, 0, len(uniqueGroups))
@@ -326,6 +369,7 @@ func (am *Alertmanager) pullAlerts(version string) error {
am.colors = colors
am.autocomplete = autocomplete
am.knownLabels = knownLabels
am.healthchecks = healthchecks
am.lock.Unlock()
return nil
@@ -381,6 +425,12 @@ func (am *Alertmanager) Pull() error {
am.clusterName = ""
am.lock.Unlock()
for name, hc := range am.healthchecks {
if !hc.wasFound {
am.setError(fmt.Sprintf("Healthcheck filter %q didn't match any alerts", name))
}
}
return nil
}

View File

@@ -8,6 +8,7 @@ import (
"sync"
"time"
"github.com/prymitive/karma/internal/filters"
"github.com/prymitive/karma/internal/models"
"github.com/prymitive/karma/internal/uri"
@@ -42,7 +43,8 @@ func NewAlertmanager(cluster, name, upstreamURI string, opts ...Option) (*Alertm
labelValueErrorsSilences: 0,
},
},
status: models.AlertmanagerStatus{},
status: models.AlertmanagerStatus{},
healthchecks: map[string]healthCheck{},
}
for _, opt := range opts {
@@ -176,3 +178,24 @@ func WithCORSCredentials(val string) Option {
return nil
}
}
// WithHealthchecks returns an Option that configures healthchecks on the
// Alertmanager instance. val maps a healthcheck name to the list of filter
// expressions belonging to it. Every expression is parsed into a filter; the
// option returns an error on the first expression that does not produce a
// valid filter, and the healthchecks are only assigned when all of them
// parsed successfully.
func WithHealthchecks(val map[string][]string) Option {
	return func(am *Alertmanager) error {
		parsed := make(map[string]healthCheck, len(val))
		for name, expressions := range val {
			checkFilters := make([]filters.FilterT, 0, len(expressions))
			for _, expression := range expressions {
				parsedFilter := filters.NewFilter(expression)
				if parsedFilter == nil || !parsedFilter.GetIsValid() {
					return fmt.Errorf("%q is not a valid filter", expression)
				}
				checkFilters = append(checkFilters, parsedFilter)
			}
			parsed[name] = healthCheck{filters: checkFilters}
		}
		am.healthchecks = parsed
		return nil
	}
}

View File

@@ -76,6 +76,42 @@ var testCases = []testCase{
},
shouldFail: true,
},
{
config: config.AlertmanagerConfig{
Cluster: "cluster",
Name: "name",
URI: "http://localhost:9093",
ExternalURI: "http://localhost:9093",
Timeout: time.Second * 30,
Proxy: false,
ReadOnly: false,
Headers: map[string]string{},
Healthcheck: config.AlertmanagerHealthcheck{
Filters: map[string][]string{
"prom1": {"@age>a"},
},
},
},
shouldFail: true,
},
{
config: config.AlertmanagerConfig{
Cluster: "cluster",
Name: "name",
URI: "http://localhost:9093",
ExternalURI: "http://localhost:9093",
Timeout: time.Second * 30,
Proxy: false,
ReadOnly: false,
Headers: map[string]string{},
Healthcheck: config.AlertmanagerHealthcheck{
Filters: map[string][]string{
"prom1": {" "},
},
},
},
shouldFail: true,
},
}
func TestOptions(t *testing.T) {
@@ -100,6 +136,7 @@ func TestOptions(t *testing.T) {
WithHTTPTransport(httpTransport), // we will pass a nil unless TLS.CA or TLS.Cert is set
WithHTTPHeaders(tc.config.Headers),
WithCORSCredentials(tc.config.CORS.Credentials),
WithHealthchecks(tc.config.Healthcheck.Filters),
)
didFail := err != nil
if didFail != tc.shouldFail {

View File

@@ -409,6 +409,7 @@ func (config *configSchema) LogValues() {
ReadOnly: s.ReadOnly,
Headers: s.Headers,
CORS: s.CORS,
Healthcheck: s.Healthcheck,
}
servers = append(servers, server)
}

View File

@@ -48,6 +48,8 @@ alertmanager:
headers: {}
cors:
credentials: include
healthcheck:
filters: {}
alertAcknowledgement:
enabled: false
duration: 15m0s

View File

@@ -9,6 +9,10 @@ type AlertmanagerCORS struct {
Credentials string
}
// AlertmanagerHealthcheck holds the healthcheck settings of a single
// Alertmanager server entry.
type AlertmanagerHealthcheck struct {
// Filters maps a healthcheck name to the list of alert filter expressions
// that make up that healthcheck.
Filters map[string][]string `yaml:"filters" koanf:"filters"`
}
type AlertmanagerConfig struct {
Cluster string
Name string
@@ -23,8 +27,9 @@ type AlertmanagerConfig struct {
Key string
InsecureSkipVerify bool `yaml:"insecureSkipVerify" koanf:"insecureSkipVerify"`
}
Headers map[string]string
CORS AlertmanagerCORS `yaml:"cors" koanf:"cors"`
Headers map[string]string
CORS AlertmanagerCORS `yaml:"cors" koanf:"cors"`
Healthcheck AlertmanagerHealthcheck `yaml:"healthcheck" koanf:"healthcheck"`
}
type LinkDetectRules struct {