mirror of
https://github.com/prymitive/karma
synced 2026-02-13 20:59:53 +00:00
feat(backend): add healtcheck:filters option
This commit is contained in:
committed by
Łukasz Mierzwa
parent
886a59d97d
commit
17b4e943b8
33
CHANGELOG.md
33
CHANGELOG.md
@@ -6,6 +6,39 @@
|
||||
|
||||
- Don't reset regex toggle when adding new silence labels #2520
|
||||
|
||||
### Added
|
||||
|
||||
- Added support for DeadMansSwitch alerts via `healthcheck:filters` option
|
||||
on alertmanager upstream configuration #2512.
|
||||
Example:
|
||||
|
||||
- Set up an always-on alert in each Prometheus server (prom1 and prom2):
|
||||
|
||||
```YAML
|
||||
- alert: DeadMansSwitch
|
||||
expr: vector(1)
|
||||
```
|
||||
|
||||
- Add healthcheck configuration to karma:
|
||||
|
||||
```YAML
|
||||
alertmanager:
|
||||
servers:
|
||||
- name: am
|
||||
uri: https://alertmanager.example.com
|
||||
healthcheck:
|
||||
filters:
|
||||
prom1:
|
||||
- alertname=DeadMansSwitch
|
||||
- instance=prom1
|
||||
prom2:
|
||||
- alertname=DeadMansSwitch
|
||||
- instance=prom2
|
||||
```
|
||||
|
||||
If any of these alerts is missing from alertmanager, karma will show an error
|
||||
in the UI.
|
||||
|
||||
## v0.77
|
||||
|
||||
### Fixed
|
||||
|
||||
@@ -110,6 +110,15 @@ only after all alerts are resolved you can use
|
||||
See [configuration docs](/docs/CONFIGURATION.md#alert-acknowledgement) for
|
||||
details.
|
||||
|
||||
### Dead Man’s Switch support
|
||||
|
||||
Starting with `v0.78` karma can be configured to check for
|
||||
[Dead Man’s Switch](https://en.wikipedia.org/wiki/Dead_man%27s_switch)
|
||||
style alerts (an alert that is always firing). If no matching alert is found in a given
|
||||
alertmanager, karma will show an error in the UI.
|
||||
See the `healthcheck:filters` option in the [configuration docs](/docs/CONFIGURATION.md#alertmanagers)
|
||||
for details.
|
||||
|
||||
### Dark mode
|
||||
|
||||
Starting with `v0.52` release karma includes both light and dark themes.
|
||||
|
||||
@@ -192,6 +192,7 @@ func setupUpstreams() error {
|
||||
alertmanager.WithHTTPTransport(httpTransport), // we will pass a nil unless TLS.CA or TLS.Cert is set
|
||||
alertmanager.WithHTTPHeaders(s.Headers),
|
||||
alertmanager.WithCORSCredentials(s.CORS.Credentials),
|
||||
alertmanager.WithHealthchecks(s.Healthcheck.Filters),
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create Alertmanager '%s' with URI '%s': %s", s.Name, uri.SanitizeURI(s.URI), err)
|
||||
|
||||
@@ -102,6 +102,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: true"
|
||||
level=info msg=" duration: 5m0s"
|
||||
|
||||
@@ -43,6 +43,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg=" - cluster: HA"
|
||||
level=info msg=" name: ha2"
|
||||
level=info msg=" uri: http://127.0.0.1:9094"
|
||||
@@ -58,6 +60,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: omit"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg=" - cluster: \"\""
|
||||
level=info msg=" name: local"
|
||||
level=info msg=" uri: http://foo:xxx@127.0.0.1:9095"
|
||||
@@ -74,6 +78,14 @@ level=info msg=" headers:"
|
||||
level=info msg=" X-Auth-Test: some-token-or-other-string"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: same-origin"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters:"
|
||||
level=info msg=" prom1:"
|
||||
level=info msg=" - alertname=DeadMansSwitch"
|
||||
level=info msg=" - instance=prom1"
|
||||
level=info msg=" prom2:"
|
||||
level=info msg=" - alertname=DeadMansSwitch"
|
||||
level=info msg=" - instance=prom2"
|
||||
level=info msg=" - cluster: \"\""
|
||||
level=info msg=" name: client-auth"
|
||||
level=info msg=" uri: https://127.0.0.1:9096"
|
||||
@@ -89,6 +101,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: true"
|
||||
level=info msg=" duration: 7m0s"
|
||||
@@ -250,6 +264,14 @@ alertmanager:
|
||||
X-Auth-Test: some-token-or-other-string
|
||||
cors:
|
||||
credentials: same-origin
|
||||
healthcheck:
|
||||
filters:
|
||||
prom1:
|
||||
- alertname=DeadMansSwitch
|
||||
- instance=prom1
|
||||
prom2:
|
||||
- alertname=DeadMansSwitch
|
||||
- instance=prom2
|
||||
- name: client-auth
|
||||
uri: https://127.0.0.1:9096
|
||||
timeout: 10s
|
||||
|
||||
@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: false"
|
||||
level=info msg=" duration: 15m0s"
|
||||
|
||||
@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: false"
|
||||
level=info msg=" duration: 15m0s"
|
||||
|
||||
@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: false"
|
||||
level=info msg=" duration: 15m0s"
|
||||
|
||||
@@ -37,6 +37,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: false"
|
||||
level=info msg=" duration: 15m0s"
|
||||
|
||||
@@ -35,6 +35,8 @@ level=info msg=" insecureSkipVerify: false"
|
||||
level=info msg=" headers: {}"
|
||||
level=info msg=" cors:"
|
||||
level=info msg=" credentials: include"
|
||||
level=info msg=" healthcheck:"
|
||||
level=info msg=" filters: {}"
|
||||
level=info msg="alertAcknowledgement:"
|
||||
level=info msg=" enabled: false"
|
||||
level=info msg=" duration: 15m0s"
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
# Raises an error if healthcheck uses invalid filter
|
||||
karma.bin-should-fail --check-config
|
||||
! stdout .
|
||||
cmp stderr stderr.txt
|
||||
|
||||
-- stderr.txt --
|
||||
level=info msg="Reading configuration file" path=karma.yaml
|
||||
level=info msg="Version: dev"
|
||||
level=error msg="Execution failed" error="failed to create Alertmanager 'default' with URI 'https://127.0.0.1:9093': \"alertname==\" is not a valid filter"
|
||||
-- karma.yaml --
|
||||
alertmanager:
|
||||
servers:
|
||||
- name: default
|
||||
uri: https://127.0.0.1:9093
|
||||
healthcheck:
|
||||
filters:
|
||||
prom1:
|
||||
- alertname==
|
||||
@@ -2330,3 +2330,78 @@ func TestGetUserFromContextPresent(t *testing.T) {
|
||||
t.Errorf("getUserFromContext() returned user=%q", user)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthcheckAlerts(t *testing.T) {
|
||||
type testCaseT struct {
|
||||
healthchecks map[string][]string
|
||||
hasError bool
|
||||
}
|
||||
|
||||
testCases := []testCaseT{
|
||||
{
|
||||
healthchecks: map[string][]string{},
|
||||
hasError: false,
|
||||
},
|
||||
{
|
||||
healthchecks: map[string][]string{
|
||||
"active": {"alertname=Host_Down"},
|
||||
},
|
||||
hasError: false,
|
||||
},
|
||||
{
|
||||
healthchecks: map[string][]string{
|
||||
"active": {
|
||||
"alertname=Host_Down",
|
||||
"cluster=staging",
|
||||
},
|
||||
},
|
||||
hasError: false,
|
||||
},
|
||||
{
|
||||
healthchecks: map[string][]string{
|
||||
"active": {"alertname=FooBar"},
|
||||
},
|
||||
hasError: true,
|
||||
},
|
||||
{
|
||||
healthchecks: map[string][]string{
|
||||
"active": {
|
||||
"alertname=Host_Down",
|
||||
"cluster=unknown",
|
||||
},
|
||||
},
|
||||
hasError: true,
|
||||
},
|
||||
}
|
||||
|
||||
zerolog.SetGlobalLevel(zerolog.FatalLevel)
|
||||
for i, testCase := range testCases {
|
||||
for _, version := range mock.ListAllMocks() {
|
||||
t.Run(fmt.Sprintf("%d/%s", i, version), func(t *testing.T) {
|
||||
httpmock.Activate()
|
||||
defer httpmock.DeactivateAndReset()
|
||||
mockCache()
|
||||
mock.RegisterURL("http://localhost/metrics", version, "metrics")
|
||||
mock.RegisterURL("http://localhost/api/v2/status", version, "api/v2/status")
|
||||
mock.RegisterURL("http://localhost/api/v2/silences", version, "api/v2/silences")
|
||||
mock.RegisterURL("http://localhost/api/v2/alerts/groups", version, "api/v2/alerts/groups")
|
||||
|
||||
am, err := alertmanager.NewAlertmanager(
|
||||
"cluster",
|
||||
"healthchecks",
|
||||
"http://localhost",
|
||||
alertmanager.WithHealthchecks(testCase.healthchecks),
|
||||
)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
return
|
||||
}
|
||||
_ = am.Pull()
|
||||
hasError := am.Error() != ""
|
||||
if hasError != testCase.hasError {
|
||||
t.Errorf("error=%q expected=%v", am.Error(), testCase.hasError)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,6 +176,8 @@ alertmanager:
|
||||
any: string
|
||||
cors:
|
||||
credentials: string
|
||||
healthcheck:
|
||||
filters: map (string: list of strings)
|
||||
```
|
||||
|
||||
- `interval` - how often alerts should be refreshed, a string in
|
||||
@@ -244,6 +246,41 @@ alertmanager:
|
||||
`omit` or `same-origin` if Alertmanager is configured to respond with
|
||||
`Access-Control-Allow-Origin: *`,
|
||||
[see docs](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS/Errors/CORSNotSupportingCredentials).
|
||||
- `healthcheck:filters` - define healthchecks using alert filters. When set, karma
|
||||
will search for alerts matching defined filters and show an error if it doesn't
|
||||
match anything. This can be used with a [Dead man's switch](https://en.wikipedia.org/wiki/Dead_man%27s_switch)
|
||||
style alert to notify karma users that there's a problem with the alerting pipeline.
|
||||
Syntax for this option is a map where key is the name of the filter set (used in
|
||||
the UI when showing errors) and the value is a list of filters.
|
||||
|
||||
Example:
|
||||
|
||||
- Set up an always-on alert in each Prometheus server (prom1 and prom2):
|
||||
|
||||
```YAML
|
||||
- alert: DeadMansSwitch
|
||||
expr: vector(1)
|
||||
```
|
||||
|
||||
- Add healthcheck configuration to karma:
|
||||
|
||||
```YAML
|
||||
alertmanager:
|
||||
servers:
|
||||
- name: am
|
||||
uri: https://alertmanager.example.com
|
||||
healthcheck:
|
||||
filters:
|
||||
prom1:
|
||||
- alertname=DeadMansSwitch
|
||||
- instance=prom1
|
||||
prom2:
|
||||
- alertname=DeadMansSwitch
|
||||
- instance=prom2
|
||||
```
|
||||
|
||||
If any of these alerts is missing from alertmanager, karma will show an error
|
||||
in the UI.
|
||||
|
||||
Note: there are multiple supported combinations of URI settings which result in
|
||||
a slightly different behavior. Settings that control it are:
|
||||
|
||||
@@ -32,6 +32,11 @@ type alertmanagerMetrics struct {
|
||||
Errors map[string]float64
|
||||
}
|
||||
|
||||
type healthCheck struct {
|
||||
filters []filters.FilterT
|
||||
wasFound bool
|
||||
}
|
||||
|
||||
// Alertmanager represents Alertmanager upstream instance
|
||||
type Alertmanager struct {
|
||||
URI string `json:"uri"`
|
||||
@@ -65,6 +70,7 @@ type Alertmanager struct {
|
||||
HTTPHeaders map[string]string
|
||||
// CORS credentials
|
||||
CORSCredentials string `json:"corsCredentials"`
|
||||
healthchecks map[string]healthCheck
|
||||
}
|
||||
|
||||
func (am *Alertmanager) probeVersion() string {
|
||||
@@ -203,6 +209,16 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
healthchecks := map[string]healthCheck{}
|
||||
am.lock.RLock()
|
||||
for name, hc := range am.healthchecks {
|
||||
healthchecks[name] = healthCheck{
|
||||
filters: hc.filters,
|
||||
wasFound: false,
|
||||
}
|
||||
}
|
||||
am.lock.RUnlock()
|
||||
|
||||
var groups []models.AlertGroup
|
||||
|
||||
start := time.Now()
|
||||
@@ -233,6 +249,8 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
}
|
||||
}
|
||||
for _, alert := range ag.Alerts {
|
||||
alert := alert
|
||||
|
||||
if _, found := uniqueAlerts[agID]; !found {
|
||||
uniqueAlerts[agID] = map[string]models.Alert{}
|
||||
}
|
||||
@@ -243,8 +261,33 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
for key := range alert.Labels {
|
||||
knownLabelsMap[key] = true
|
||||
}
|
||||
}
|
||||
|
||||
for name, hc := range am.healthchecks {
|
||||
positiveMatch := false
|
||||
negativeMatch := false
|
||||
for _, hcFilter := range hc.filters {
|
||||
if hcFilter.Match(&alert, 0) {
|
||||
log.Debug().
|
||||
Str("alertmanager", am.Name).
|
||||
Str("healthcheck", name).
|
||||
Msg("Healthcheck alert matched")
|
||||
positiveMatch = true
|
||||
} else {
|
||||
negativeMatch = true
|
||||
}
|
||||
}
|
||||
if positiveMatch && !negativeMatch {
|
||||
log.Debug().
|
||||
Str("alertmanager", am.Name).
|
||||
Str("healthcheck", name).
|
||||
Msg("Marking healthcheck alert as found")
|
||||
healthchecks[name] = healthCheck{
|
||||
filters: hc.filters,
|
||||
wasFound: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dedupedGroups := make([]models.AlertGroup, 0, len(uniqueGroups))
|
||||
@@ -326,6 +369,7 @@ func (am *Alertmanager) pullAlerts(version string) error {
|
||||
am.colors = colors
|
||||
am.autocomplete = autocomplete
|
||||
am.knownLabels = knownLabels
|
||||
am.healthchecks = healthchecks
|
||||
am.lock.Unlock()
|
||||
|
||||
return nil
|
||||
@@ -381,6 +425,12 @@ func (am *Alertmanager) Pull() error {
|
||||
am.clusterName = ""
|
||||
am.lock.Unlock()
|
||||
|
||||
for name, hc := range am.healthchecks {
|
||||
if !hc.wasFound {
|
||||
am.setError(fmt.Sprintf("Healthcheck filter %q didn't match any alerts", name))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prymitive/karma/internal/filters"
|
||||
"github.com/prymitive/karma/internal/models"
|
||||
"github.com/prymitive/karma/internal/uri"
|
||||
|
||||
@@ -42,7 +43,8 @@ func NewAlertmanager(cluster, name, upstreamURI string, opts ...Option) (*Alertm
|
||||
labelValueErrorsSilences: 0,
|
||||
},
|
||||
},
|
||||
status: models.AlertmanagerStatus{},
|
||||
status: models.AlertmanagerStatus{},
|
||||
healthchecks: map[string]healthCheck{},
|
||||
}
|
||||
|
||||
for _, opt := range opts {
|
||||
@@ -176,3 +178,24 @@ func WithCORSCredentials(val string) Option {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func WithHealthchecks(val map[string][]string) Option {
|
||||
return func(am *Alertmanager) error {
|
||||
healthchecks := map[string]healthCheck{}
|
||||
for name, filterExpressions := range val {
|
||||
hc := healthCheck{
|
||||
filters: []filters.FilterT{},
|
||||
}
|
||||
for _, filterExpression := range filterExpressions {
|
||||
f := filters.NewFilter(filterExpression)
|
||||
if f == nil || !f.GetIsValid() {
|
||||
return fmt.Errorf("%q is not a valid filter", filterExpression)
|
||||
}
|
||||
hc.filters = append(hc.filters, f)
|
||||
}
|
||||
healthchecks[name] = hc
|
||||
}
|
||||
am.healthchecks = healthchecks
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,6 +76,42 @@ var testCases = []testCase{
|
||||
},
|
||||
shouldFail: true,
|
||||
},
|
||||
{
|
||||
config: config.AlertmanagerConfig{
|
||||
Cluster: "cluster",
|
||||
Name: "name",
|
||||
URI: "http://localhost:9093",
|
||||
ExternalURI: "http://localhost:9093",
|
||||
Timeout: time.Second * 30,
|
||||
Proxy: false,
|
||||
ReadOnly: false,
|
||||
Headers: map[string]string{},
|
||||
Healthcheck: config.AlertmanagerHealthcheck{
|
||||
Filters: map[string][]string{
|
||||
"prom1": {"@age>a"},
|
||||
},
|
||||
},
|
||||
},
|
||||
shouldFail: true,
|
||||
},
|
||||
{
|
||||
config: config.AlertmanagerConfig{
|
||||
Cluster: "cluster",
|
||||
Name: "name",
|
||||
URI: "http://localhost:9093",
|
||||
ExternalURI: "http://localhost:9093",
|
||||
Timeout: time.Second * 30,
|
||||
Proxy: false,
|
||||
ReadOnly: false,
|
||||
Headers: map[string]string{},
|
||||
Healthcheck: config.AlertmanagerHealthcheck{
|
||||
Filters: map[string][]string{
|
||||
"prom1": {" "},
|
||||
},
|
||||
},
|
||||
},
|
||||
shouldFail: true,
|
||||
},
|
||||
}
|
||||
|
||||
func TestOptions(t *testing.T) {
|
||||
@@ -100,6 +136,7 @@ func TestOptions(t *testing.T) {
|
||||
WithHTTPTransport(httpTransport), // we will pass a nil unless TLS.CA or TLS.Cert is set
|
||||
WithHTTPHeaders(tc.config.Headers),
|
||||
WithCORSCredentials(tc.config.CORS.Credentials),
|
||||
WithHealthchecks(tc.config.Healthcheck.Filters),
|
||||
)
|
||||
didFail := err != nil
|
||||
if didFail != tc.shouldFail {
|
||||
|
||||
@@ -409,6 +409,7 @@ func (config *configSchema) LogValues() {
|
||||
ReadOnly: s.ReadOnly,
|
||||
Headers: s.Headers,
|
||||
CORS: s.CORS,
|
||||
Healthcheck: s.Healthcheck,
|
||||
}
|
||||
servers = append(servers, server)
|
||||
}
|
||||
|
||||
@@ -48,6 +48,8 @@ alertmanager:
|
||||
headers: {}
|
||||
cors:
|
||||
credentials: include
|
||||
healthcheck:
|
||||
filters: {}
|
||||
alertAcknowledgement:
|
||||
enabled: false
|
||||
duration: 15m0s
|
||||
|
||||
@@ -9,6 +9,10 @@ type AlertmanagerCORS struct {
|
||||
Credentials string
|
||||
}
|
||||
|
||||
type AlertmanagerHealthcheck struct {
|
||||
Filters map[string][]string `yaml:"filters" koanf:"filters"`
|
||||
}
|
||||
|
||||
type AlertmanagerConfig struct {
|
||||
Cluster string
|
||||
Name string
|
||||
@@ -23,8 +27,9 @@ type AlertmanagerConfig struct {
|
||||
Key string
|
||||
InsecureSkipVerify bool `yaml:"insecureSkipVerify" koanf:"insecureSkipVerify"`
|
||||
}
|
||||
Headers map[string]string
|
||||
CORS AlertmanagerCORS `yaml:"cors" koanf:"cors"`
|
||||
Headers map[string]string
|
||||
CORS AlertmanagerCORS `yaml:"cors" koanf:"cors"`
|
||||
Healthcheck AlertmanagerHealthcheck `yaml:"healthcheck" koanf:"healthcheck"`
|
||||
}
|
||||
|
||||
type LinkDetectRules struct {
|
||||
|
||||
Reference in New Issue
Block a user