From c71c9e6107b1018462e6834694b798e43231ed0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Mierzwa?= Date: Mon, 1 Nov 2021 11:59:39 +0000 Subject: [PATCH] fix(backend): retry failed alertmanager requests --- CHANGELOG.md | 1 + cmd/karma/tests/testscript/068_sentry.txt | 5 ++++- .../tests/testscript/069_simple_config.txt | 5 ++++- .../testscript/073_pid_file_remove_error.txt | 5 ++++- .../testscript/077_listen_invalid_port.txt | 5 ++++- cmd/karma/tests/testscript/083_metrics.txt | 5 ++++- cmd/karma/tests/testscript/085_debug.txt | 5 ++++- .../testscript/088_listen_tls_key_invalid.txt | 5 ++++- .../089_listen_tls_cert_invalid.txt | 5 ++++- .../testscript/090_listen_tls_key_missing.txt | 5 ++++- .../091_listen_tls_cert_missing.txt | 5 ++++- .../testscript/094_shutdown_slow_client.txt | 5 ++++- .../tests/testscript/099_alert_history.txt | 5 ++++- cmd/karma/timer.go | 19 ++++++++++++++++--- 14 files changed, 65 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eafeaf734..f15aa138c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ - Refactored internal APIs. - Overview modal won't show label name for every value to save screen space. +- Retry failed requests when collecting alerts and silences from alertmanager. ## v0.92 diff --git a/cmd/karma/tests/testscript/068_sentry.txt b/cmd/karma/tests/testscript/068_sentry.txt index 3e03703bb..b1df8ed42 100644 --- a/cmd/karma/tests/testscript/068_sentry.txt +++ b/cmd/karma/tests/testscript/068_sentry.txt @@ -143,7 +143,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9093/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default uri=http://127.0.0.1:9093 -level=error msg="Collection failed" error="Get \"http://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9093/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default uri=http://127.0.0.1:9093 +level=error msg="Collection failed" error="Get \"http://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8068 diff --git a/cmd/karma/tests/testscript/069_simple_config.txt b/cmd/karma/tests/testscript/069_simple_config.txt index 45017c680..86a0cd925 100644 --- a/cmd/karma/tests/testscript/069_simple_config.txt +++ b/cmd/karma/tests/testscript/069_simple_config.txt @@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 -level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8069 diff --git a/cmd/karma/tests/testscript/073_pid_file_remove_error.txt b/cmd/karma/tests/testscript/073_pid_file_remove_error.txt index 907a5b19f..6d525664e 100644 --- a/cmd/karma/tests/testscript/073_pid_file_remove_error.txt +++ b/cmd/karma/tests/testscript/073_pid_file_remove_error.txt @@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 -level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8073 diff --git a/cmd/karma/tests/testscript/077_listen_invalid_port.txt b/cmd/karma/tests/testscript/077_listen_invalid_port.txt index e4e2cc775..dd9cef08d 100644 --- a/cmd/karma/tests/testscript/077_listen_invalid_port.txt +++ b/cmd/karma/tests/testscript/077_listen_invalid_port.txt @@ -13,7 +13,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://foo:xxx@127.0.0.1/metrics level=error msg="Request failed" error="Get \"http://foo:***@127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://foo:xxx@127.0.0.1 -level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://foo:xxx@127.0.0.1/metrics +level=error msg="Request failed" error="Get \"http://foo:***@127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://foo:xxx@127.0.0.1 +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=error msg="Execution failed" error="listen tcp: address 9999999: invalid port" diff --git a/cmd/karma/tests/testscript/083_metrics.txt b/cmd/karma/tests/testscript/083_metrics.txt index 5d6753b56..b84120423 100644 --- a/cmd/karma/tests/testscript/083_metrics.txt +++ b/cmd/karma/tests/testscript/083_metrics.txt @@ -124,7 +124,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 -level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8083 diff --git a/cmd/karma/tests/testscript/085_debug.txt b/cmd/karma/tests/testscript/085_debug.txt index 389a6652f..003cef36e 100644 --- a/cmd/karma/tests/testscript/085_debug.txt +++ b/cmd/karma/tests/testscript/085_debug.txt @@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 -level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8085 diff --git a/cmd/karma/tests/testscript/088_listen_tls_key_invalid.txt b/cmd/karma/tests/testscript/088_listen_tls_key_invalid.txt index d0dc08d75..30aed33b7 100644 --- a/cmd/karma/tests/testscript/088_listen_tls_key_invalid.txt +++ b/cmd/karma/tests/testscript/088_listen_tls_key_invalid.txt @@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=local level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 -level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2 +level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics +level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTPS server" address=127.0.0.1:8088 diff --git a/cmd/karma/tests/testscript/089_listen_tls_cert_invalid.txt b/cmd/karma/tests/testscript/089_listen_tls_cert_invalid.txt index acda5b223..75a5f4bd6 100644 --- a/cmd/karma/tests/testscript/089_listen_tls_cert_invalid.txt +++ b/cmd/karma/tests/testscript/089_listen_tls_cert_invalid.txt @@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=local level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 -level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2 +level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics +level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTPS server" address=127.0.0.1:8089 diff --git a/cmd/karma/tests/testscript/090_listen_tls_key_missing.txt b/cmd/karma/tests/testscript/090_listen_tls_key_missing.txt index 615f0629f..60896c31e 100644 --- a/cmd/karma/tests/testscript/090_listen_tls_key_missing.txt +++ b/cmd/karma/tests/testscript/090_listen_tls_key_missing.txt @@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=local level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 -level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2 +level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics +level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTPS server" address=127.0.0.1:8090 diff --git a/cmd/karma/tests/testscript/091_listen_tls_cert_missing.txt b/cmd/karma/tests/testscript/091_listen_tls_cert_missing.txt index 5073b5571..430b4684d 100644 --- a/cmd/karma/tests/testscript/091_listen_tls_cert_missing.txt +++ b/cmd/karma/tests/testscript/091_listen_tls_cert_missing.txt @@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=local level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 -level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2 +level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics +level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093 +level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTPS server" address=127.0.0.1:8091 diff --git a/cmd/karma/tests/testscript/094_shutdown_slow_client.txt b/cmd/karma/tests/testscript/094_shutdown_slow_client.txt index 6b3c21e17..0d3c8bfb6 100644 --- a/cmd/karma/tests/testscript/094_shutdown_slow_client.txt +++ b/cmd/karma/tests/testscript/094_shutdown_slow_client.txt @@ -16,7 +16,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=proxied level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9094/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1:9094/metrics\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied uri=http://127.0.0.1:9094 -level=error msg="Collection failed" error="Get \"http://127.0.0.1:9094/api/v2/status\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied +level=error msg="Collection failed" error="Get \"http://127.0.0.1:9094/api/v2/status\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9094/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1:9094/metrics\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied uri=http://127.0.0.1:9094 +level=error msg="Collection failed" error="Get \"http://127.0.0.1:9094/api/v2/status\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8094 diff --git a/cmd/karma/tests/testscript/099_alert_history.txt b/cmd/karma/tests/testscript/099_alert_history.txt index 00c3d9727..8e01c440a 100644 --- a/cmd/karma/tests/testscript/099_alert_history.txt +++ b/cmd/karma/tests/testscript/099_alert_history.txt @@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager" level=info msg="Collecting alerts and silences" alertmanager=default level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 -level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2 +level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics +level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1 +level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2 level=info msg="Collection completed" level=info msg="Done, starting HTTP server" level=info msg="Starting HTTP server" address=127.0.0.1:8099 diff --git a/cmd/karma/timer.go b/cmd/karma/timer.go index 84b04063b..334ae39a1 100644 --- a/cmd/karma/timer.go +++ b/cmd/karma/timer.go @@ -1,6 +1,7 @@ package main import ( + "fmt" "runtime" "sync" @@ -9,6 +10,10 @@ import ( "github.com/rs/zerolog/log" ) +const ( + maxTries = 2 +) + func pullFromAlertmanager() { // always flush cache once we're done defer apiCache.Purge() @@ -22,9 +27,17 @@ func pullFromAlertmanager() { for _, upstream := range upstreams { go func(am *alertmanager.Alertmanager) { log.Info().Str("alertmanager", am.Name).Msg("Collecting alerts and silences") - err := am.Pull() - if err != nil { - log.Error().Err(err).Str("alertmanager", am.Name).Msg("Collection failed") + for i := 1; i <= maxTries; i++ { + err := am.Pull() + if err != nil { + log.Error(). + Err(err). + Str("alertmanager", am.Name). + Str("try", fmt.Sprintf("%d/%d", i, maxTries)). + Msg("Collection failed") + } else { + break + } } wg.Done() }(upstream)