fix(backend): retry failed alertmanager requests

This commit is contained in:
Łukasz Mierzwa
2021-11-01 11:59:39 +00:00
committed by Łukasz Mierzwa
parent d172c58a1a
commit c71c9e6107
14 changed files with 65 additions and 15 deletions

View File

@@ -19,6 +19,7 @@
- Refactored internal APIs.
- Overview modal won't show label name for every value to save screen space.
- Retry failed requests when collecting alerts and silences from alertmanager.
## v0.92

View File

@@ -143,7 +143,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default uri=http://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"http://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default uri=http://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"http://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8068

View File

@@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8069

View File

@@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8073

View File

@@ -13,7 +13,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://foo:xxx@127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://foo:***@127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://foo:xxx@127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://foo:xxx@127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://foo:***@127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://foo:xxx@127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=error msg="Execution failed" error="listen tcp: address 9999999: invalid port"

View File

@@ -124,7 +124,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8083

View File

@@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8085

View File

@@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=local
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTPS server" address=127.0.0.1:8088

View File

@@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=local
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTPS server" address=127.0.0.1:8089

View File

@@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=local
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTPS server" address=127.0.0.1:8090

View File

@@ -17,7 +17,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=local
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=1/2
level=info msg="GET request" timeout=10 uri=https://127.0.0.1:9093/metrics
level=error msg="Request failed" error="Get \"https://127.0.0.1:9093/metrics\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local uri=https://127.0.0.1:9093
level=error msg="Collection failed" error="Get \"https://127.0.0.1:9093/api/v2/status\": dial tcp 127.0.0.1:9093: connect: connection refused" alertmanager=local try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTPS server" address=127.0.0.1:8091

View File

@@ -16,7 +16,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=proxied
level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9094/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1:9094/metrics\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied uri=http://127.0.0.1:9094
level=error msg="Collection failed" error="Get \"http://127.0.0.1:9094/api/v2/status\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied
level=error msg="Collection failed" error="Get \"http://127.0.0.1:9094/api/v2/status\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1:9094/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1:9094/metrics\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied uri=http://127.0.0.1:9094
level=error msg="Collection failed" error="Get \"http://127.0.0.1:9094/api/v2/status\": dial tcp 127.0.0.1:9094: connect: connection refused" alertmanager=proxied try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8094

View File

@@ -14,7 +14,10 @@ level=info msg="Pulling latest alerts and silences from Alertmanager"
level=info msg="Collecting alerts and silences" alertmanager=default
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=1/2
level=info msg="GET request" timeout=40 uri=http://127.0.0.1/metrics
level=error msg="Request failed" error="Get \"http://127.0.0.1/metrics\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default uri=http://127.0.0.1
level=error msg="Collection failed" error="Get \"http://127.0.0.1/api/v2/status\": dial tcp 127.0.0.1:80: connect: connection refused" alertmanager=default try=2/2
level=info msg="Collection completed"
level=info msg="Done, starting HTTP server"
level=info msg="Starting HTTP server" address=127.0.0.1:8099

View File

@@ -1,6 +1,7 @@
package main
import (
"fmt"
"runtime"
"sync"
@@ -9,6 +10,10 @@ import (
"github.com/rs/zerolog/log"
)
const (
maxTries = 2
)
func pullFromAlertmanager() {
// always flush cache once we're done
defer apiCache.Purge()
@@ -22,9 +27,17 @@ func pullFromAlertmanager() {
for _, upstream := range upstreams {
go func(am *alertmanager.Alertmanager) {
log.Info().Str("alertmanager", am.Name).Msg("Collecting alerts and silences")
err := am.Pull()
if err != nil {
log.Error().Err(err).Str("alertmanager", am.Name).Msg("Collection failed")
for i := 1; i <= maxTries; i++ {
err := am.Pull()
if err != nil {
log.Error().
Err(err).
Str("alertmanager", am.Name).
Str("try", fmt.Sprintf("%d/%d", i, maxTries)).
Msg("Collection failed")
} else {
break
}
}
wg.Done()
}(upstream)