Merge pull request #3 from nais/metrics

Metrics and dashboard
This commit is contained in:
Morten Lied Johansen
2021-09-29 15:06:54 +02:00
committed by GitHub
3 changed files with 142 additions and 6 deletions

94
hack/dashboard.yaml Normal file
View File

@@ -0,0 +1,94 @@
title: Wonderwall
editable: true
tags: [generated, yaml]
auto_refresh: 1m
time: ["now-24h", "now"]
timezone: default # valid values are: utc, browser, default
# Render to JSON using https://github.com/K-Phoen/grabana v0.17.0 or newer
# Import into Grafana using UI (remember to select folder)
variables:
- custom:
name: env
default: dev
values_map:
dev: dev
prod: prod
- datasource:
name: ds
type: prometheus
regex: $env-gcp
include_all: true
hide: variable
- query:
name: redis_op
label: Redis Operation
datasource: $env-gcp
request: "label_values(wonderwall_redis_latency_bucket, operation)"
include_all: true
default_all: true
hide: variable
rows:
- name: Resource usage
collapse: false
panels:
- graph:
title: Memory usage - $ds
datasource: $ds
transparent: true
span: 4
targets:
- prometheus:
query: avg(kube_pod_container_resource_limits{container="wonderwall",resource="memory"}) by (namespace)
legend: "limits in {{ namespace }}"
- prometheus:
query: avg(kube_pod_container_resource_requests{container="wonderwall",resource="memory"}) by (namespace)
legend: "requests in {{ namespace }}"
- prometheus:
query: sum(container_memory_working_set_bytes{container="wonderwall"}) by (pod, namespace)
legend: "working set {{ pod }} in {{ namespace }}"
- prometheus:
query: sum(container_memory_usage_bytes{container="wonderwall"}) by (pod, namespace)
legend: "Resident set size {{ pod }} in {{ namespace }}"
- graph:
title: CPU usage - $ds
datasource: $ds
transparent: true
span: 4
targets:
- prometheus:
query: avg(kube_pod_container_resource_limits{container="wonderwall",resource="cpu"}) by (namespace)
legend: "limits in {{ namespace }}"
- prometheus:
query: avg(kube_pod_container_resource_requests{container="wonderwall",resource="cpu"}) by (namespace)
legend: "requests in {{ namespace }}"
- prometheus:
query: sum(irate(container_cpu_usage_seconds_total{container="wonderwall"}[2m])) by (pod, namespace)
legend: "{{ pod }} in {{ namespace }}"
- name: Redis Latency - $redis_op
repeat_for: redis_op
collapse: true
panels:
- heatmap:
# Must be done manually in Grafana after import: Set max datapoints to 25
title: $ds
datasource: $ds
repeat: ds
data_format: time_series_buckets
hide_zero_buckets: true
transparent: true
span: 4
tooltip:
show: true
showhistogram: false
decimals: 0
yaxis:
unit: "dtdurations"
decimals: 0
targets:
- prometheus:
query: sum(increase(wonderwall_redis_latency_bucket{operation="$redis_op"}[$__interval])) by (le)
legend: "{{ le }}"
format: heatmap

View File

@@ -1,11 +1,44 @@
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"net/http"
"time"
)
// Shared identifiers used when declaring and labelling the metrics in this package.
const (
// Namespace is used as the Namespace option of the metrics declared in this package.
Namespace = "wonderwall"
// RedisOperationLabel is the label name under which the Redis operation is recorded.
RedisOperationLabel = "operation"
)
var (
// RedisLatency is a histogram of how long Redis operations take, partitioned by
// the RedisOperationLabel label. ObserveRedisLatency records its values in seconds.
RedisLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "redis_latency",
Namespace: Namespace,
Help: "latency in redis operations",
// 14 buckets: the first upper bound is 0.02 and each following bound is twice the previous one.
Buckets: prometheus.ExponentialBuckets(0.02, 2, 14),
}, []string{RedisOperationLabel})
)
// Handle starts an HTTP server on address that answers every request path with
// the handler returned by promhttp.Handler.
//
// It blocks for as long as the server is running and returns the error that
// stopped it.
//
// NOTE(review): promhttp.Handler exposes the default Prometheus gatherer, so
// metrics passed to Register with a different registry are presumably not
// served here — confirm against the caller.
func Handle(address string) error {
	server := &http.Server{
		Addr:    address,
		Handler: promhttp.Handler(),
		// http.ListenAndServe configures no timeouts at all; bounding the time a
		// client may take to send its request headers keeps stalled connections
		// from accumulating without limiting how long a scrape response may take.
		ReadHeaderTimeout: 10 * time.Second,
	}
	return server.ListenAndServe()
}
// Register adds the collectors declared in this package to registry.
// Registration goes through MustRegister, which panics if it fails.
func Register(registry prometheus.Registerer) {
	collectors := []prometheus.Collector{
		RedisLatency,
	}
	registry.MustRegister(collectors...)
}
// ObserveRedisLatency runs fun and records how long it took, in seconds, in the
// RedisLatency histogram with operation as the value of RedisOperationLabel.
//
// The duration is recorded whether or not fun fails, and the error returned by
// fun is passed through unchanged.
func ObserveRedisLatency(operation string, fun func() error) error {
	start := time.Now()
	err := fun()
	elapsed := time.Since(start)

	RedisLatency.With(prometheus.Labels{
		RedisOperationLabel: operation,
	}).Observe(elapsed.Seconds())
	return err
}

View File

@@ -3,6 +3,7 @@ package session
import (
"context"
"github.com/go-redis/redis/v8"
"github.com/nais/wonderwall/pkg/metrics"
"time"
)
@@ -20,8 +21,12 @@ func NewRedis(client redis.Cmdable) Store {
func (s *redisSessionStore) Read(ctx context.Context, key string) (*Data, error) {
data := &Data{}
status := s.client.Get(ctx, key)
err := status.Scan(data)
err := metrics.ObserveRedisLatency("Read", func() error {
var err error
status := s.client.Get(ctx, key)
err = status.Scan(data)
return err
})
if err != nil {
return nil, err
}
@@ -29,11 +34,15 @@ func (s *redisSessionStore) Read(ctx context.Context, key string) (*Data, error)
}
func (s *redisSessionStore) Write(ctx context.Context, key string, value *Data, expiration time.Duration) error {
status := s.client.Set(ctx, key, value, expiration)
return status.Err()
return metrics.ObserveRedisLatency("Write", func() error {
status := s.client.Set(ctx, key, value, expiration)
return status.Err()
})
}
func (s *redisSessionStore) Delete(ctx context.Context, keys ...string) error {
status := s.client.Del(ctx, keys...)
return status.Err()
return metrics.ObserveRedisLatency("Delete", func() error {
status := s.client.Del(ctx, keys...)
return status.Err()
})
}