From f551386113a10536115738e047595420bada05a7 Mon Sep 17 00:00:00 2001
From: Morten Lied Johansen
Date: Wed, 29 Sep 2021 10:50:27 +0200
Subject: [PATCH 1/3] Add Redis latency metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sindre Rødseth Hansen
---
 pkg/metrics/metrics.go | 42 ++++++++++++++++++++++++++++++++++++++++++
 pkg/session/redis.go   | 17 +++++++++++------
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index e7cc41b..b9ef95a 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -1,11 +1,53 @@
 package metrics
 
 import (
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"net/http"
+	"time"
+)
+
+const (
+	// Namespace is the Prometheus namespace of the metrics defined in this package.
+	Namespace = "wonderwall"
+
+	// RedisOperationLabel is the label that names the observed Redis operation.
+	RedisOperationLabel = "operation"
+)
+
+var (
+	// RedisLatency records the duration of Redis operations in seconds,
+	// partitioned by RedisOperationLabel.
+	RedisLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:      "redis_latency",
+		Namespace: Namespace,
+		Help:      "latency in redis operations",
+		Buckets:   prometheus.ExponentialBuckets(0.02, 2, 14),
+	}, []string{RedisOperationLabel})
 )
 
 func Handle(address string) error {
 	handler := promhttp.Handler()
 	return http.ListenAndServe(address, handler)
 }
+
+// Register adds the metrics defined in this package to registry.
+// It panics if registration fails.
+func Register(registry prometheus.Registerer) {
+	registry.MustRegister(
+		RedisLatency,
+	)
+}
+
+// ObserveRedisLatency runs fun and records how long it took, in seconds, in
+// RedisLatency under the given operation label. The duration is recorded
+// whether or not fun fails, and the error returned by fun is passed through.
+func ObserveRedisLatency(operation string, fun func() error) error {
+	start := time.Now()
+	err := fun()
+	elapsed := time.Since(start)
+	RedisLatency.With(prometheus.Labels{
+		RedisOperationLabel: operation,
+	}).Observe(elapsed.Seconds())
+	return err
+}
diff --git a/pkg/session/redis.go b/pkg/session/redis.go
index 6963385..cf901b1 100644
--- a/pkg/session/redis.go
+++ b/pkg/session/redis.go
@@ -3,6 +3,7 @@ package session
 import (
 	"context"
 	"github.com/go-redis/redis/v8"
+	"github.com/nais/wonderwall/pkg/metrics"
 	"time"
 )
 
@@ -20,8 +21,10 @@ func NewRedis(client redis.Cmdable) Store {
 
 func (s *redisSessionStore) Read(ctx context.Context, key string) (*Data, error) {
 	data := &Data{}
-	status := s.client.Get(ctx, key)
-	err := status.Scan(data)
+	// Time the full round trip, including scanning the reply into data.
+	err := metrics.ObserveRedisLatency("Read", func() error {
+		return s.client.Get(ctx, key).Scan(data)
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -29,11 +32,13 @@ func (s *redisSessionStore) Read(ctx context.Context, key string) (*Data, error)
 }
 
 func (s *redisSessionStore) Write(ctx context.Context, key string, value *Data, expiration time.Duration) error {
-	status := s.client.Set(ctx, key, value, expiration)
-	return status.Err()
+	return metrics.ObserveRedisLatency("Write", func() error {
+		return s.client.Set(ctx, key, value, expiration).Err()
+	})
 }
 
 func (s *redisSessionStore) Delete(ctx context.Context, keys ...string) error {
-	status := s.client.Del(ctx, keys...)
-	return status.Err()
+	return metrics.ObserveRedisLatency("Delete", func() error {
+		return s.client.Del(ctx, keys...).Err()
+	})
 }

From 345691eb08a33bb6171dbf9e3b0d8723f2b6ea5f Mon Sep 17 00:00:00 2001
From: Morten Lied Johansen
Date: Wed, 29 Sep 2021 11:24:31 +0200
Subject: [PATCH 2/3] Starting on a dashboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sindre Rødseth Hansen
---
 hack/dashboard.yaml | 96 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 hack/dashboard.yaml

diff --git a/hack/dashboard.yaml b/hack/dashboard.yaml
new file mode 100644
index 0000000..5d1f75f
--- /dev/null
+++ b/hack/dashboard.yaml
@@ -0,0 +1,96 @@
+title: Wonderwall
+editable: true
+tags: [generated, yaml]
+auto_refresh: 1m
+time: ["now-24h", "now"]
+timezone: default # valid values are: utc, browser, default
+
+# Render to JSON using https://github.com/K-Phoen/grabana v0.17.0 or newer
+# Import into Grafana using UI (remember to select folder)
+
+variables:
+  - custom:
+      name: env
+      default: dev
+      values_map:
+        dev: dev
+        prod: prod
+  - datasource:
+      name: ds
+      type: prometheus
+      regex: $env-(gcp|fss|sbs)
+      include_all: true
+      hide: variable
+  - query:
+      name: redis_op
+      label: Redis Operation
+      datasource: $env-gcp
+      request: "label_values(wonderwall_redis_latency_bucket, operation)"
+      include_all: true
+      default_all: true
+      hide: variable
+
+rows:
+  - name: Resource usage
+    collapse: true
+    panels:
+      - graph:
+          title: Memory usage - $ds
+          datasource: $ds
+          repeat: ds
+          transparent: true
+          span: 4
+          targets:
+            - prometheus:
+                query: avg(kube_pod_container_resource_limits{container="wonderwall",resource="memory"}) by (namespace)
+                legend: "limits in {{ namespace }}"
+            - prometheus:
+                query: avg(kube_pod_container_resource_requests{container="wonderwall",resource="memory"}) by (namespace)
+                legend: "requests in {{ namespace }}"
+            - prometheus:
+                query: sum(container_memory_working_set_bytes{container="wonderwall"}) by (pod, namespace)
+                legend: "working set {{ pod }} in {{ namespace }}"
+            - prometheus:
+                query: sum(container_memory_usage_bytes{container="wonderwall"}) by (pod, namespace)
+                legend: "Resident set size {{ pod }} in {{ namespace }}"
+      - graph:
+          title: CPU usage - $ds
+          datasource: $ds
+          repeat: ds
+          transparent: true
+          span: 4
+          targets:
+            - prometheus:
+                query: avg(kube_pod_container_resource_limits{container="wonderwall",resource="cpu"}) by (namespace)
+                legend: "limits in {{ namespace }}"
+            - prometheus:
+                query: avg(kube_pod_container_resource_requests{container="wonderwall",resource="cpu"}) by (namespace)
+                legend: "requests in {{ namespace }}"
+            - prometheus:
+                query: sum(irate(container_cpu_usage_seconds_total{container="wonderwall"}[2m])) by (pod, namespace)
+                legend: "{{ pod }} in {{ namespace }}"
+  - name: Redis Latency - $redis_op
+    repeat_for: redis_op
+    collapse: true
+    panels:
+      - heatmap:
+          # Must be done manually in Grafana after import: Set max datapoints to 25
+          title: $ds
+          datasource: $ds
+          repeat: ds
+          data_format: time_series_buckets
+          hide_zero_buckets: true
+          transparent: true
+          span: 4
+          tooltip:
+            show: true
+            showhistogram: false
+            decimals: 0
+          yaxis:
+            unit: "dtdurations"
+            decimals: 0
+          targets:
+            - prometheus:
+                query: sum(increase(wonderwall_redis_latency_bucket{operation="$redis_op"}[$__interval])) by (le)
+                legend: "{{ le }}"
+                format: heatmap

From fb6dc12a9acd77def6ce8737eb0bae0b6809a2f9 Mon Sep 17 00:00:00 2001
From: Morten Lied Johansen
Date: Wed, 29 Sep 2021 13:56:59 +0200
Subject: [PATCH 3/3] Only in gcp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Trong Huu Nguyen
Co-authored-by: Sindre Rødseth Hansen
Co-authored-by: Terje Sannum
---
 hack/dashboard.yaml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hack/dashboard.yaml b/hack/dashboard.yaml
index 5d1f75f..3443d77 100644
--- a/hack/dashboard.yaml
+++ b/hack/dashboard.yaml
@@ -18,7 +18,7 @@ variables:
   - datasource:
       name: ds
       type: prometheus
-      regex: $env-(gcp|fss|sbs)
+      regex: $env-gcp
       include_all: true
       hide: variable
   - query:
@@ -32,12 +32,11 @@ variables:
 
 rows:
   - name: Resource usage
-    collapse: true
+    collapse: false
     panels:
       - graph:
           title: Memory usage - $ds
           datasource: $ds
-          repeat: ds
           transparent: true
           span: 4
           targets:
@@ -56,7 +55,6 @@ rows:
       - graph:
           title: CPU usage - $ds
           datasource: $ds
-          repeat: ds
           transparent: true
           span: 4
           targets: