From 5adbcd5189db08ab33a9bd815fc3698e25903bbb Mon Sep 17 00:00:00 2001 From: Stefan Prodan Date: Fri, 28 Sep 2018 13:28:12 +0300 Subject: [PATCH] Update CRD with req duration metric --- Makefile | 2 + README.md | 72 +++++++++++++++++++++++-------- artifacts/rollouts/podinfo.yaml | 19 ++++++-- artifacts/steerer/crd.yaml | 22 +++++----- artifacts/steerer/deployment.yaml | 2 +- chart/steerer/Chart.yaml | 2 +- chart/steerer/templates/crd.yaml | 22 +++++----- chart/steerer/values.yaml | 2 +- 8 files changed, 98 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index 45a46e93..b5d26540 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,8 @@ VERSION?=$(shell grep 'VERSION' pkg/version/version.go | awk '{ print $$4 }' | t VERSION_MINOR:=$(shell grep 'VERSION' pkg/version/version.go | awk '{ print $$4 }' | tr -d '"' | rev | cut -d'.' -f2- | rev) PATCH:=$(shell grep 'VERSION' pkg/version/version.go | awk '{ print $$4 }' | tr -d '"' | awk -F. '{print $$NF}') +run: + go run cmd/steerer/* -kubeconfig=$$HOME/.kube/config -log-level=debug -metrics-server=https://prometheus.istio.weavedx.com build: docker build -t stefanprodan/steerer:$(TAG) . 
-f Dockerfile diff --git a/README.md b/README.md index ad4c22f2..761d4a3e 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,8 @@ Gated rollout stages: * check canary HTTP success rate * halt rollout if percentage is under the specified threshold * increase canary traffic wight by 10% till it reaches 100% - * halt rollout while canary success rate is under the threshold + * halt rollout while canary request success rate is under the threshold + * halt rollout while canary request duration is over the threshold * halt rollout if the primary or canary deployment becomes unhealthy * halt rollout while canary deployment is being scaled up/down by HPA * promote canary to primary @@ -118,17 +119,25 @@ spec: host: podinfo-canary virtualService: name: podinfo - # used to increment the canary weight + # canary increment step + # percentage (0-100) weight: 10 - metric: - type: counter - name: istio_requests_total - interval: 1m - # success rate percentage used in canary analysis + metrics: + - name: istio_requests_total + # minimum req success rate (non 5xx responses) + # percentage (0-100) threshold: 99 + interval: 1m + - name: istio_request_duration_seconds_bucket + # maximum req duration P99 + # milliseconds + threshold: 500 + interval: 1m ``` -The canary analysis is using the following promql query to determine the HTTP success rate percentage: +The canary analysis is using the following PromQL queries: + +HTTP requests success rate percentage: ```sql sum( @@ -153,6 +162,22 @@ sum( ) ``` +HTTP requests milliseconds duration P99: + +```sql +histogram_quantile(0.99, + sum( + irate( + istio_request_duration_seconds_bucket{ + reporter="destination", + destination_workload=~"$workload", + destination_workload_namespace=~"$namespace" + }[$interval] + ) + ) by (le) +) +``` + ### Example Create a test namespace with Istio sidecard injection enabled: @@ -200,16 +225,14 @@ Events: Normal Synced 3m steerer Advance rollout podinfo.test weight 10 Normal Synced 3m steerer Advance 
rollout podinfo.test weight 20 Normal Synced 2m steerer Advance rollout podinfo.test weight 30 + Warning Synced 3m steerer Halt rollout podinfo.test request duration 2.525s > 500ms + Warning Synced 3m steerer Halt rollout podinfo.test request duration 1.567s > 500ms + Warning Synced 3m steerer Halt rollout podinfo.test request duration 823ms > 500ms Normal Synced 2m steerer Advance rollout podinfo.test weight 40 Normal Synced 2m steerer Advance rollout podinfo.test weight 50 - Normal Synced 2m steerer Advance rollout podinfo.test weight 60 - Normal Synced 2m steerer Advance rollout podinfo.test weight 60 - Warning Synced 2m steerer Halt rollout podinfo.test success rate 88.89% < 99% - Warning Synced 2m steerer Halt rollout podinfo.test success rate 82.86% < 99% - Warning Synced 1m steerer Halt rollout podinfo.test success rate 80.49% < 99% - Warning Synced 1m steerer Halt rollout podinfo.test success rate 82.98% < 99% - Warning Synced 1m steerer Halt rollout podinfo.test success rate 83.33% < 99% - Warning Synced 1m steerer Halt rollout podinfo.test success rate 82.22% < 99% + Normal Synced 1m steerer Advance rollout podinfo.test weight 60 + Warning Synced 1m steerer Halt rollout podinfo.test success rate 82.33% < 99% + Warning Synced 1m steerer Halt rollout podinfo.test success rate 87.22% < 99% Warning Synced 1m steerer Halt rollout podinfo.test success rate 94.74% < 99% Normal Synced 1m steerer Advance rollout podinfo.test weight 70 Normal Synced 55s steerer Advance rollout podinfo.test weight 80 @@ -220,11 +243,24 @@ Events: Normal Synced 5s steerer Promotion complete! Scaling down podinfo-canary.test ``` -During the rollout you can generate HTTP 500 errors to test if Steerer pauses the rollout: +During the rollout you can generate HTTP 500 errors and high latency to test if Steerer pauses the rollout. 
+ +Create a tester pod and exec into it: ```bash -watch -n 1 curl https:///status/500 +kubectl -n test run tester --image=quay.io/stefanprodan/podinfo:1.2.1 -- ./podinfo --port=9898 +kubectl -n test exec -it tester-xx-xx sh ``` +Generate HTTP 500 errors: +```bash +watch curl http://podinfo-canary:9898/status/500 +``` + +Generate latency: + +```bash +watch curl http://podinfo-canary:9898/delay/1 +``` diff --git a/artifacts/rollouts/podinfo.yaml b/artifacts/rollouts/podinfo.yaml index 31a35afb..a9f4e915 100644 --- a/artifacts/rollouts/podinfo.yaml +++ b/artifacts/rollouts/podinfo.yaml @@ -1,4 +1,7 @@ # monitor events: watch "kubectl -n test describe rollout/podinfo | sed -n 35,1000p" +# run tester: kubectl run -n test tester --image=quay.io/stefanprodan/podinfo:1.2.1 -- ./podinfo --port=9898 +# generate latency: watch curl http://podinfo-canary:9898/delay/1 +# generate errors: watch curl http://podinfo-canary:9898/status/500 apiVersion: apps.weave.works/v1beta1 kind: Rollout metadata: @@ -17,9 +20,17 @@ spec: host: podinfo-canary virtualService: name: podinfo + # canary increment step + # percentage (0-100) weight: 10 - metric: - type: counter - name: istio_requests_total - interval: 1m + metrics: + - name: istio_requests_total + # minimum req success rate (non 5xx responses) + # percentage (0-100) threshold: 99 + interval: 1m + - name: istio_request_duration_seconds_bucket + # maximum req duration P99 + # milliseconds + threshold: 500 + interval: 1m diff --git a/artifacts/steerer/crd.yaml b/artifacts/steerer/crd.yaml index 15d87a50..ef9ffe1a 100644 --- a/artifacts/steerer/crd.yaml +++ b/artifacts/steerer/crd.yaml @@ -43,14 +43,16 @@ spec: type: string weight: type: number - metric: + metrics: + type: array properties: - type: - type: string - name: - type: string - interval: - type: string - pattern: "^[0-9]+(m)" - threshold: - type: number + items: + type: object + properties: + name: + type: string + interval: + type: string + pattern: "^[0-9]+(m)" + threshold: + 
type: number diff --git a/artifacts/steerer/deployment.yaml b/artifacts/steerer/deployment.yaml index 7d9a5a36..17a35f8b 100644 --- a/artifacts/steerer/deployment.yaml +++ b/artifacts/steerer/deployment.yaml @@ -22,7 +22,7 @@ spec: serviceAccountName: steerer containers: - name: steerer - image: stefanprodan/steerer:0.0.1-beta.5 + image: stefanprodan/steerer:0.0.1-beta.9 imagePullPolicy: Always ports: - name: http diff --git a/chart/steerer/Chart.yaml b/chart/steerer/Chart.yaml index 68f6b3ac..2ff41883 100644 --- a/chart/steerer/Chart.yaml +++ b/chart/steerer/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v1 name: steerer version: 0.0.1 -appVersion: 0.0.1-beta.5 +appVersion: 0.0.1-beta.9 description: Steerer is a Kubernetes operator that automates the promotion of canary deployments using Istio routing for traffic shifting and Prometheus metrics for canary analysis. diff --git a/chart/steerer/templates/crd.yaml b/chart/steerer/templates/crd.yaml index 38f5edb5..53427b46 100644 --- a/chart/steerer/templates/crd.yaml +++ b/chart/steerer/templates/crd.yaml @@ -44,15 +44,17 @@ spec: type: string weight: type: number - metric: + metrics: + type: array properties: - type: - type: string - name: - type: string - interval: - type: string - pattern: "^[0-9]+(m)" - threshold: - type: number + items: + type: object + properties: + name: + type: string + interval: + type: string + pattern: "^[0-9]+(m)" + threshold: + type: number {{- end }} diff --git a/chart/steerer/values.yaml b/chart/steerer/values.yaml index 9963cafa..934a9aed 100644 --- a/chart/steerer/values.yaml +++ b/chart/steerer/values.yaml @@ -2,7 +2,7 @@ image: repository: stefanprodan/steerer - tag: 0.0.1-beta.5 + tag: 0.0.1-beta.9 pullPolicy: IfNotPresent controlLoopInterval: "10s"