Refactor canary analysis

- move CRD metrics and weight setting to canary analysis
- add max weight to CRD spec
This commit is contained in:
Stefan Prodan
2018-09-29 11:08:40 +03:00
parent 6143ebce48
commit 7993ae100b
18 changed files with 175 additions and 96 deletions

View File

@@ -2,6 +2,8 @@
# run tester: kubectl run -n test tester --image=quay.io/stefanprodan/podinfo:1.2.1 -- ./podinfo --port=9898
# generate latency: watch curl http://podinfo-canary:9898/delay/1
# generate errors: watch curl http://podinfo-canary:9898/status/500
# run load test: kubectl run -n test -it --rm --restart=Never hey --image=stefanprodan/loadtest -- sh
# generate load: hey -z 2m -h2 -m POST -d '{test: 1}' -c 10 -q 5 http://podinfo:9898/api/echo
apiVersion: apps.weave.works/v1beta1
kind: Rollout
metadata:
@@ -12,25 +14,29 @@ metadata:
namespace: test
spec:
targetKind: Deployment
virtualService:
name: podinfo
primary:
name: podinfo
host: podinfo
canary:
name: podinfo-canary
host: podinfo-canary
virtualService:
name: podinfo
canaryAnalysis:
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 100
# canary increment step
# percentage (0-100)
weight: 10
metrics:
- name: istio_requests_total
# minimum req success rate (non 5xx responses)
# percentage (0-100)
threshold: 99
interval: 1m
- name: istio_request_duration_seconds_bucket
# maximum req duration P99
# milliseconds
threshold: 500
interval: 1m
stepWeight: 10
metrics:
- name: istio_requests_total
# minimum req success rate (non 5xx responses)
# percentage (0-100)
threshold: 99
interval: 1m
- name: istio_request_duration_seconds_bucket
# maximum req duration P99
# milliseconds
threshold: 500
interval: 30s

View File

@@ -21,10 +21,18 @@ spec:
properties:
spec:
required:
- targetKind
- virtualService
- primary
- canary
- virtualService
- canaryAnalysis
properties:
targetKind:
type: string
virtualService:
properties:
name:
type: string
primary:
properties:
name:
@@ -37,22 +45,23 @@ spec:
type: string
host:
type: string
virtualService:
canaryAnalysis:
properties:
name:
type: string
weight:
maxWeight:
type: number
metrics:
type: array
properties:
items:
type: object
stepWeight:
type: number
metrics:
type: array
properties:
name:
type: string
interval:
type: string
pattern: "^[0-9]+(m)"
threshold:
type: number
items:
type: object
properties:
name:
type: string
interval:
type: string
# accept minute and second intervals (e.g. 1m, 30s)
pattern: "^[0-9]+(m|s)"
threshold:
type: number

View File

@@ -22,7 +22,7 @@ spec:
serviceAccountName: steerer
containers:
- name: steerer
image: stefanprodan/steerer:0.0.1-beta.12
image: stefanprodan/steerer:0.0.1-rc.1
imagePullPolicy: Always
ports:
- name: http

View File

@@ -57,7 +57,7 @@ spec:
- http
- localhost:9898/readyz
failureThreshold: 3
periodSeconds: 10
periodSeconds: 3
successThreshold: 1
timeoutSeconds: 2
resources:
@@ -65,7 +65,7 @@ spec:
cpu: 1000m
memory: 256Mi
requests:
cpu: 10m
cpu: 100m
memory: 16Mi
volumeMounts:
- mountPath: /data

View File

@@ -3,16 +3,13 @@ kind: Deployment
metadata:
name: podinfo
namespace: test
annotations:
apps.weave.works/progressive: "true"
labels:
app: podinfo
spec:
replicas: 2
replicas: 1
strategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 1
maxUnavailable: 0
type: RollingUpdate
selector:
matchLabels:
@@ -48,6 +45,7 @@ spec:
- check
- http
- localhost:9898/healthz
initialDelaySeconds: 5
failureThreshold: 3
periodSeconds: 10
successThreshold: 1
@@ -59,11 +57,15 @@ spec:
- check
- http
- localhost:9898/readyz
initialDelaySeconds: 5
failureThreshold: 3
periodSeconds: 10
periodSeconds: 3
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: 2000m
memory: 512Mi
requests:
cpu: 1m
memory: 16Mi
cpu: 10m
memory: 64Mi

View File

@@ -14,6 +14,8 @@ spec:
- type: Resource
resource:
name: cpu
# scale up if usage is above
# 99% of the requested CPU (100m)
targetAverageUtilization: 99
- type: Resource
resource:

View File

@@ -0,0 +1,19 @@
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
metadata:
name: podinfo
namespace: test
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
minReplicas: 2
maxReplicas: 4
metrics:
- type: Resource
resource:
name: cpu
# scale up if usage is above
# 99% of the requested CPU (100m)
targetAverageUtilization: 99

View File

@@ -1,13 +1,10 @@
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
annotations:
apps.weave.works/progressive-revision: ""
apps.weave.works/progressive-status: ""
labels:
app: podinfo
name: podinfo
namespace: test
labels:
app: podinfo
spec:
gateways:
- public-gateway.istio-system.svc.cluster.local
@@ -26,4 +23,7 @@ spec:
port:
number: 9898
weight: 0
timeout: 30s
timeout: 10s
retries:
attempts: 3
perTryTimeout: 2s

View File

@@ -1,5 +1,5 @@
apiVersion: v1
name: steerer
version: 0.0.1
appVersion: 0.0.1-beta.12
appVersion: 0.0.1-rc.1
description: Steerer is a Kubernetes operator that automates the promotion of canary deployments using Istio routing for traffic shifting and Prometheus metrics for canary analysis.

View File

@@ -22,10 +22,18 @@ spec:
properties:
spec:
required:
- targetKind
- virtualService
- primary
- canary
- virtualService
- canaryAnalysis
properties:
targetKind:
type: string
virtualService:
properties:
name:
type: string
primary:
properties:
name:
@@ -38,23 +46,23 @@ spec:
type: string
host:
type: string
virtualService:
canaryAnalysis:
properties:
name:
type: string
weight:
maxWeight:
type: number
metrics:
type: array
properties:
items:
type: object
stepWeight:
type: number
metrics:
type: array
properties:
name:
type: string
interval:
type: string
pattern: "^[0-9]+(m)"
threshold:
type: number
items:
type: object
properties:
name:
type: string
interval:
type: string
# accept minute and second intervals (e.g. 1m, 30s)
pattern: "^[0-9]+(m|s)"
threshold:
type: number
{{- end }}

View File

@@ -2,7 +2,7 @@
image:
repository: stefanprodan/steerer
tag: 0.0.1-beta.12
tag: 0.0.1-rc.1
pullPolicy: IfNotPresent
controlLoopInterval: "10s"

View File

@@ -2,14 +2,14 @@ apiVersion: v1
entries:
steerer:
- apiVersion: v1
appVersion: 0.0.1-beta.12
created: 2018-09-28T16:49:49.90177054+03:00
appVersion: 0.0.1-rc.1
created: 2018-09-29T11:08:25.598356915+03:00
description: Steerer is a Kubernetes operator that automates the promotion of
canary deployments using Istio routing for traffic shifting and Prometheus metrics
for canary analysis.
digest: 792a2bf520cac415a2e2a4a3b2b7142c9fb09a4737ea7135146bd5796c5f9d94
digest: af14826edae5afcda1b2afebf17e3b8007f1d2a35e65093ab32e786a6599b201
name: steerer
urls:
- https://stefanprodan.github.io/steerer/steerer-0.0.1.tgz
version: 0.0.1
generated: 2018-09-28T16:49:49.900919976+03:00
generated: 2018-09-29T11:08:25.597473362+03:00

Binary file not shown.

View File

@@ -37,8 +37,8 @@ type RolloutSpec struct {
TargetKind string `json:"targetKind"`
Primary Target `json:"primary"`
Canary Target `json:"canary"`
CanaryAnalysis CanaryAnalysis `json:"canaryAnalysis"`
VirtualService VirtualService `json:"virtualService"`
Metrics []Metric `json:"metrics"`
}
type Target struct {
@@ -47,8 +47,13 @@ type Target struct {
}
type VirtualService struct {
Name string `json:"name"`
Weight int `json:"weight"`
Name string `json:"name"`
}
// CanaryAnalysis groups the traffic-shifting limits and the metric checks
// that are applied while a canary is being rolled out.
type CanaryAnalysis struct {
// MaxWeight is the max traffic percentage (0-100) routed to the canary.
// The controller falls back to 100 when this value is not greater than zero.
MaxWeight int `json:"maxWeight"`
// StepWeight is the canary increment step, as a percentage (0-100): it is
// subtracted from the primary route weight and added to the canary route
// weight each time the rollout advances.
StepWeight int `json:"stepWeight"`
// Metrics lists the metric checks evaluated against the canary.
Metrics []Metric `json:"metrics"`
}
type Metric struct {

View File

@@ -24,6 +24,27 @@ import (
runtime "k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto copies the receiver into out. in must be non-nil.
// NOTE: the deepcopy functions in this file are marked as autogenerated;
// regenerate them rather than hand-editing when the type changes.
func (in *CanaryAnalysis) DeepCopyInto(out *CanaryAnalysis) {
	// Copy every field by value first; the Metrics slice header copied here
	// still shares its backing array with the receiver.
	*out = *in
	if in.Metrics == nil {
		return
	}
	// Give out its own backing array so later writes do not leak into in.
	out.Metrics = make([]Metric, len(in.Metrics))
	copy(out.Metrics, in.Metrics)
}
// DeepCopy returns a newly allocated CanaryAnalysis populated through
// DeepCopyInto, or nil when the receiver is nil.
// NOTE: the deepcopy functions in this file are marked as autogenerated;
// regenerate them rather than hand-editing when the type changes.
func (in *CanaryAnalysis) DeepCopy() *CanaryAnalysis {
	if in == nil {
		return nil
	}
	var clone CanaryAnalysis
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Metric) DeepCopyInto(out *Metric) {
*out = *in
@@ -106,12 +127,8 @@ func (in *RolloutSpec) DeepCopyInto(out *RolloutSpec) {
*out = *in
out.Primary = in.Primary
out.Canary = in.Canary
in.CanaryAnalysis.DeepCopyInto(&out.CanaryAnalysis)
out.VirtualService = in.VirtualService
if in.Metrics != nil {
in, out := &in.Metrics, &out.Metrics
*out = make([]Metric, len(*in))
copy(*out, *in)
}
return
}

View File

@@ -180,8 +180,8 @@ func (c *Controller) syncHandler(key string) error {
return nil
}
c.logger.Infof("Adding %s.%s to cache", rollout.Name, rollout.Namespace)
c.rollouts.Store(fmt.Sprintf("%s.%s", rollout.Name, rollout.Namespace), rollout)
c.logger.Infof("Synced %s", key)
return nil
}
@@ -206,6 +206,11 @@ func (c *Controller) recordEventErrorf(r *rolloutv1.Rollout, template string, ar
c.recorder.Event(r, corev1.EventTypeWarning, "Synced", fmt.Sprintf(template, args...))
}
// recordEventWarningf writes the formatted message to the controller logger
// at info level and records it on the rollout r as a warning-type event with
// the reason "Synced".
func (c *Controller) recordEventWarningf(r *rolloutv1.Rollout, template string, args ...interface{}) {
	c.logger.Infof(template, args...)

	message := fmt.Sprintf(template, args...)
	c.recorder.Event(r, corev1.EventTypeWarning, "Synced", message)
}
func checkCustomResourceType(obj interface{}, logger *zap.SugaredLogger) (rolloutv1.Rollout, bool) {
var roll *rolloutv1.Rollout
var ok bool

View File

@@ -32,10 +32,10 @@ func (c *Controller) advanceDeploymentRollout(name string, namespace string) {
return
}
// gate stage: check if primary deployment exists and is healthy
primary, ok := c.getDeployment(r, r.Spec.Primary.Name, r.Namespace)
if !ok {
return
// set max weight default value to 100%
maxWeight := 100
if r.Spec.CanaryAnalysis.MaxWeight > 0 {
maxWeight = r.Spec.CanaryAnalysis.MaxWeight
}
// gate stage: check if canary deployment exists and is healthy
@@ -44,6 +44,12 @@ func (c *Controller) advanceDeploymentRollout(name string, namespace string) {
return
}
// gate stage: check if primary deployment exists and is healthy
primary, ok := c.getDeployment(r, r.Spec.Primary.Name, r.Namespace)
if !ok {
return
}
// gate stage: check if virtual service exists
// and if it contains weighted destination routes to the primary and canary services
vs, primaryRoute, canaryRoute, ok := c.getVirtualService(r)
@@ -67,15 +73,15 @@ func (c *Controller) advanceDeploymentRollout(name string, namespace string) {
}
// routing stage: increase canary traffic percentage
if canaryRoute.Weight != 100 {
primaryRoute.Weight -= r.Spec.VirtualService.Weight
if primaryRoute.Weight > 100 {
primaryRoute.Weight = 100
}
canaryRoute.Weight += r.Spec.VirtualService.Weight
if canaryRoute.Weight < maxWeight {
primaryRoute.Weight -= r.Spec.CanaryAnalysis.StepWeight
if primaryRoute.Weight < 0 {
primaryRoute.Weight = 0
}
canaryRoute.Weight += r.Spec.CanaryAnalysis.StepWeight
if primaryRoute.Weight > 100 {
primaryRoute.Weight = 100
}
if ok := c.updateVirtualServiceRoutes(r, vs, primaryRoute, canaryRoute); !ok {
return
@@ -84,7 +90,7 @@ func (c *Controller) advanceDeploymentRollout(name string, namespace string) {
c.recordEventInfof(r, "Advance rollout %s.%s weight %v", r.Name, r.Namespace, canaryRoute.Weight)
// promotion stage: override primary.template.spec with the canary spec
if canaryRoute.Weight == 100 {
if canaryRoute.Weight == maxWeight {
c.recordEventInfof(r, "Copying %s.%s template spec to %s.%s",
canary.GetName(), canary.Namespace, primary.GetName(), primary.Namespace)
@@ -175,7 +181,7 @@ func (c *Controller) getDeployment(r *rolloutv1.Rollout, name string, namespace
}
if msg, healthy := getDeploymentStatus(dep); !healthy {
c.logger.Infof("Halt rollout for %s.%s %s", dep.GetName(), dep.Namespace, msg)
c.recordEventWarningf(r, "Halt rollout %s.%s %s", dep.GetName(), dep.Namespace, msg)
return nil, false
}
@@ -187,7 +193,7 @@ func (c *Controller) getDeployment(r *rolloutv1.Rollout, name string, namespace
}
func (c *Controller) checkDeploymentMetrics(r *rolloutv1.Rollout) bool {
for _, metric := range r.Spec.Metrics {
for _, metric := range r.Spec.CanaryAnalysis.Metrics {
if metric.Name == "istio_requests_total" {
val, err := c.getDeploymentCounter(r.Spec.Canary.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
@@ -195,7 +201,7 @@ func (c *Controller) checkDeploymentMetrics(r *rolloutv1.Rollout) bool {
return false
}
if float64(metric.Threshold) > val {
c.recordEventErrorf(r, "Halt rollout %s.%s success rate %.2f%% < %v%%",
c.recordEventWarningf(r, "Halt rollout %s.%s success rate %.2f%% < %v%%",
r.Name, r.Namespace, val, metric.Threshold)
return false
}
@@ -209,7 +215,7 @@ func (c *Controller) checkDeploymentMetrics(r *rolloutv1.Rollout) bool {
}
t := time.Duration(metric.Threshold) * time.Millisecond
if val > t {
c.recordEventErrorf(r, "Halt rollout %s.%s request duration %v > %v",
c.recordEventWarningf(r, "Halt rollout %s.%s request duration %v > %v",
r.Name, r.Namespace, val, t)
return false
}

View File

@@ -1,4 +1,4 @@
package version
var VERSION = "0.0.1-beta.12"
var VERSION = "0.0.1-rc.1"
var REVISION = "unknown"