From 0e81b5f4d285ffecd411f24646a7912dc7bb03aa Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Wed, 26 Feb 2020 10:52:25 +0200 Subject: [PATCH 01/13] Update docs for Flagger v1beta1 API --- CONTRIBUTING.md | 8 +- README.md | 54 +- docs/gitbook/SUMMARY.md | 2 + docs/gitbook/dev-guide.md | 85 +- docs/gitbook/faq.md | 327 ++++---- docs/gitbook/how-it-works.md | 853 ++------------------ docs/gitbook/usage/alerting.md | 95 ++- docs/gitbook/usage/deployment-strategies.md | 167 +++- docs/gitbook/usage/metrics.md | 137 ++++ docs/gitbook/usage/webhooks.md | 401 +++++++++ 10 files changed, 1090 insertions(+), 1039 deletions(-) create mode 100644 docs/gitbook/usage/metrics.md create mode 100644 docs/gitbook/usage/webhooks.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9536a644..4dde4592 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,12 +17,12 @@ contribution. ## Chat The project uses Slack: To join the conversation, simply join the -[Weave community](https://slack.weave.works/) Slack workspace. +[Weave community](https://slack.weave.works/) Slack workspace #flagger channel. ## Getting Started - Fork the repository on GitHub -- If you want to contribute as a developer, continue reading this document for further instructions +- If you want to contribute as a developer, read [Flagger Development Guide](https://docs.flagger.app/dev-guide) - If you have questions, concerns, get stuck or need a hand, let us know on the Slack channel. We are happy to help and look forward to having you part of the team. No matter in which capacity. @@ -59,7 +59,7 @@ get asked to resubmit the PR or divide the changes into more than one PR. ### Format of the Commit Message -For Flux we prefer the following rules for good commit messages: +For Flagger we prefer the following rules for good commit messages: - Limit the subject to 50 characters and write as the continuation of the sentence "If applied, this commit will ..." 
@@ -69,4 +69,4 @@ For Flux we prefer the following rules for good commit messages: The [following article](https://chris.beams.io/posts/git-commit/#seven-rules) has some more helpful advice on documenting your work. -This doc is adapted from the [Weaveworks Flux](https://github.com/weaveworks/flux/blob/master/CONTRIBUTING.md) +This doc is adapted from [FluxCD](https://github.com/fluxcd/flux/blob/master/CONTRIBUTING.md). diff --git a/README.md b/README.md index d1fbca2f..2147be3c 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ spec: kind: HorizontalPodAutoscaler name: podinfo service: - # service name (optional) + # service name (defaults to targetRef.name) name: podinfo # ClusterIP port number port: 9898 @@ -95,6 +95,9 @@ spec: targetPort: 9898 # port name can be http or grpc (default http) portName: http + # add all the other container ports + # to the ClusterIP services (default false) + portDiscovery: true # HTTP match conditions (optional) match: - uri: @@ -118,36 +121,57 @@ spec: # canary increment step # percentage (0-100) stepWeight: 5 - # Istio Prometheus checks + # validation (optional) metrics: - # builtin checks - name: request-success-rate + # builtin Prometheus check # minimum req success rate (non 5xx responses) # percentage (0-100) threshold: 99 interval: 1m - name: request-duration + # builtin Prometheus check # maximum req duration P99 # milliseconds threshold: 500 interval: 30s - # custom check - - name: "kafka lag" - threshold: 100 - query: | - avg_over_time( - kafka_consumergroup_lag{ - consumergroup=~"podinfo-consumer-.*", - topic="podinfo" - }[1m] - ) + - name: "database connections" + # custom Prometheus check + templateRef: + name: db-connections + thresholdRange: + min: 2 + max: 100 + interval: 1m # testing (optional) webhooks: - - name: load-test + - name: "conformance test" + type: pre-rollout + url: http://flagger-helmtester.test/ + timeout: 5m + metadata: + type: "helmv3" + cmd: "test run podinfo -n test" + - name: "load test" 
+ type: rollout url: http://flagger-loadtester.test/ - timeout: 5s metadata: cmd: "hey -z 1m -q 10 -c 2 http://podinfo.test:9898/" + # alerting (optional) + alerts: + - name: "dev team Slack" + severity: error + providerRef: + name: dev-slack + namespace: flagger + - name: "qa team Discord" + severity: warn + providerRef: + name: qa-discord + - name: "on-call MS Teams" + severity: info + providerRef: + name: on-call-msteams ``` For more details on how the canary analysis and promotion works please [read the docs](https://docs.flagger.app/how-it-works). diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 02655e1e..6c50a1b8 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -15,6 +15,8 @@ ## Usage * [Deployment Strategies](usage/deployment-strategies.md) +* [Metrics Analysis](usage/metrics.md) +* [Webhooks](usage/webhooks.md) * [Alerting](usage/alerting.md) * [Monitoring](usage/monitoring.md) diff --git a/docs/gitbook/dev-guide.md b/docs/gitbook/dev-guide.md index fd54e3eb..c082df0a 100644 --- a/docs/gitbook/dev-guide.md +++ b/docs/gitbook/dev-guide.md @@ -1,54 +1,70 @@ -# Development guide +# Flagger Development Guide This document describes how to build, test and run Flagger from source. -## Setup dev environment +### Setup dev environment Flagger is written in Go and uses Go modules for dependency management. On your dev machine install the following tools: +* go >= 1.13 +* git >= 2.20 +* bash >= 5.0 +* make >= 3.81 +* kubectl >= 1.16 +* kustomize >= 3.5 +* helm >= 3.0 +* docker >= 19.03 -* go >= 1.13 -* git >= 2.20 -* bash >= 5.0 -* make >= 3.81 -* kubectl >= 1.16 -* kustomize >= 3.5 -* helm >= 3.0 -* docker >= 19.03 +You'll also need a Kubernetes cluster for testing Flagger. +You can use Minikube, Kind, Docker desktop or any remote cluster +(AKS/EKS/GKE/etc) Kubernetes version 1.14 or newer. -You'll also need a Kubernetes cluster for testing Flagger. 
You can use Minikube, Kind, Docker desktop or any remote cluster \(AKS/EKS/GKE/etc\) Kubernetes version 1.14 or newer. +To start contributing to Flagger, fork the [repository](https://github.com/weaveworks/flagger) on GitHub. -## Build - -To start contributing to Flagger, fork the repository and clone it locally: +Create a dir inside your `GOPATH`: ```bash -git clone https://github.com//flagger +mkdir -p $GOPATH/src/github.com/weaveworks +``` + +Clone your fork: + +```bash +cd $GOPATH/src/github.com/weaveworks +git clone https://github.com/YOUR_USERNAME/flagger cd flagger ``` +Set Flagger repository as upstream: + +```bash +git remote add upstream https://github.com/weaveworks/flagger.git +``` + +Sync your fork regularly to keep it up-to-date with upstream: + +```bash +git fetch upstream +git checkout master +git merge upstream/master +``` + +### Build + Download Go modules: ```bash go mod download ``` -Build Flagger binary: - -```bash -CGO_ENABLED=0 go build -o ./bin/flagger ./cmd/flagger/ -``` - Build Flagger container image: ```bash make build ``` -## Unit testing - -Make a change to the source code and run the linter and unit tests: +Run unit tests: ```bash make test @@ -66,9 +82,10 @@ If you made changes to `pkg/apis` regenerate Kubernetes client sets with: ./hack/update-codegen.sh ``` -## Manual testing +### Manual testing -Install a service mesh and/or an ingress controller on your cluster and deploy Flagger using one of the install options [listed here](https://docs.flagger.app/install/flagger-install-on-kubernetes). +Install a service mesh and/or an ingress controller on your cluster and deploy Flagger +using one of the install options [listed here](https://docs.flagger.app/install/flagger-install-on-kubernetes). 
If you made changes to the CRDs, apply your local copy with: @@ -76,7 +93,7 @@ If you made changes to the CRDs, apply your local copy with: kubectl apply -f artifacts/flagger/crd.yaml ``` -Shutdown the Flagger instance installed on your cluster \(replace the namespace with your mesh/ingress one\): +Shutdown the Flagger instance installed on your cluster (replace the namespace with your mesh/ingress one): ```bash kubectl -n istio-system scale deployment/flagger --replicas=0 @@ -112,9 +129,9 @@ kubectl -n istio-system set image deployment/flagger flagger=..svc.cluster.local` - selector `app=-primary` - * `-primary..svc.cluster.local` - selector `app=-primary` - * `-canary..svc.cluster.local` - selector `app=` -This ensures that traffic coming from a namespace outside the mesh to `podinfo.test:9898` will be routed to the latest stable release of your app. +This ensures that traffic coming from a namespace outside the mesh to `podinfo.test:9898` +will be routed to the latest stable release of your app. ```yaml apiVersion: v1 @@ -243,13 +113,16 @@ spec: targetPort: http ``` -The `podinfo-canary.test:9898` address is available only during the canary analysis and can be used for conformance testing or load testing. +The `podinfo-canary.test:9898` address is available only during the +canary analysis and can be used for conformance testing or load testing. -## Multiple ports +### Multiple ports **My application listens on multiple ports, how can I expose them inside the cluster?** -If port discovery is enabled, Flagger scans the deployment spec and extracts the containers ports excluding the port specified in the canary service and Envoy sidecar ports. \`These ports will be used when generating the ClusterIP services. +If port discovery is enabled, Flagger scans the deployment spec and extracts the containers +ports excluding the port specified in the canary service and Envoy sidecar ports. +`These ports will be used when generating the ClusterIP services. 
For a deployment that exposes two ports: @@ -291,7 +164,7 @@ spec: Both port `8080` and `9090` will be added to the ClusterIP services. -## Label selectors +### Label selectors **What labels selectors are supported by Flagger?** @@ -312,7 +185,8 @@ spec: app: podinfo ``` -Besides `app` Flagger supports `name` and `app.kubernetes.io/name` selectors. If you use a different convention you can specify your label with the `-selector-labels` flag. +Besides `app` Flagger supports `name` and `app.kubernetes.io/name` selectors. If you use a different +convention you can specify your label with the `-selector-labels` flag. **Is pod affinity and anti affinity supported?** @@ -347,13 +221,131 @@ spec: topologyKey: kubernetes.io/hostname ``` -## Istio routing +### Metrics + +**How does Flagger measure the request success rate and duration?** + +Flagger measures the request success rate and duration using Prometheus queries. + +**HTTP requests success rate percentage** + +Spec: + +```yaml + canaryAnalysis: + metrics: + - name: request-success-rate + # minimum req success rate (non 5xx responses) + # percentage (0-100) + threshold: 99 + interval: 1m +``` + +Istio query: + +```javascript +sum( + rate( + istio_requests_total{ + reporter="destination", + destination_workload_namespace=~"$namespace", + destination_workload=~"$workload", + response_code!~"5.*" + }[$interval] + ) +) +/ +sum( + rate( + istio_requests_total{ + reporter="destination", + destination_workload_namespace=~"$namespace", + destination_workload=~"$workload" + }[$interval] + ) +) +``` + +Envoy query (App Mesh, Contour or Gloo): + +```javascript +sum( + rate( + envoy_cluster_upstream_rq{ + kubernetes_namespace="$namespace", + kubernetes_pod_name=~"$workload", + envoy_response_code!~"5.*" + }[$interval] + ) +) +/ +sum( + rate( + envoy_cluster_upstream_rq{ + kubernetes_namespace="$namespace", + kubernetes_pod_name=~"$workload" + }[$interval] + ) +) +``` + +**HTTP requests milliseconds duration P99** + +Spec: + 
+```yaml + canaryAnalysis: + metrics: + - name: request-duration + # maximum req duration P99 + # milliseconds + threshold: 500 + interval: 1m +``` + +Istio query: + +```javascript +histogram_quantile(0.99, + sum( + irate( + istio_request_duration_seconds_bucket{ + reporter="destination", + destination_workload=~"$workload", + destination_workload_namespace=~"$namespace" + }[$interval] + ) + ) by (le) +) +``` + +Envoy query (App Mesh, Contour or Gloo): + +```javascript +histogram_quantile(0.99, + sum( + irate( + envoy_cluster_upstream_rq_time_bucket{ + kubernetes_pod_name=~"$workload", + kubernetes_namespace=~"$namespace" + }[$interval] + ) + ) by (le) +) +``` + +> **Note** that the metric interval should be lower or equal to the control loop interval. + +### Istio routing **How does Flagger interact with Istio?** -Flagger creates an Istio Virtual Service and Destination Rules based on the Canary service spec. The service configuration lets you expose an app inside or outside the mesh. You can also define traffic policies, HTTP match conditions, URI rewrite rules, CORS policies, timeout and retries. +Flagger creates an Istio Virtual Service and Destination Rules based on the Canary service spec. +The service configuration lets you expose an app inside or outside the mesh. +You can also define traffic policies, HTTP match conditions, URI rewrite rules, CORS policies, timeout and retries. -The following spec exposes the `frontend` workload inside the mesh on `frontend.test.svc.cluster.local:9898` and outside the mesh on `frontend.example.com`. You'll have to specify an Istio ingress gateway for external hosts. +The following spec exposes the `frontend` workload inside the mesh on `frontend.test.svc.cluster.local:9898` +and outside the mesh on `frontend.example.com`. You'll have to specify an Istio ingress gateway for external hosts. 
```yaml apiVersion: flagger.app/v1alpha3 @@ -487,9 +479,11 @@ spec: mode: DISABLE ``` -Flagger keeps in sync the virtual service and destination rules with the canary service spec. Any direct modification to the virtual service spec will be overwritten. +Flagger keeps in sync the virtual service and destination rules with the canary service spec. +Any direct modification to the virtual service spec will be overwritten. -To expose a workload inside the mesh on `http://backend.test.svc.cluster.local:9898`, the service spec can contain only the container port and the traffic policy: +To expose a workload inside the mesh on `http://backend.test.svc.cluster.local:9898`, +the service spec can contain only the container port and the traffic policy: ```yaml apiVersion: flagger.app/v1alpha3 @@ -530,13 +524,15 @@ spec: app: backend-primary ``` -Flagger works for user facing apps exposed outside the cluster via an ingress gateway and for backend HTTP APIs that are accessible only from inside the mesh. +Flagger works for user facing apps exposed outside the cluster via an ingress gateway +and for backend HTTP APIs that are accessible only from inside the mesh. -## Istio Ingress Gateway +### Istio Ingress Gateway **How can I expose multiple canaries on the same external domain?** -Assuming you have two apps, one that servers the main website and one that serves the REST API. For each app you can define a canary object as: +Assuming you have two apps, one that servers the main website and one that serves the REST API. +For each app you can define a canary object as: ```yaml apiVersion: flagger.app/v1alpha3 @@ -574,11 +570,13 @@ spec: uri: / ``` -Based on the above configuration, Flagger will create two virtual services bounded to the same ingress gateway and external host. 
Istio Pilot will [merge](https://istio.io/help/ops/traffic-management/deploy-guidelines/#multiple-virtual-services-and-destination-rules-for-the-same-host) the two services and the website rule will be moved to the end of the list in the merged configuration. +Based on the above configuration, Flagger will create two virtual services bound to the same ingress gateway and external host. +Istio Pilot will [merge](https://istio.io/help/ops/traffic-management/deploy-guidelines/#multiple-virtual-services-and-destination-rules-for-the-same-host) +the two services and the website rule will be moved to the end of the list in the merged configuration. Note that host merging only works if the canaries are bounded to a ingress gateway other than the `mesh` gateway. -## Istio Mutual TLS +### Istio Mutual TLS **How can I enable mTLS for a canary?** @@ -633,4 +631,3 @@ spec: ports: - number: 80 ``` - diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 9b9f525d..8987da9c 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -1,8 +1,11 @@ # How it works -[Flagger](https://github.com/weaveworks/flagger) takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\) and creates a series of objects \(Kubernetes deployments, ClusterIP services, virtual service, traffic split or ingress\) to drive the canary analysis and promotion. +[Flagger](https://github.com/weaveworks/flagger) takes a Kubernetes deployment and optionally +a horizontal pod autoscaler (HPA) and creates a series of objects +(Kubernetes deployments, ClusterIP services, virtual service, traffic split or ingress) +to drive the canary analysis and promotion. 
-## Canary Custom Resource +### Canary Custom Resource For a deployment named _podinfo_, a canary promotion can be defined using Flagger's custom resource: @@ -11,71 +14,57 @@ apiVersion: flagger.app/v1alpha3 kind: Canary metadata: name: podinfo - namespace: test spec: - # service mesh provider (optional) - # can be: kubernetes, istio, linkerd, appmesh, nginx, gloo, supergloo - provider: linkerd - # deployment reference targetRef: apiVersion: apps/v1 kind: Deployment name: podinfo - # the maximum time in seconds for the canary deployment - # to make progress before it is rollback (default 600s) - progressDeadlineSeconds: 60 - # HPA reference (optional) autoscalerRef: apiVersion: autoscaling/v2beta1 kind: HorizontalPodAutoscaler name: podinfo service: - # service name (optional) name: podinfo - # ClusterIP port number port: 9898 - # ClusterIP port name can be http or grpc (default http) portName: http - # container port number or name (optional) targetPort: 9898 - # add all the other container ports - # to the ClusterIP services (default false) - portDiscovery: false - # promote the canary without analysing it (default false) - skipAnalysis: false - # define the canary analysis timing and KPIs + portDiscovery: true canaryAnalysis: - # schedule interval (default 60s) interval: 1m - # max number of failed metric checks before rollback threshold: 10 - # max traffic percentage routed to canary - # percentage (0-100) maxWeight: 50 - # canary increment step - # percentage (0-100) stepWeight: 5 - # Prometheus checks metrics: - - name: request-success-rate - # minimum req success rate (non 5xx responses) - # percentage (0-100) - threshold: 99 - interval: 1m - - name: request-duration - # maximum req duration P99 - # milliseconds - threshold: 500 - interval: 30s - # testing (optional) + - name: request-success-rate + threshold: 99 + interval: 1m + - name: request-duration + threshold: 500 + interval: 1m webhooks: - name: load-test url: http://flagger-loadtester.test/ - 
timeout: 5s metadata: - cmd: "hey -z 1m -q 10 -c 2 http://podinfo.test:9898/" + cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" ``` +Based on the above configuration, Flagger generates the following Kubernetes objects: +* `deployment/<targetRef.name>-primary` +* `hpa/<autoscalerRef.name>-primary` + +The primary deployment is considered the stable release of your app, by default all traffic is routed to this version +and the target deployment is scaled to zero. +Flagger will detect changes to the target deployment (including secrets and configmaps) and will perform a +canary analysis before promoting the new version as primary. + +The autoscaler reference is optional, when specified, Flagger will pause the traffic increase while the +target and primary deployments are scaled up or down. HPA can help reduce the resource usage during the canary analysis. + +If the target deployment uses secrets and/or configmaps, Flagger will create a copy of each object using the `-primary` +suffix and will reference these objects in the primary deployment. You can disable the secrets/configmaps tracking +with the `-enable-config-tracking=false` command flag in the Flagger deployment manifest under containers args +or by setting `--set configTracking.enabled=false` when installing Flagger with Helm. + **Note** that the target deployment must have a single label selector in the format `app: `: ```yaml @@ -93,13 +82,30 @@ spec: app: podinfo ``` -Besides `app` Flagger supports `name` and `app.kubernetes.io/name` selectors. If you use a different convention you can specify your label with the `-selector-labels=my-app-label` command flag in the Flagger deployment manifest under containers args or by setting `--set selectorLabels=my-app-label` when installing Flagger with Helm. +Besides `app` Flagger supports `name` and `app.kubernetes.io/name` selectors. 
+If you use a different convention you can specify your label with +the `-selector-labels=my-app-label` command flag in the Flagger deployment manifest under containers args +or by setting `--set selectorLabels=my-app-label` when installing Flagger with Helm. -The target deployment should expose a TCP port that will be used by Flagger to create the ClusterIP Services. The container port from the target deployment should match the `service.port` or `service.targetPort`. +The target deployment should expose a TCP port that will be used by Flagger to create the ClusterIP Services. +The container port from the target deployment should match the `service.port` or `service.targetPort`. -## Canary status +Based on the canary spec service, Flagger generates the following Kubernetes ClusterIP services: -Get the current status of canary deployments cluster wide: +* `<service.name>.<namespace>.svc.cluster.local` + selector `app=<name>-primary` +* `<service.name>-primary.<namespace>.svc.cluster.local` + selector `app=<name>-primary` +* `<service.name>-canary.<namespace>.svc.cluster.local` + selector `app=<name>` + +This ensures that traffic to `podinfo.test:9898` will be routed to the latest stable release of your app. +The `podinfo-canary.test:9898` address is available only during the +canary analysis and can be used for conformance testing or load testing. + +### Canary status + +You can use kubectl to get the current status of canary deployments cluster wide: ```bash kubectl get canaries --all-namespaces @@ -110,7 +116,7 @@ prod frontend Succeeded 0 2019-06-30T16:15:07Z prod backend Failed 0 2019-06-30T17:05:07Z ``` -The status condition reflects the last know state of the canary analysis: +The status condition reflects the last known state of the canary analysis: ```bash kubectl -n test get canary/podinfo -oyaml | awk '/status/,0' @@ -134,7 +140,10 @@ status: type: Promoted ``` -The `Promoted` status condition can have one of the following reasons: Initialized, Waiting, Progressing, Promoting, Finalising, Succeeded or Failed. 
A failed canary will have the promoted status set to `false`, the reason to `failed` and the last applied spec will be different to the last promoted one. +The `Promoted` status condition can have one of the following reasons: +Initialized, Waiting, Progressing, Promoting, Finalising, Succeeded or Failed. +A failed canary will have the promoted status set to `false`, +the reason to `failed` and the last applied spec will be different to the last promoted one. Wait for a successful rollout: @@ -162,747 +171,47 @@ kubectl wait canary/podinfo --for=condition=promoted --timeout=5m kubectl get canary/podinfo | grep Succeeded ``` -## Canary Stages +### Canary Analysis -![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) - -A canary deployment is triggered by changes in any of the following objects: - -* Deployment PodSpec \(container image, command, ports, env, resources, etc\) -* ConfigMaps mounted as volumes or mapped to environment variables -* Secrets mounted as volumes or mapped to environment variables - -Gated canary promotion stages: - -* scan for canary deployments -* check primary and canary deployment status - * halt advancement if a rolling update is underway - * halt advancement if pods are unhealthy -* call confirm-rollout webhooks and check results - * halt advancement if any hook returns a non HTTP 2xx result -* call pre-rollout webhooks and check results - * halt advancement if any hook returns a non HTTP 2xx result - * increment the failed checks counter -* increase canary traffic weight percentage from 0% to 5% \(step weight\) -* call rollout webhooks and check results -* check canary HTTP request success rate and latency - * halt advancement if any metric is under the specified threshold - * increment the failed checks counter -* check if the number of failed checks reached the threshold - * route all traffic to primary - * scale to zero the canary deployment and mark it as 
failed - * call post-rollout webhooks - * post the analysis result to Slack - * wait for the canary deployment to be updated and start over -* increase canary traffic weight by 5% \(step weight\) till it reaches 50% \(max weight\) - * halt advancement if any webhook call fails - * halt advancement while canary request success rate is under the threshold - * halt advancement while canary request duration P99 is over the threshold - * halt advancement while any custom metric check fails - * halt advancement if the primary or canary deployment becomes unhealthy - * halt advancement while canary deployment is being scaled up/down by HPA -* call confirm-promotion webhooks and check results - * halt advancement if any hook returns a non HTTP 2xx result -* promote canary to primary - * copy ConfigMaps and Secrets from canary to primary - * copy canary deployment spec template over primary -* wait for primary rolling update to finish - * halt advancement if pods are unhealthy -* route all traffic to primary -* scale to zero the canary deployment -* mark rollout as finished -* call post-rollout webhooks -* post the analysis result to Slack or MS Teams -* wait for the canary deployment to be updated and start over - -## Canary Analysis - -The canary analysis runs periodically until it reaches the maximum traffic weight or the failed checks threshold. 
+The canary analysis defines: +* the type of [deployment strategy](usage/deployment-strategies.md) +* the [metrics](usage/metrics.md) used to validate the canary version +* the [webhooks](usage/webhooks.md) used for conformance testing, load testing and manual gating +* the [alerting settings](usage/alerting.md) Spec: ```yaml canaryAnalysis: # schedule interval (default 60s) - interval: 1m + interval: # max number of failed metric checks before rollback - threshold: 10 + threshold: # max traffic percentage routed to canary # percentage (0-100) - maxWeight: 50 + maxWeight: # canary increment step # percentage (0-100) - stepWeight: 2 - # deploy straight to production without - # the metrics and webhook checks - skipAnalysis: false -``` - -The above analysis, if it succeeds, will run for 25 minutes while validating the HTTP metrics and webhooks every minute. You can determine the minimum time that it takes to validate and promote a canary deployment using this formula: - -```text -interval * (maxWeight / stepWeight) -``` - -And the time it takes for a canary to be rollback when the metrics or webhook checks are failing: - -```text -interval * threshold -``` - -In emergency cases, you may want to skip the analysis phase and ship changes directly to production. At any time you can set the `spec.skipAnalysis: true`. When skip analysis is enabled, Flagger checks if the canary deployment is healthy and promotes it without analysing it. If an analysis is underway, Flagger cancels it and runs the promotion. - -## A/B Testing - -Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity. 
- -You can enable A/B testing by specifying the HTTP match conditions and the number of iterations: - -```yaml - canaryAnalysis: - # schedule interval (default 60s) - interval: 1m + stepWeight: # total number of iterations - iterations: 10 - # max number of failed iterations before rollback - threshold: 2 - # canary match condition + # used for A/B Testing and Blue/Green + iterations: + # canary match conditions + # used for A/B Testing match: - - headers: - user-agent: - regex: "^(?!.*Chrome).*Safari.*" - - headers: - cookie: - regex: "^(.*?;)?(user=test)(;.*)?$" -``` - -If Flagger finds a HTTP match condition, it will ignore the `maxWeight` and `stepWeight` settings. - -The above configuration will run an analysis for ten minutes targeting the Safari users and those that have a test cookie. You can determine the minimum time that it takes to validate and promote a canary deployment using this formula: - -```text -interval * iterations -``` - -And the time it takes for a canary to be rollback when the metrics or webhook checks are failing: - -```text -interval * threshold -``` - -Make sure that the analysis threshold is lower than the number of iterations. - -## Blue/Green deployments - -For applications that are not deployed on a service mesh, Flagger can orchestrate blue/green style deployments with Kubernetes L4 networking. When using Istio you have the option to mirror traffic between blue and green. - -You can use the blue/green deployment strategy by replacing `stepWeight/maxWeight` with `iterations` in the `canaryAnalysis` spec: - -```yaml - canaryAnalysis: - # schedule interval (default 60s) - interval: 1m - # total number of iterations - iterations: 10 - # max number of failed iterations before rollback - threshold: 2 - # Traffic shadowing (compatible with Istio only) - mirror: true -``` - -With the above configuration Flagger will run conformance and load tests on the canary pods for ten minutes. 
If the metrics analysis succeeds, live traffic will be switched from the old version to the new one when the canary is promoted. - -The blue/green deployment strategy is supported for all service mesh providers. - -Blue/Green rollout steps for service mesh: - -* scale up the canary \(green\) -* run conformance tests for the canary pods -* run load tests and metric checks for the canary pods -* route traffic to canary -* promote canary spec over primary \(blue\) -* wait for primary rollout -* route traffic to primary -* scale down canary - -After the analysis finishes, the traffic is routed to the canary \(green\) before triggering the primary \(blue\) rolling update, this ensures a smooth transition to the new version avoiding dropping in-flight requests during the Kubernetes deployment rollout. - -## HTTP Metrics - -The canary analysis is using the following Prometheus queries: - -**HTTP requests success rate percentage** - -Spec: - -```yaml - canaryAnalysis: + - # HTTP header + # key performance indicators metrics: - - name: request-success-rate - # minimum req success rate (non 5xx responses) - # percentage (0-100) - threshold: 99 - interval: 1m -``` - -Istio query: - -```javascript -sum( - rate( - istio_requests_total{ - reporter="destination", - destination_workload_namespace=~"$namespace", - destination_workload=~"$workload", - response_code!~"5.*" - }[$interval] - ) -) -/ -sum( - rate( - istio_requests_total{ - reporter="destination", - destination_workload_namespace=~"$namespace", - destination_workload=~"$workload" - }[$interval] - ) -) -``` - -Envoy query \(App Mesh, Contour or Gloo\): - -```javascript -sum( - rate( - envoy_cluster_upstream_rq{ - kubernetes_namespace="$namespace", - kubernetes_pod_name=~"$workload", - envoy_response_code!~"5.*" - }[$interval] - ) -) -/ -sum( - rate( - envoy_cluster_upstream_rq{ - kubernetes_namespace="$namespace", - kubernetes_pod_name=~"$workload" - }[$interval] - ) -) -``` - -**HTTP requests milliseconds duration P99** 
- -Spec: - -```yaml - canaryAnalysis: - metrics: - - name: request-duration - # maximum req duration P99 - # milliseconds - threshold: 500 - interval: 1m -``` - -Istio query: - -```javascript -histogram_quantile(0.99, - sum( - irate( - istio_request_duration_seconds_bucket{ - reporter="destination", - destination_workload=~"$workload", - destination_workload_namespace=~"$namespace" - }[$interval] - ) - ) by (le) -) -``` - -Envoy query \(App Mesh, Contour or Gloo\): - -```javascript -histogram_quantile(0.99, - sum( - irate( - envoy_cluster_upstream_rq_time_bucket{ - kubernetes_pod_name=~"$workload", - kubernetes_namespace=~"$namespace" - }[$interval] - ) - ) by (le) -) -``` - -> **Note** that the metric interval should be lower or equal to the control loop interval. - -## Custom Metrics - -The canary analysis can be extended with custom Prometheus queries. - -```yaml - canaryAnalysis: - threshold: 1 - maxWeight: 50 - stepWeight: 5 - metrics: - - name: "404s percentage" - threshold: 5 - query: | - 100 - sum( - rate( - istio_requests_total{ - reporter="destination", - destination_workload_namespace="test", - destination_workload="podinfo", - response_code!="404" - }[1m] - ) - ) - / - sum( - rate( - istio_requests_total{ - reporter="destination", - destination_workload_namespace="test", - destination_workload="podinfo" - }[1m] - ) - ) * 100 -``` - -The above configuration validates the canary by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails. 
- -```yaml - canaryAnalysis: - threshold: 1 - maxWeight: 50 - stepWeight: 5 - metrics: - - name: "rpc error rate" - threshold: 5 - query: | - 100 - (sum - rate( - grpc_server_handled_total{ - grpc_service="my.TestService", - grpc_code!="OK" - }[1m] - ) - ) - / - sum( - rate( - grpc_server_started_total{ - grpc_service="my.TestService" - }[1m] - ) - ) * 100 -``` - -The above configuration validates the canary by checking if the percentage of non-OK GRPC req/sec is below 5 percent of the total requests. If the non-OK rate reaches the 5% threshold, then the canary fails. - -When specifying a query, Flagger will run the promql query and convert the result to float64. Then it compares the query result value with the metric threshold value. - -## Webhooks - -The canary analysis can be extended with webhooks. Flagger will call each webhook URL and determine from the response status code \(HTTP 2xx\) if the canary is failing or not. - -There are several types of hooks: - -* **confirm-rollout** hooks are executed before scaling up the canary deployment and can be used for manual approval. - - The rollout is paused until the hook returns a successful HTTP status code. - -* **pre-rollout** hooks are executed before routing traffic to canary. - - The canary advancement is paused if a pre-rollout hook fails and if the number of failures reach the - - threshold the canary will be rollback. - -* **rollout** hooks are executed during the analysis on each iteration before the metric checks. - - If a rollout hook call fails the canary advancement is paused and eventfully rolled back. - -* **confirm-promotion** hooks are executed before the promotion step. - - The canary promotion is paused until the hooks return HTTP 200. - - While the promotion is paused, Flagger will continue to run the metrics checks and rollout hooks. - -* **post-rollout** hooks are executed after the canary has been promoted or rolled back. - - If a post rollout hook fails the error is logged. 
- -* **rollback** hooks are executed while a canary deployment is in either Progressing or Waiting status. - - This provides the ability to rollback during analysis or while waiting for a confirmation. If a rollback hook - - returns a successful HTTP status code, Flagger will stop the analysis and mark the canary release as failed. - -* **event** hooks are executed every time Flagger emits a Kubernetes event. When configured, - - every action that Flagger takes during a canary deployment will be sent as JSON via an HTTP POST request. - -Spec: - -```yaml - canaryAnalysis: + - # metric check + # alerting + alerts: + - # alert provider + # external checks webhooks: - - name: "start gate" - type: confirm-rollout - url: http://flagger-loadtester.test/gate/approve - - name: "smoke test" - type: pre-rollout - url: http://flagger-helmtester.kube-system/ - timeout: 3m - metadata: - type: "helm" - cmd: "test podinfo --cleanup" - - name: "load test" - type: rollout - url: http://flagger-loadtester.test/ - timeout: 15s - metadata: - cmd: "hey -z 1m -q 5 -c 2 http://podinfo-canary.test:9898/" - - name: "promotion gate" - type: confirm-promotion - url: http://flagger-loadtester.test/gate/approve - - name: "notify" - type: post-rollout - url: http://telegram.bot:8080/ - timeout: 5s - metadata: - some: "message" - - name: "rollback gate" - type: rollback - url: http://flagger-loadtester.test/rollback/check - - name: "send to Slack" - type: event - url: http://event-recevier.notifications/slack + - # hook ``` -> **Note** that the sum of all rollout webhooks timeouts should be lower than the analysis interval. 
- -Webhook payload \(HTTP POST\): - -```javascript -{ - "name": "podinfo", - "namespace": "test", - "phase": "Progressing", - "metadata": { - "test": "all", - "token": "16688eb5e9f289f1991c" - } -} -``` - -Response status codes: - -* 200-202 - advance canary by increasing the traffic weight -* timeout or non-2xx - halt advancement and increment failed checks - -On a non-2xx response Flagger will include the response body \(if any\) in the failed checks log and Kubernetes events. - -Event payload \(HTTP POST\): - -```javascript -{ - "name": "string (canary name)", - "namespace": "string (canary namespace)", - "phase": "string (canary phase)", - "metadata": { - "eventMessage": "string (canary event message)", - "eventType": "string (canary event type)", - "timestamp": "string (unix timestamp ms)" - } -} -``` - -The event receiver can create alerts based on the received phase \(possible values: `Initialized`, `Waiting`, `Progressing`, `Promoting`, `Finalising`, `Succeeded` or `Failed`\). - -## Load Testing - -For workloads that are not receiving constant traffic Flagger can be configured with a webhook, that when called, will start a load test for the target workload. If the target workload doesn't receive any traffic during the canary analysis, Flagger metric checks will fail with "no values found for metric request-success-rate". - -Flagger comes with a load testing service based on [rakyll/hey](https://github.com/rakyll/hey) that generates traffic during analysis when configured as a webhook. 
- -![Flagger Load Testing Webhook](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-load-testing.png) - -First you need to deploy the load test runner in a namespace with sidecar injection enabled: - -```bash -export REPO=https://raw.githubusercontent.com/weaveworks/flagger/master - -kubectl -n test apply -f ${REPO}/artifacts/loadtester/deployment.yaml -kubectl -n test apply -f ${REPO}/artifacts/loadtester/service.yaml -``` - -Or by using Helm: - -```bash -helm repo add flagger https://flagger.app - -helm upgrade -i flagger-loadtester flagger/loadtester \ ---namespace=test \ ---set cmd.timeout=1h -``` - -When deployed the load tester API will be available at `http://flagger-loadtester.test/`. - -Now you can add webhooks to the canary analysis spec: - -```yaml -webhooks: - - name: load-test-get - url: http://flagger-loadtester.test/ - timeout: 5s - metadata: - type: cmd - cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" - - name: load-test-post - url: http://flagger-loadtester.test/ - timeout: 5s - metadata: - type: cmd - cmd: "hey -z 1m -q 10 -c 2 -m POST -d '{test: 2}' http://podinfo-canary.test:9898/echo" -``` - -When the canary analysis starts, Flagger will call the webhooks and the load tester will run the `hey` commands in the background, if they are not already running. This will ensure that during the analysis, the `podinfo-canary.test` service will receive a steady stream of GET and POST requests. - -If your workload is exposed outside the mesh you can point `hey` to the public URL and use HTTP2. 
- -```yaml -webhooks: - - name: load-test-get - url: http://flagger-loadtester.test/ - timeout: 5s - metadata: - type: cmd - cmd: "hey -z 1m -q 10 -c 2 -h2 https://podinfo.example.com/" -``` - -For gRPC services you can use [bojand/ghz](https://github.com/bojand/ghz) which is a similar tool to Hey but for gPRC: - -```yaml -webhooks: - - name: grpc-load-test - url: http://flagger-loadtester.test/ - timeout: 5s - metadata: - type: cmd - cmd: "ghz -z 1m -q 10 -c 2 --insecure podinfo.test:9898" -``` - -`ghz` uses reflection to identify which gRPC method to call. If you do not wish to enable reflection for your gRPC service you can implement a standardized health check from the [grpc-proto](https://github.com/grpc/grpc-proto) library. To use this [health check schema](https://github.com/grpc/grpc-proto/blob/master/grpc/health/v1/health.proto) without reflection you can pass a parameter to `ghz` like this - -```yaml -webhooks: - - name: grpc-load-test-no-reflection - url: http://flagger-loadtester.test/ - timeout: 5s - metadata: - type: cmd - cmd: "ghz --insecure --proto=/tmp/ghz/health.proto --call=grpc.health.v1.Health/Check podinfo.test:9898" -``` - -The load tester can run arbitrary commands as long as the binary is present in the container image. For example if you you want to replace `hey` with another CLI, you can create your own Docker image: - -```text -FROM weaveworks/flagger-loadtester: - -RUN curl -Lo /usr/local/bin/my-cli https://github.com/user/repo/releases/download/ver/my-cli \ - && chmod +x /usr/local/bin/my-cli -``` - -## Load Testing Delegation - -The load tester can also forward testing tasks to external tools, by now [nGrinder](https://github.com/naver/ngrinder) is supported. 
- -To use this feature, add a load test task of type 'ngrinder' to the canary analysis spec: - -```yaml -webhooks: - - name: load-test-post - url: http://flagger-loadtester.test/ - timeout: 5s - metadata: - # type of this load test task, cmd or ngrinder - type: ngrinder - # base url of your nGrinder controller server - server: http://ngrinder-server:port - # id of the test to clone from, the test must have been defined. - clone: 100 - # user name and base64 encoded password to authenticate against the nGrinder server - username: admin - passwd: YWRtaW4= - # the interval between between nGrinder test status polling, default to 1s - pollInterval: 5s -``` - -When the canary analysis starts, the load tester will initiate a [clone\_and\_start request](https://github.com/naver/ngrinder/wiki/REST-API-PerfTest) to the nGrinder server and start a new performance test. the load tester will periodically poll the nGrinder server for the status of the test, and prevent duplicate requests from being sent in subsequent analysis loops. - -## Integration Testing - -Flagger comes with a testing service that can run Helm tests or Bats tests when configured as a webhook. - -Deploy the Helm test runner in the `kube-system` namespace using the `tiller` service account: - -```bash -helm repo add flagger https://flagger.app - -helm upgrade -i flagger-helmtester flagger/loadtester \ ---namespace=kube-system \ ---set serviceAccountName=tiller -``` - -When deployed the Helm tester API will be available at `http://flagger-helmtester.kube-system/`. - -Now you can add pre-rollout webhooks to the canary analysis spec: - -```yaml - canaryAnalysis: - webhooks: - - name: "smoke test" - type: pre-rollout - url: http://flagger-helmtester.kube-system/ - timeout: 3m - metadata: - type: "helm" - cmd: "test {{ .Release.Name }} --cleanup" -``` - -When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. 
If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back. - -If you are using Helm v3, you'll have to create a dedicated service account and add the release namespace to the test command: - -```yaml - canaryAnalysis: - webhooks: - - name: "smoke test" - type: pre-rollout - url: http://flagger-helmtester.kube-system/ - timeout: 3m - metadata: - type: "helmv3" - cmd: "test run {{ .Release.Name }} --cleanup -n {{ .Release.Namespace }}" -``` - -As an alternative to Helm you can use the [Bash Automated Testing System](https://github.com/bats-core/bats-core) to run your tests. - -```yaml - canaryAnalysis: - webhooks: - - name: "acceptance tests" - type: pre-rollout - url: http://flagger-batstester.default/ - timeout: 5m - metadata: - type: "bash" - cmd: "bats /tests/acceptance.bats" -``` - -Note that you should create a ConfigMap with your Bats tests and mount it inside the tester container. - -## Manual Gating - -For manual approval of a canary deployment you can use the `confirm-rollout` and `confirm-promotion` webhooks. The confirmation rollout hooks are executed before the pre-rollout hooks. Flagger will halt the canary traffic shifting and analysis until the confirm webhook returns HTTP status 200. - -For manual rollback of a canary deployment you can use the `rollback` webhook. The rollback hook will be called during the analysis and confirmation states. If a rollback webhook returns a successful HTTP status code, Flagger will shift all traffic back to the primary instance and fail the canary. - -Manual gating with Flagger's tester: - -```yaml - canaryAnalysis: - webhooks: - - name: "gate" - type: confirm-rollout - url: http://flagger-loadtester.test/gate/halt -``` - -The `/gate/halt` returns HTTP 403 thus blocking the rollout. - -If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary rollout is waiting for approval. 
- -Change the URL to `/gate/approve` to start the canary analysis: - -```yaml - canaryAnalysis: - webhooks: - - name: "gate" - type: confirm-rollout - url: http://flagger-loadtester.test/gate/approve -``` - -Manual gating can be driven with Flagger's tester API. Set the confirmation URL to `/gate/check`: - -```yaml - canaryAnalysis: - webhooks: - - name: "ask for confirmation" - type: confirm-rollout - url: http://flagger-loadtester.test/gate/check -``` - -By default the gate is closed, you can start or resume the canary rollout with: - -```bash -kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh - -curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/open -``` - -You can pause the rollout at any time with: - -```bash -curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/close -``` - -If a canary analysis is paused the status will change to waiting: - -```bash -kubectl get canary/podinfo - -NAME STATUS WEIGHT -podinfo Waiting 0 -``` - -The `confirm-promotion` hook type can be used to manually approve the canary promotion. While the promotion is paused, Flagger will continue to run the metrics checks and load tests. - -```yaml - canaryAnalysis: - webhooks: - - name: "promotion gate" - type: confirm-promotion - url: http://flagger-loadtester.test/gate/halt -``` - -The `rollback` hook type can be used to manually rollback the canary promotion. 
As with gating, rollbacks can be driven with Flagger's tester API by setting the rollback URL to `/rollback/check` - -```yaml - canaryAnalysis: - webhooks: - - name: "rollback" - type: rollback - url: http://flagger-loadtester.test/rollback/check -``` - -By default rollback is closed, you can rollback a canary rollout with: - -```bash -kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh - -curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/open -``` - -You can close the rollback with: - -```bash curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/close`` - -If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary promotion is waiting for approval. +The canary analysis runs periodically until it reaches the maximum traffic weight or the number of iterations. +On each run, Flagger calls the webhooks, checks the metrics and if the failed checks threshold is reached, stops the +analysis and rolls back the canary. If alerting is configured, Flagger will post the analysis result using the alert providers. diff --git a/docs/gitbook/usage/alerting.md b/docs/gitbook/usage/alerting.md index 212888c3..f408e404 100644 --- a/docs/gitbook/usage/alerting.md +++ b/docs/gitbook/usage/alerting.md @@ -1,6 +1,9 @@ # Alerting -## Slack +Flagger can be configured to send alerts to various chat platforms. You can define a global alert provider at +install time or configure alerts on a per canary basis. + +### Global configuration Flagger can be configured to send Slack notifications: @@ -11,16 +14,16 @@ helm upgrade -i flagger flagger/flagger \ --set slack.user=flagger ``` -Once configured with a Slack incoming **webhook**, Flagger will post messages when a canary deployment has been initialised, when a new revision has been detected and if the canary analysis failed or succeeded. 
+Once configured with a Slack incoming **webhook**, Flagger will post messages when a canary deployment +has been initialised, when a new revision has been detected and if the canary analysis failed or succeeded. ![Slack Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/slack-canary-notifications.png) -A canary deployment will be rolled back if the progress deadline exceeded or if the analysis reached the maximum number of failed checks: +A canary deployment will be rolled back if the progress deadline exceeded or if the analysis reached the +maximum number of failed checks: ![Slack Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/slack-canary-failed.png) -## Microsoft Teams - Flagger can be configured to send notifications to Microsoft Teams: ```bash @@ -28,17 +31,89 @@ helm upgrade -i flagger flagger/flagger \ --set msteams.url=https://outlook.office.com/webhook/YOUR/TEAMS/WEBHOOK ``` -Flagger will post a message card to MS Teams when a new revision has been detected and if the canary analysis failed or succeeded: +Similar to Slack, Flagger alerts on canary analysis events: ![MS Teams Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/flagger-ms-teams-notifications.png) -And you'll get a notification on rollback: - ![MS Teams Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/flagger-ms-teams-failed.png) -## Prometheus Alert Manager +### Canary configuration -Besides Slack, you can use Alertmanager to trigger alerts when a canary deployment failed: +Configuring alerting globally has several limitations as it's not possible to specify different channels +or configure the verbosity on a per canary basis. +To make the alerting more flexible, the canary analysis can be extended +with a list of alerts that reference an alert provider. +For each alert, users can configure the severity level.
+The alerts section overrides the global setting. + +Slack example: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: AlertProvider +metadata: + name: on-call + namespace: flagger +spec: + type: slack + channel: on-call-alerts + username: flagger + # webhook address (ignored if secretRef is specified) + address: https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK + # secret containing the webhook address (optional) + secretRef: + name: on-call-url +--- +apiVersion: v1 +kind: Secret +metadata: + name: on-call-url + namespace: flagger +data: + address: <encoded-url> +``` + +The alert provider **type** can be: `slack`, `msteams`, `rocket` or `discord`. When set to `discord`, +Flagger will use [Slack formatting](https://birdie0.github.io/discord-webhooks-guide/other/slack_formatting.html) +and will append `/slack` to the Discord address. + +When not specified, **channel** defaults to `general` and **username** defaults to `flagger`. + +When **secretRef** is specified, the Kubernetes secret must contain a data field named `address`, +the address in the secret will take precedence over the **address** field in the provider spec. + +The canary analysis can have a list of alerts, each alert referencing an alert provider: + +```yaml + canaryAnalysis: + alerts: + - name: "on-call Slack" + severity: error + providerRef: + name: on-call + namespace: flagger + - name: "qa Discord" + severity: warn + providerRef: + name: qa-discord + - name: "dev MS Teams" + severity: info + providerRef: + name: dev-msteams +``` + +Alert fields: +* **name** (required) +* **severity** levels: `info`, `warn`, `error` (default info) +* **providerRef.name** alert provider name (required) +* **providerRef.namespace** alert provider namespace (defaults to the canary namespace) + +When the severity is set to `warn`, Flagger will alert when waiting on manual confirmation or if the analysis fails. +When the severity is set to `error`, Flagger will alert only if the canary analysis fails.
+ +### Prometheus Alert Manager + +You can use Alertmanager to trigger alerts when a canary deployment failed: ```yaml - alert: canary_rollback diff --git a/docs/gitbook/usage/deployment-strategies.md b/docs/gitbook/usage/deployment-strategies.md index 8b01c047..2775f73a 100644 --- a/docs/gitbook/usage/deployment-strategies.md +++ b/docs/gitbook/usage/deployment-strategies.md @@ -1,31 +1,33 @@ # Deployment Strategies Flagger can run automated application analysis, promotion and rollback for the following deployment strategies: +* **Canary Release** (progressive traffic shifting) + * Istio, Linkerd, App Mesh, NGINX, Contour, Gloo +* **A/B Testing** (HTTP headers and cookies traffic routing) + * Istio, App Mesh, NGINX, Contour +* **Blue/Green** (traffic switching) + * Kubernetes CNI, Istio, Linkerd, App Mesh, NGINX, Contour, Gloo +* **Blue/Green Mirroring** (traffic shadowing) + * Istio -* Canary release \(progressive traffic shifting\) - * Istio, Linkerd, App Mesh, NGINX, Contour, Gloo -* A/B Testing \(HTTP headers and cookies traffic routing\) - * Istio, App Mesh, NGINX, Contour -* Blue/Green \(traffic switch\) - * Kubernetes CNI, Istio, Linkerd, App Mesh, NGINX, Contour, Gloo -* Blue/Green \(traffic mirroring\) - * Istio - -For Canary releases and A/B testing you'll need a Layer 7 traffic management solution like a service mesh or an ingress controller. For Blue/Green deployments no service mesh or ingress controller is required. +For Canary releases and A/B testing you'll need a Layer 7 traffic management solution like a service mesh or an ingress controller. +For Blue/Green deployments no service mesh or ingress controller is required. 
A canary analysis is triggered by changes in any of the following objects: -* Deployment PodSpec \(container image, command, ports, env, resources, etc\) +* Deployment PodSpec (container image, command, ports, env, resources, etc) * ConfigMaps mounted as volumes or mapped to environment variables * Secrets mounted as volumes or mapped to environment variables -## Canary Release +### Canary Release -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance +indicators like HTTP requests success rate, requests average duration and pod health. +Based on analysis of the KPIs a canary is promoted or aborted. ![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) -The canary analysis runs periodically until it reaches the maximum traffic weight or the failed checks threshold. +The canary analysis runs periodically until it reaches the maximum traffic weight or the failed checks threshold. Spec: @@ -46,27 +48,76 @@ Spec: skipAnalysis: false ``` -The above analysis, if it succeeds, will run for 25 minutes while validating the HTTP metrics and webhooks every minute. You can determine the minimum time that it takes to validate and promote a canary deployment using this formula: +The above analysis, if it succeeds, will run for 25 minutes while validating the HTTP metrics and webhooks every minute. 
+You can determine the minimum time that it takes to validate and promote a canary deployment using this formula: -```text +``` interval * (maxWeight / stepWeight) ``` And the time it takes for a canary to be rollback when the metrics or webhook checks are failing: -```text -interval * threshold +``` +interval * threshold ``` -In emergency cases, you may want to skip the analysis phase and ship changes directly to production. At any time you can set the `spec.skipAnalysis: true`. When skip analysis is enabled, Flagger checks if the canary deployment is healthy and promotes it without analysing it. If an analysis is underway, Flagger cancels it and runs the promotion. +In emergency cases, you may want to skip the analysis phase and ship changes directly to production. +At any time you can set the `spec.skipAnalysis: true`. +When skip analysis is enabled, Flagger checks if the canary deployment is healthy and +promotes it without analysing it. If an analysis is underway, Flagger cancels it and runs the promotion. -## A/B Testing +Gated canary promotion stages: -For frontend applications that require session affinity you should use HTTP headers or cookies match conditions to ensure a set of users will stay on the same version for the whole duration of the canary analysis. 
+* scan for canary deployments +* check primary and canary deployment status + * halt advancement if a rolling update is underway + * halt advancement if pods are unhealthy +* call confirm-rollout webhooks and check results + * halt advancement if any hook returns a non HTTP 2xx result +* call pre-rollout webhooks and check results + * halt advancement if any hook returns a non HTTP 2xx result + * increment the failed checks counter +* increase canary traffic weight percentage from 0% to 2% (step weight) +* call rollout webhooks and check results +* check canary HTTP request success rate and latency + * halt advancement if any metric is under the specified threshold + * increment the failed checks counter +* check if the number of failed checks reached the threshold + * route all traffic to primary + * scale to zero the canary deployment and mark it as failed + * call post-rollout webhooks + * post the analysis result to Slack + * wait for the canary deployment to be updated and start over +* increase canary traffic weight by 2% (step weight) till it reaches 50% (max weight) + * halt advancement if any webhook call fails + * halt advancement while canary request success rate is under the threshold + * halt advancement while canary request duration P99 is over the threshold + * halt advancement while any custom metric check fails + * halt advancement if the primary or canary deployment becomes unhealthy + * halt advancement while canary deployment is being scaled up/down by HPA +* call confirm-promotion webhooks and check results + * halt advancement if any hook returns a non HTTP 2xx result +* promote canary to primary + * copy ConfigMaps and Secrets from canary to primary + * copy canary deployment spec template over primary +* wait for primary rolling update to finish + * halt advancement if pods are unhealthy +* route all traffic to primary +* scale to zero the canary deployment +* mark rollout as finished +* call post-rollout webhooks +* send notification with 
the canary analysis result +* wait for the canary deployment to be updated and start over + +### A/B Testing + +For frontend applications that require session affinity you should use HTTP headers or cookies match conditions +to ensure a set of users will stay on the same version for the whole duration of the canary analysis. ![Flagger A/B Testing Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-abtest-steps.png) -You can enable A/B testing by specifying the HTTP match conditions and the number of iterations. If Flagger finds a HTTP match condition, it will ignore the `maxWeight` and `stepWeight` settings. +You can enable A/B testing by specifying the HTTP match conditions and the number of iterations. +If Flagger finds a HTTP match condition, it will ignore the `maxWeight` and `stepWeight` settings. Istio example: @@ -88,16 +139,17 @@ Istio example: regex: "^(.*?;)?(canary=always)(;.*)?$" ``` -The above configuration will run an analysis for ten minutes targeting the Safari users and those that have a test cookie. You can determine the minimum time that it takes to validate and promote a canary deployment using this formula: +The above configuration will run an analysis for ten minutes targeting the Safari users and those that have a test cookie. +You can determine the minimum time that it takes to validate and promote a canary deployment using this formula: -```text +``` interval * iterations ``` And the time it takes for a canary to be rollback when the metrics or webhook checks are failing: -```text -interval * threshold +``` +interval * threshold ``` App Mesh example: @@ -155,9 +207,10 @@ curl -H 'X-Canary: insider' http://app.example.com curl -b 'canary=always' http://app.example.com ``` -## Blue/Green Deployments +### Blue/Green Deployments -For applications that are not deployed on a service mesh, Flagger can orchestrate blue/green style deployments with Kubernetes L4 networking. 
When using Istio you have the option to mirror traffic between blue and green. +For applications that are not deployed on a service mesh, Flagger can orchestrate blue/green style deployments +with Kubernetes L4 networking. When using Istio you have the option to mirror traffic between blue and green. ![Flagger Blue/Green Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-bluegreen-steps.png) @@ -173,30 +226,44 @@ You can use the blue/green deployment strategy by replacing `stepWeight/maxWeigh threshold: 2 ``` -With the above configuration Flagger will run conformance and load tests on the canary pods for ten minutes. If the metrics analysis succeeds, live traffic will be switched from the old version to the new one when the canary is promoted. +With the above configuration Flagger will run conformance and load tests on the canary pods for ten minutes. +If the metrics analysis succeeds, live traffic will be switched from the old version to the new one when the +canary is promoted. The blue/green deployment strategy is supported for all service mesh providers. Blue/Green rollout steps for service mesh: - -* scale up the canary \(green\) +* detect new revision (deployment spec, secrets or configmaps changes) +* scale up the canary (green) * run conformance tests for the canary pods -* run load tests and metric checks for the canary pods +* run load tests and metric checks for the canary pods every minute +* abort the canary release if the failure threshold is reached * route traffic to canary -* promote canary spec over primary \(blue\) +* promote canary spec over primary (blue) * wait for primary rollout * route traffic to primary * scale down canary -After the analysis finishes, the traffic is routed to the canary \(green\) before triggering the primary \(blue\) rolling update, this ensures a smooth transition to the new version avoiding dropping in-flight requests during the Kubernetes deployment rollout. 
+After the analysis finishes, the traffic is routed to the canary (green) before triggering the primary (blue) +rolling update, this ensures a smooth transition to the new version avoiding dropping in-flight requests during +the Kubernetes deployment rollout. -## Blue/Green with Traffic Mirroring +### Blue/Green with Traffic Mirroring -Traffic Mirroring is a pre-stage in a Canary \(progressive traffic shifting\) or Blue/Green deployment strategy. Traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service. The response from the primary is sent back to the user. The response from the canary is discarded. Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are healthy. +Traffic Mirroring is a pre-stage in a Canary (progressive traffic shifting) or +Blue/Green deployment strategy. Traffic mirroring will copy each incoming +request, sending one request to the primary and one to the canary service. +The response from the primary is sent back to the user. The response from the canary +is discarded. Metrics are collected on both requests so that the deployment will +only proceed if the canary metrics are healthy. -Mirroring must only be used for requests that are **idempotent** or capable of being processed twice \(once by the primary and once by the canary\). Reads are idempotent. Before using mirroring on requests that may be writes, you should consider what will happen if a write is duplicated and handled by the primary and canary. +Mirroring should be used for requests that are **idempotent** or capable of +being processed twice (once by the primary and once by the canary). Reads are +idempotent. Before using mirroring on requests that may be writes, you should +consider what will happen if a write is duplicated and handled by the primary +and canary. -To use mirroring, set `spec.canaryAnalysis.mirror` to `true`. 
+To use mirroring, set `spec.canaryAnalysis.mirror` to `true`. Istio example: @@ -212,3 +279,27 @@ Istio example: mirror: true ``` +Mirroring rollout steps for service mesh: +* detect new revision (deployment spec, secrets or configmaps changes) +* scale from zero the canary deployment +* wait for the HPA to set the canary minimum replicas +* check canary pods health +* run the acceptance tests +* abort the canary release if tests fail +* start the load tests +* mirror traffic from primary to canary +* check request success rate and request duration every minute +* abort the canary release if the failure threshold is reached +* stop traffic mirroring after the number of iterations is reached +* route live traffic to the canary pods +* promote the canary (update the primary secrets, configmaps and deployment spec) +* wait for the primary deployment rollout to finish +* wait for the HPA to set the primary minimum replicas +* check primary pods health +* switch live traffic back to primary +* scale to zero the canary +* send notification with the canary analysis result + +After the analysis finishes, the traffic is routed to the canary (green) before triggering the primary (blue) +rolling update, this ensures a smooth transition to the new version avoiding dropping in-flight requests during +the Kubernetes deployment rollout. \ No newline at end of file diff --git a/docs/gitbook/usage/metrics.md b/docs/gitbook/usage/metrics.md new file mode 100644 index 00000000..60c4ffe4 --- /dev/null +++ b/docs/gitbook/usage/metrics.md @@ -0,0 +1,137 @@ +# Metrics Analysis + +As part of the analysis process, Flagger can validate service level objectives (SLOs) like +availability, error rate percentage, average response time and any other objective based on app specific metrics. +If a drop in performance is noticed during the SLOs analysis, +the release will be automatically rolled back with minimum impact to end-users. 
+ +### Builtin Metrics + +Flagger comes with two builtin metric checks: HTTP request success rate and duration. + +```yaml + canaryAnalysis: + metrics: + - name: request-success-rate + interval: 1m + # minimum req success rate (non 5xx responses) + # percentage (0-100) + thresholdRange: + min: 99 + - name: request-duration + interval: 1m + # maximum req duration P99 + # milliseconds + thresholdRange: + max: 500 +``` + +For each metric you can specify a range of accepted values with `thresholdRange` +and the window size or the time series with `interval`. +The builtin checks are available for every service mesh / ingress controller +and are implemented with [Prometheus queries](../faq.md#metrics). + +### Custom Metrics + +The canary analysis can be extended with custom metric checks. Using a `MetricTemplate` custom resource, you +configure Flagger to connect to a metric provider and run a query that returns a `float64` value. +The query result is used to validate the canary based on the specified threshold range. 
+ +Prometheus template example: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: not-found-percentage + namespace: istio-system +spec: + provider: + type: prometheus + address: http://prometheus.istio-system:9090 + query: | + 100 - sum( + rate( + istio_requests_total{ + reporter="destination", + destination_workload_namespace="{{ namespace }}", + destination_workload="{{ target }}", + response_code!="404" + }[{{ interval }}] + ) + ) + / + sum( + rate( + istio_requests_total{ + reporter="destination", + destination_workload_namespace="{{ namespace }}", + destination_workload="{{ target }}" + }[{{ interval }}] + ) + ) * 100 +``` + +The following variables are available in templates: + +- `name` (canary.metadata.name) +- `namespace` (canary.metadata.namespace) +- `target` (canary.spec.targetRef.name) +- `service` (canary.spec.service.name) +- `ingress` (canary.spec.ingressRef.name) +- `interval` (canary.spec.canaryAnalysis.metrics[].interval) + +A canary analysis metric can reference a template with `templateRef`: + +```yaml + canaryAnalysis: + metrics: + - name: "404s percentage" + templateRef: + name: not-found-percentage + # namespace is optional + # when not specified, the canary namespace will be used + namespace: istio-system + thresholdRange: + max: 5 + interval: 1m +``` + +The above configuration validates the canary by checking +if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. +If the 404s rate reaches the 5% threshold, then the canary fails.
+ +Prometheus gRPC error rate example: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: grpc-error-rate-percentage + namespace: flagger +spec: + provider: + type: prometheus + address: http://flagger-prometheus.flagger-system:9090 + query: | + 100 - sum( + rate( + grpc_server_handled_total{ + grpc_code!="OK", + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" + }[{{ interval }}] + ) + ) + / + sum( + rate( + grpc_server_started_total{ + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" + }[{{ interval }}] + ) + ) * 100 +``` + +The above template is for gRPC services instrumented with [go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus). \ No newline at end of file diff --git a/docs/gitbook/usage/webhooks.md b/docs/gitbook/usage/webhooks.md new file mode 100644 index 00000000..7d574dce --- /dev/null +++ b/docs/gitbook/usage/webhooks.md @@ -0,0 +1,401 @@ +# Webhooks + +The canary analysis can be extended with webhooks. Flagger will call each webhook URL and +determine from the response status code (HTTP 2xx) if the canary is failing or not. + +There are several types of hooks: +* **confirm-rollout** hooks are executed before scaling up the canary deployment and can be used for manual approval. +The rollout is paused until the hook returns a successful HTTP status code. +* **pre-rollout** hooks are executed before routing traffic to canary. +The canary advancement is paused if a pre-rollout hook fails and if the number of failures reaches the +threshold the canary will be rolled back. +* **rollout** hooks are executed during the analysis on each iteration before the metric checks. +If a rollout hook call fails the canary advancement is paused and eventually rolled back. +* **confirm-promotion** hooks are executed before the promotion step. +The canary promotion is paused until the hooks return HTTP 200.
+While the promotion is paused, Flagger will continue to run the metrics checks and rollout hooks. +* **post-rollout** hooks are executed after the canary has been promoted or rolled back. +If a post rollout hook fails the error is logged. +* **rollback** hooks are executed while a canary deployment is in either Progressing or Waiting status. +This provides the ability to rollback during analysis or while waiting for a confirmation. If a rollback hook +returns a successful HTTP status code, Flagger will stop the analysis and mark the canary release as failed. +* **event** hooks are executed every time Flagger emits a Kubernetes event. When configured, +every action that Flagger takes during a canary deployment will be sent as JSON via an HTTP POST request. + +Spec: + +```yaml + canaryAnalysis: + webhooks: + - name: "start gate" + type: confirm-rollout + url: http://flagger-loadtester.test/gate/approve + - name: "smoke test" + type: pre-rollout + url: http://flagger-helmtester.kube-system/ + timeout: 3m + metadata: + type: "helm" + cmd: "test podinfo --cleanup" + - name: "load test" + type: rollout + url: http://flagger-loadtester.test/ + timeout: 15s + metadata: + cmd: "hey -z 1m -q 5 -c 2 http://podinfo-canary.test:9898/" + - name: "promotion gate" + type: confirm-promotion + url: http://flagger-loadtester.test/gate/approve + - name: "notify" + type: post-rollout + url: http://telegram.bot:8080/ + timeout: 5s + metadata: + some: "message" + - name: "rollback gate" + type: rollback + url: http://flagger-loadtester.test/rollback/check + - name: "send to Slack" + type: event + url: http://event-recevier.notifications/slack +``` + +> **Note** that the sum of all rollout webhooks timeouts should be lower than the analysis interval. 
+ +Webhook payload (HTTP POST): + +```json +{ + "name": "podinfo", + "namespace": "test", + "phase": "Progressing", + "metadata": { + "test": "all", + "token": "16688eb5e9f289f1991c" + } +} +``` + +Response status codes: + +* 200-202 - advance canary by increasing the traffic weight +* timeout or non-2xx - halt advancement and increment failed checks + +On a non-2xx response Flagger will include the response body (if any) in the failed checks log and Kubernetes events. + +Event payload (HTTP POST): + +```json +{ + "name": "string (canary name)", + "namespace": "string (canary namespace)", + "phase": "string (canary phase)", + "metadata": { + "eventMessage": "string (canary event message)", + "eventType": "string (canary event type)", + "timestamp": "string (unix timestamp ms)" + } +} +``` + +The event receiver can create alerts based on the received phase +(possible values: `Initialized`, `Waiting`, `Progressing`, `Promoting`, `Finalising`, `Succeeded` or `Failed`). + +### Load Testing + +For workloads that are not receiving constant traffic Flagger can be configured with a webhook, +that when called, will start a load test for the target workload. +If the target workload doesn't receive any traffic during the canary analysis, +Flagger metric checks will fail with "no values found for metric request-success-rate". + +Flagger comes with a load testing service based on [rakyll/hey](https://github.com/rakyll/hey) +that generates traffic during analysis when configured as a webhook.
+ +![Flagger Load Testing Webhook](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-load-testing.png) + +First you need to deploy the load test runner in a namespace with sidecar injection enabled: + +```bash +export REPO=https://raw.githubusercontent.com/weaveworks/flagger/master + +kubectl -n test apply -f ${REPO}/artifacts/loadtester/deployment.yaml +kubectl -n test apply -f ${REPO}/artifacts/loadtester/service.yaml +``` + +Or by using Helm: + +```bash +helm repo add flagger https://flagger.app + +helm upgrade -i flagger-loadtester flagger/loadtester \ +--namespace=test \ +--set cmd.timeout=1h +``` + +When deployed the load tester API will be available at `http://flagger-loadtester.test/`. + +Now you can add webhooks to the canary analysis spec: + +```yaml +webhooks: + - name: load-test-get + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + type: cmd + cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" + - name: load-test-post + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + type: cmd + cmd: "hey -z 1m -q 10 -c 2 -m POST -d '{test: 2}' http://podinfo-canary.test:9898/echo" +``` + +When the canary analysis starts, Flagger will call the webhooks and the load tester will run the `hey` commands +in the background, if they are not already running. This will ensure that during the +analysis, the `podinfo-canary.test` service will receive a steady stream of GET and POST requests. + +If your workload is exposed outside the mesh you can point `hey` to the +public URL and use HTTP2. 
+ +```yaml +webhooks: + - name: load-test-get + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + type: cmd + cmd: "hey -z 1m -q 10 -c 2 -h2 https://podinfo.example.com/" +``` + +For gRPC services you can use [bojand/ghz](https://github.com/bojand/ghz) which is a similar tool to Hey but for gRPC: + +```yaml +webhooks: + - name: grpc-load-test + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + type: cmd + cmd: "ghz -z 1m -q 10 -c 2 --insecure podinfo.test:9898" +``` + +`ghz` uses reflection to identify which gRPC method to call. If you do not wish to enable reflection for your gRPC service you can implement a standardized health check from the [grpc-proto](https://github.com/grpc/grpc-proto) library. To use this [health check schema](https://github.com/grpc/grpc-proto/blob/master/grpc/health/v1/health.proto) without reflection you can pass a parameter to `ghz` like this: + +```yaml +webhooks: + - name: grpc-load-test-no-reflection + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + type: cmd + cmd: "ghz --insecure --proto=/tmp/ghz/health.proto --call=grpc.health.v1.Health/Check podinfo.test:9898" +``` + +The load tester can run arbitrary commands as long as the binary is present in the container image. +For example if you want to replace `hey` with another CLI, you can create your own Docker image: + +```dockerfile +FROM weaveworks/flagger-loadtester: + +RUN curl -Lo /usr/local/bin/my-cli https://github.com/user/repo/releases/download/ver/my-cli \ + && chmod +x /usr/local/bin/my-cli +``` + +### Load Testing Delegation + +The load tester can also forward testing tasks to external tools. Currently only [nGrinder](https://github.com/naver/ngrinder) +is supported.
+ +To use this feature, add a load test task of type 'ngrinder' to the canary analysis spec: + +```yaml +webhooks: + - name: load-test-post + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + # type of this load test task, cmd or ngrinder + type: ngrinder + # base url of your nGrinder controller server + server: http://ngrinder-server:port + # id of the test to clone from, the test must have been defined. + clone: 100 + # user name and base64 encoded password to authenticate against the nGrinder server + username: admin + passwd: YWRtaW4= + # the interval between nGrinder test status polling, defaults to 1s + pollInterval: 5s +``` +When the canary analysis starts, the load tester will initiate a [clone_and_start request](https://github.com/naver/ngrinder/wiki/REST-API-PerfTest) +to the nGrinder server and start a new performance test. The load tester will periodically poll the nGrinder server +for the status of the test, and prevent duplicate requests from being sent in subsequent analysis loops. + +### Integration Testing + +Flagger comes with a testing service that can run Helm tests or Bats tests when configured as a webhook. + +Deploy the Helm test runner in the `kube-system` namespace using the `tiller` service account: + +```bash +helm repo add flagger https://flagger.app + +helm upgrade -i flagger-helmtester flagger/loadtester \ +--namespace=kube-system \ +--set serviceAccountName=tiller +``` + +When deployed the Helm tester API will be available at `http://flagger-helmtester.kube-system/`. + +Now you can add pre-rollout webhooks to the canary analysis spec: + +```yaml + canaryAnalysis: + webhooks: + - name: "smoke test" + type: pre-rollout + url: http://flagger-helmtester.kube-system/ + timeout: 3m + metadata: + type: "helm" + cmd: "test {{ .Release.Name }} --cleanup" +``` + +When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
+If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back. + +If you are using Helm v3, you'll have to create a dedicated service account and add the release namespace to the test command: + +```yaml + canaryAnalysis: + webhooks: + - name: "smoke test" + type: pre-rollout + url: http://flagger-helmtester.kube-system/ + timeout: 3m + metadata: + type: "helmv3" + cmd: "test run {{ .Release.Name }} --timeout 3m -n {{ .Release.Namespace }}" +``` + +As an alternative to Helm you can use the [Bash Automated Testing System](https://github.com/bats-core/bats-core) to run your tests. + +```yaml + canaryAnalysis: + webhooks: + - name: "acceptance tests" + type: pre-rollout + url: http://flagger-batstester.default/ + timeout: 5m + metadata: + type: "bash" + cmd: "bats /tests/acceptance.bats" +``` + +Note that you should create a ConfigMap with your Bats tests and mount it inside the tester container. + +### Manual Gating + +For manual approval of a canary deployment you can use the `confirm-rollout` and `confirm-promotion` webhooks. +The confirmation rollout hooks are executed before the pre-rollout hooks. +Flagger will halt the canary traffic shifting and analysis until the confirm webhook returns HTTP status 200. + +For manual rollback of a canary deployment you can use the `rollback` webhook. The rollback hook will be called +during the analysis and confirmation states. If a rollback webhook returns a successful HTTP status code, Flagger +will shift all traffic back to the primary instance and fail the canary. + +Manual gating with Flagger's tester: + +```yaml + canaryAnalysis: + webhooks: + - name: "gate" + type: confirm-rollout + url: http://flagger-loadtester.test/gate/halt +``` + +The `/gate/halt` returns HTTP 403 thus blocking the rollout. + +If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary rollout is waiting for approval. 
+ +Change the URL to `/gate/approve` to start the canary analysis: + +```yaml + canaryAnalysis: + webhooks: + - name: "gate" + type: confirm-rollout + url: http://flagger-loadtester.test/gate/approve +``` + +Manual gating can be driven with Flagger's tester API. Set the confirmation URL to `/gate/check`: + +```yaml + canaryAnalysis: + webhooks: + - name: "ask for confirmation" + type: confirm-rollout + url: http://flagger-loadtester.test/gate/check +``` + +By default the gate is closed, you can start or resume the canary rollout with: + +```bash +kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh + +curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/open +``` + +You can pause the rollout at any time with: + +```bash +curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/close +``` + +If a canary analysis is paused the status will change to waiting: + +```bash +kubectl get canary/podinfo + +NAME STATUS WEIGHT +podinfo Waiting 0 +``` + +The `confirm-promotion` hook type can be used to manually approve the canary promotion. +While the promotion is paused, Flagger will continue to run the metrics checks and load tests. + +```yaml + canaryAnalysis: + webhooks: + - name: "promotion gate" + type: confirm-promotion + url: http://flagger-loadtester.test/gate/halt +``` + +The `rollback` hook type can be used to manually rollback the canary promotion. 
As with gating, rollbacks can be driven +with Flagger's tester API by setting the rollback URL to `/rollback/check` + +```yaml + canaryAnalysis: + webhooks: + - name: "rollback" + type: rollback + url: http://flagger-loadtester.test/rollback/check +``` + +By default rollback is closed, you can rollback a canary rollout with: + +```bash +kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh + +curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/open +``` + +You can close the rollback with: + +```bash +curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/close +``` + +If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary has been rolled back. From 2837d4407e8720ec3fb289033fd60cae0edb127d Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Wed, 26 Feb 2020 20:40:49 +0200 Subject: [PATCH 02/13] Split the CRD docs into canary target, service, status, analysis --- docs/gitbook/faq.md | 2 +- docs/gitbook/how-it-works.md | 66 ++++++++++++++----- .../install/flagger-install-on-kubernetes.md | 25 +++++-- 3 files changed, 70 insertions(+), 23 deletions(-) diff --git a/docs/gitbook/faq.md b/docs/gitbook/faq.md index 0224d348..3d13f0a0 100644 --- a/docs/gitbook/faq.md +++ b/docs/gitbook/faq.md @@ -122,7 +122,7 @@ canary analysis and can be used for conformance testing or load testing. If port discovery is enabled, Flagger scans the deployment spec and extracts the containers ports excluding the port specified in the canary service and Envoy sidecar ports. -`These ports will be used when generating the ClusterIP services. +These ports will be used when generating the ClusterIP services. For a deployment that exposes two ports: diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 8987da9c..4cdda8e0 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -7,7 +7,7 @@ to drive the canary analysis and promotion. 
### Canary Custom Resource -For a deployment named _podinfo_, a canary promotion can be defined using Flagger's custom resource: +For a deployment named _podinfo_, a canary can be defined using Flagger's custom resource: ```yaml apiVersion: flagger.app/v1alpha3 @@ -19,16 +19,8 @@ spec: apiVersion: apps/v1 kind: Deployment name: podinfo - autoscalerRef: - apiVersion: autoscaling/v2beta1 - kind: HorizontalPodAutoscaler - name: podinfo service: - name: podinfo port: 9898 - portName: http - targetPort: 9898 - portDiscovery: true canaryAnalysis: interval: 1m threshold: 10 @@ -48,7 +40,27 @@ spec: cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" ``` +### Canary target + +A canary resource can target a Kubernetes Deployment or DaemonSet. + +Kubernetes Deployment example: + +```yaml +spec: + progressDeadlineSeconds: 60 + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + autoscalerRef: + apiVersion: autoscaling/v2beta1 + kind: HorizontalPodAutoscaler + name: podinfo +``` + Based on the above configuration, Flagger generates the following Kubernetes objects: + * `deployment/-primary` * `hpa/-primary` @@ -57,9 +69,6 @@ and the target deployment is scaled to zero. Flagger will detect changes to the target deployment (including secrets and configmaps) and will perform a canary analysis before promoting the new version as primary. -The autoscaler reference is optional, when specified, Flagger will pause the traffic increase while the -target and primary deployments are scaled up or down. HPA can help reduce the resource usage during the canary analysis. - If the target deployment uses secrets and/or configmaps, Flagger will create a copy of each object using the `-primary` prefix and will reference these objects in the primary deployment. 
You can disable the secrets/configmaps tracking with the `-enable-config-tracking=false` command flag in the Flagger deployment manifest under containers args @@ -87,10 +96,37 @@ If you use a different convention you can specify your label with the `-selector-labels=my-app-label` command flag in the Flagger deployment manifest under containers args or by setting `--set selectorLabels=my-app-label` when installing Flagger with Helm. -The target deployment should expose a TCP port that will be used by Flagger to create the ClusterIP Services. -The container port from the target deployment should match the `service.port` or `service.targetPort`. +The autoscaler reference is optional, when specified, Flagger will pause the traffic increase while the +target and primary deployments are scaled up or down. HPA can help reduce the resource usage during the canary analysis. -Based on the canary spec service, Flagger generates the following Kubernetes ClusterIP service: +The progress deadline represents the maximum time in seconds for the canary deployment to make progress +before it is rolled back, defaults to ten minutes. + +### Canary service + +A canary resource dictates how the target workload is exposed inside the cluster. +The canary target should expose a TCP port that will be used by Flagger to create the ClusterIP Services. + +```yaml +spec: + service: + name: podinfo + port: 9898 + portName: http + targetPort: 9898 + portDiscovery: true +``` + +The container port from the target workload should match the `service.port` or `service.targetPort`. +The `service.name` is optional, defaults to `spec.targetRef.name`. +The `service.targetPort` can be a container port number or name. +The `service.portName` is optional (defaults to `http`), if your workload uses gRPC then set the port name to `grpc`.
+ +If port discovery is enabled, Flagger scans the target workload and extracts the containers +ports excluding the port specified in the canary service and service mesh sidecar ports. +These ports will be used when generating the ClusterIP services. + +Based on the canary spec service, Flagger creates the following Kubernetes ClusterIP service: * `..svc.cluster.local` selector `app=-primary` diff --git a/docs/gitbook/install/flagger-install-on-kubernetes.md b/docs/gitbook/install/flagger-install-on-kubernetes.md index 5478d9ca..d9dfbb1f 100644 --- a/docs/gitbook/install/flagger-install-on-kubernetes.md +++ b/docs/gitbook/install/flagger-install-on-kubernetes.md @@ -30,7 +30,11 @@ helm upgrade -i flagger flagger/flagger \ --set metricsServer=http://prometheus:9090 ``` -For Istio multi-cluster shared control plane you can install Flagger on each remote cluster and set the Istio control plane host cluster kubeconfig: +Note that Flagger depends on Istio telemetry and Prometheus, if you're installing Istio with istioctl +then you should be using the [default profile](https://istio.io/docs/setup/additional-setup/config-profiles/). + +For Istio multi-cluster shared control plane you can install Flagger +on each remote cluster and set the Istio control plane host cluster kubeconfig: ```bash helm upgrade -i flagger flagger/flagger \ @@ -42,7 +46,9 @@ helm upgrade -i flagger flagger/flagger \ --set istio.kubeconfig.key=kubeconfig ``` -Note that the Istio kubeconfig must be stored in a Kubernetes secret with a data key named `kubeconfig`. For more details on how to configure Istio multi-cluster credentials read the [Istio docs](https://istio.io/docs/setup/install/multicluster/shared-vpn/#credentials). +Note that the Istio kubeconfig must be stored in a Kubernetes secret with a data key named `kubeconfig`. +For more details on how to configure Istio multi-cluster credentials +read the [Istio docs](https://istio.io/docs/setup/install/multicluster/shared-vpn/#credentials). 
Deploy Flagger for Linkerd: @@ -114,7 +120,8 @@ helm delete flagger The command removes all the Kubernetes components associated with the chart and deletes the release. -> **Note** that on uninstall the Canary CRD will not be removed. Deleting the CRD will make Kubernetes remove all the objects owned by Flagger like Istio virtual services, Kubernetes deployments and ClusterIP services. +> **Note** that on uninstall the Canary CRD will not be removed. Deleting the CRD will make Kubernetes +>remove all the objects owned by Flagger like Istio virtual services, Kubernetes deployments and ClusterIP services. If you want to remove all the objects created by Flagger you have delete the Canary CRD with kubectl: @@ -169,7 +176,8 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/istio This deploys Flagger in the `istio-system` namespace and sets the metrics server URL to Istio's Prometheus instance. -Note that you'll need kubectl 1.14 to run the above the command or you can download the [kustomize binary](https://github.com/kubernetes-sigs/kustomize/releases) and run: +Note that you'll need kubectl 1.14 to run the above command or you can download +the [kustomize binary](https://github.com/kubernetes-sigs/kustomize/releases) and run: ```bash kustomize build github.com/weaveworks/flagger//kustomize/istio | kubectl apply -f - @@ -205,9 +213,11 @@ Install Flagger and Prometheus: kubectl apply -k github.com/weaveworks/flagger//kustomize/kubernetes ``` -This deploys Flagger and Prometheus in the `flagger-system` namespace, sets the metrics server URL to `http://flagger-prometheus.flagger-system:9090` and the mesh provider to `kubernetes`. +This deploys Flagger and Prometheus in the `flagger-system` namespace, sets the metrics server URL +to `http://flagger-prometheus.flagger-system:9090` and the mesh provider to `kubernetes`.
-The Prometheus instance has a two hours data retention and is configured to scrape all pods in your cluster that have the `prometheus.io/scrape: "true"` annotation. +The Prometheus instance has a two hours data retention and is configured to scrape all pods in your cluster +that have the `prometheus.io/scrape: "true"` annotation. To target a different provider you can specify it in the canary custom resource: @@ -265,5 +275,6 @@ Install Flagger with Slack: kubectl apply -k . ``` -If you want to use MS Teams instead of Slack, replace `-slack-url` with `-msteams-url` and set the webhook address to `https://outlook.office.com/webhook/YOUR/TEAMS/WEBHOOK`. +If you want to use MS Teams instead of Slack, replace `-slack-url` with `-msteams-url` and set the webhook address +to `https://outlook.office.com/webhook/YOUR/TEAMS/WEBHOOK`. From e4da4a34a61754b14f824dabfaf1d6eb2bde9fa2 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Thu, 27 Feb 2020 10:37:48 +0200 Subject: [PATCH 03/13] Add dev guides section to docs --- docs/gitbook/README.md | 13 +++++++--- docs/gitbook/SUMMARY.md | 5 +++- docs/gitbook/{ => dev}/dev-guide.md | 17 +------------ docs/gitbook/dev/release-guide.md | 34 ++++++++++++++++++++++++++ docs/gitbook/how-it-works.md | 37 +++++++++++++++++++++++++---- 5 files changed, 82 insertions(+), 24 deletions(-) rename docs/gitbook/{ => dev}/dev-guide.md (85%) create mode 100644 docs/gitbook/dev/release-guide.md diff --git a/docs/gitbook/README.md b/docs/gitbook/README.md index 1aaf6b91..5d8e60df 100644 --- a/docs/gitbook/README.md +++ b/docs/gitbook/README.md @@ -4,13 +4,20 @@ description: Flagger is a progressive delivery Kubernetes operator # Introduction -[Flagger](https://github.com/weaveworks/flagger) is a **Kubernetes** operator that automates the promotion of canary deployments using **Istio**, **Linkerd**, **App Mesh**, **NGINX**, **Contour** or **Gloo** routing for traffic shifting and **Prometheus** metrics for canary analysis. 
The canary analysis can be extended with webhooks for running system integration/acceptance tests, load tests, or any other custom validation. +[Flagger](https://github.com/weaveworks/flagger) is a **Kubernetes** operator that automates the promotion of +canary deployments using **Istio**, **Linkerd**, **App Mesh**, **NGINX**, **Contour** or **Gloo** routing for +traffic shifting and **Prometheus** metrics for canary analysis. The canary analysis can be extended with webhooks for +running system integration/acceptance tests, load tests, or any other custom validation. -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pods health. Based on analysis of the **KPIs** a canary is promoted or aborted, and the analysis result is published to **Slack** or **MS Teams**. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators +like HTTP requests success rate, requests average duration and pods health. +Based on analysis of the **KPIs** a canary is promoted or aborted, and the analysis result is published to **Slack** or **MS Teams**. ![Flagger overview diagram](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-overview.png) -Flagger can be configured with Kubernetes custom resources and is compatible with any CI/CD solutions made for Kubernetes. Since Flagger is declarative and reacts to Kubernetes events, it can be used in **GitOps** pipelines together with Flux CD or JenkinsX. +Flagger can be configured with Kubernetes custom resources and is compatible with any CI/CD solutions made for Kubernetes. +Since Flagger is declarative and reacts to Kubernetes events, +it can be used in **GitOps** pipelines together with Flux CD or JenkinsX. 
This project is sponsored by [Weaveworks](https://www.weave.works/) diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 6c50a1b8..d51ea045 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -3,7 +3,6 @@ * [Introduction](README.md) * [How it works](how-it-works.md) * [FAQ](faq.md) -* [Development guide](dev-guide.md) ## Install @@ -35,3 +34,7 @@ * [Canaries with Helm charts and GitOps](tutorials/canary-helm-gitops.md) * [Zero downtime deployments](tutorials/zero-downtime-deployments.md) +## Dev + +* [Development Guide](dev/dev-guide.md) +* [Release Guide](dev/release-guide.md) \ No newline at end of file diff --git a/docs/gitbook/dev-guide.md b/docs/gitbook/dev/dev-guide.md similarity index 85% rename from docs/gitbook/dev-guide.md rename to docs/gitbook/dev/dev-guide.md index c082df0a..43a004f7 100644 --- a/docs/gitbook/dev-guide.md +++ b/docs/gitbook/dev/dev-guide.md @@ -1,4 +1,4 @@ -# Flagger Development Guide +# Development Guide This document describes how to build, test and run Flagger from source. @@ -177,18 +177,3 @@ chose one that matches your changes from this [list](https://github.com/weavewor When you open a pull request on Flagger repo, the unit and integration tests will be run in CI. -### Release - -To release a new Flagger version (e.g. 
`2.0.0`) follow these steps: -* create a branch `git checkout -b prep-2.0.0` -* set the version in code and manifests `TAG=2.0.0 make version-set` -* commit changes and merge PR -* checkout master `git checkout master && git pull` -* tag master `make release` - -After the tag has been pushed to GitHub, the CI release pipeline does the following: -* creates a GitHub release -* pushes the Flagger binary and change log to GitHub release -* pushes the Flagger container image to Docker Hub -* pushes the Helm chart to github-pages branch -* GitHub pages publishes the new chart version on the Helm repository diff --git a/docs/gitbook/dev/release-guide.md b/docs/gitbook/dev/release-guide.md new file mode 100644 index 00000000..2140f60a --- /dev/null +++ b/docs/gitbook/dev/release-guide.md @@ -0,0 +1,34 @@ +# Flagger Release Guide + +This document describes how to release Flagger. + +### Release + +To release a new Flagger version (e.g. `2.0.0`) follow these steps: +* create a branch `git checkout -b prep-2.0.0` +* set the version in code and manifests `TAG=2.0.0 make version-set` +* commit changes and merge PR +* checkout master `git checkout master && git pull` +* tag master `make release` + +### CI + +After the tag has been pushed to GitHub, the CI release pipeline does the following: +* creates a GitHub release +* pushes the Flagger binary and change log to GitHub release +* pushes the Flagger container image to Docker Hub +* pushes the Helm chart to github-pages branch +* GitHub pages publishes the new chart version on the Helm repository + +### Docs + +The documentation [website](https://docs.flagger.app) is built from the `docs` branch. 
+ +After a Flagger release, publish the docs with: +* `git checkout master && git pull` +* `git checkout docs` +* `git rebase master` +* `git push origin docs` + + + diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 4cdda8e0..9defb689 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -5,9 +5,12 @@ a horizontal pod autoscaler (HPA) and creates a series of objects (Kubernetes deployments, ClusterIP services, virtual service, traffic split or ingress) to drive the canary analysis and promotion. -### Canary Custom Resource +### Canary custom resource -For a deployment named _podinfo_, a canary can be defined using Flagger's custom resource: +The canary custom resource defines the release process of an application running on Kubernetes +and is portable across clusters, service meshes and ingress providers. + +For a deployment named _podinfo_, a canary release with progressive traffic shifting can be defined as: ```yaml apiVersion: flagger.app/v1alpha3 @@ -40,6 +43,11 @@ spec: cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" ``` +When you deploy a new version of an app, Flagger gradually shifts traffic to the canary, +and at the same time, measures the requests success rate as well as the average response duration. +You can extend the canary analysis with custom metrics, acceptance and load testing +to harden the validation process of your app release process. + ### Canary target A canary resource can target a Kubernetes Deployment or DaemonSet. @@ -139,6 +147,28 @@ This ensures that traffic to `podinfo.test:9898` will be routed to the latest st The `podinfo-canary.test:9898` address is available only during the canary analysis and can be used for conformance testing or load testing. 
+Besides the port mapping, the service specification can contain URI match and rewrite rules, +timeout and retry policies: + +```yaml +spec: + service: + port: 9898 + match: + - uri: + prefix: / + rewrite: + uri: / + retries: + attempts: 3 + perTryTimeout: 1s + timeout: 5s +``` + +When using **Istio** as the mesh provider, you can also specify +HTTP header operations, CORS and traffic policies, Istio gateways and hosts. +The Istio routing configuration can be found [here](faq.md#istio-routing). + ### Canary status You can use kubectl to get the current status of canary deployments cluster wide: @@ -207,7 +237,7 @@ kubectl wait canary/podinfo --for=condition=promoted --timeout=5m kubectl get canary/podinfo | grep Succeeded ``` -### Canary Analysis +### Canary analysis The canary analysis defines: * the type of [deployment strategy](usage/deployment-strategies.md) @@ -250,4 +280,3 @@ Spec: The canary analysis runs periodically until it reaches the maximum traffic weight or the number of iterations. On each run, Flagger calls the webhooks, checks the metrics and if the failed checks threshold is reached, stops the analysis and rolls back the canary. If alerting is configured, Flagger will post the analysis result using the alert providers. - From 5aa9dd154c7fc4107aa4403b20ae33193f5ef3ad Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Thu, 27 Feb 2020 11:30:53 +0200 Subject: [PATCH 04/13] Add datadog metric provider to docs Ref: #460 --- docs/gitbook/usage/metrics.md | 128 ++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 15 deletions(-) diff --git a/docs/gitbook/usage/metrics.md b/docs/gitbook/usage/metrics.md index 60c4ffe4..32984ef4 100644 --- a/docs/gitbook/usage/metrics.md +++ b/docs/gitbook/usage/metrics.md @@ -5,7 +5,7 @@ availability, error rate percentage, average response time and any other objecti If a drop in performance is noticed during the SLOs analysis, the release will be automatically rolled back with minimum impact to end-users. 
-### Builtin Metrics +### Builtin metrics Flagger comes with two builtin metric checks: HTTP request success rate and duration. @@ -31,12 +31,59 @@ and the window size or the time series with `interval`. The builtin checks are available for every service mesh / ingress controller and are implemented with [Prometheus queries](../faq.md#metrics). -### Custom Metrics +### Custom metrics The canary analysis can be extended with custom metric checks. Using a `MetricTemplate` custom resource, you configure Flagger to connect to a metric provider and run a query that returns a `float64` value. The query result is used to validate the canary based on the specified threshold range. +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: my-metric +spec: + provider: + type: # can be prometheus or datadog + address: # API URL + secretRef: + name: # name of the secret containing the API credentials + query: # metric query +``` + +The following variables are available in query templates: + +- `name` (canary.metadata.name) +- `namespace` (canary.metadata.namespace) +- `target` (canary.spec.targetRef.name) +- `service` (canary.spec.service.name) +- `ingress` (canary.spec.ingressRef.name) +- `interval` (canary.spec.canaryAnalysis.metrics[].interval) + +A canary analysis metric can reference a template with `templateRef`: + +```yaml + canaryAnalysis: + metrics: + - name: "my metric" + templateRef: + name: my-metric + # namespace is optional + # when not specified, the canary namespace will be used + namespace: flagger + # accepted values + thresholdRange: + min: 10 + max: 1000 + # metric query time window + interval: 1m +``` + +### Prometheus + +You can create custom metric checks targeting a Prometheus server +by setting the provider type to `prometheus` and writing the query in PromQL. 
+ Prometheus template example: ```yaml @@ -72,16 +119,7 @@ spec: ) * 100 ``` -The following variables are available in templates: - -- `name` (canary.metadata.name) -- `namespace` (canary.metadata.namespace) -- `target` (canary.spec.targetRef.name) -- `service` (canary.spec.service.name) -- `ingress` (canary.spec.ingresRef.name) -- `interval` (canary.spec.canaryAnalysis.metrics[].interval) - -A canary analysis metric can reference a template with `templateRef`: +Reference the template in the canary analysis: ```yaml canaryAnalysis: @@ -89,8 +127,6 @@ A canary analysis metric can reference a template with `templateRef`: - name: "404s percentage" templateRef: name: not-found-percentage - # namespace is optional - # when not specified, the canary namespace will be used namespace: istio-system thresholdRange: max: 5 @@ -134,4 +170,66 @@ spec: ) * 100 ``` -The above template is for gPRC services instrumented with [go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus). \ No newline at end of file +The above template is for gRPC services instrumented with [go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus). + +### Datadog + +You can create custom metric checks using the Datadog provider. 
+ +Create a secret with your Datadog API credentials: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: datadog + namespace: istio-system +data: + datadog_api_key: your-datadog-api-key + datadog_application_key: your-datadog-application-key +``` + +Datadog template example: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: not-found-percentage + namespace: istio-system +spec: + provider: + type: datadog + address: https://api.datadoghq.com + secretRef: + name: datadog + query: | + 100 - ( + sum:istio.mesh.request.count{ + reporter:destination, + destination_workload_namespace:{{ namespace }}, + destination_workload:{{ target }}, + !response_code:404 + }.as_count() + / + sum:istio.mesh.request.count{ + reporter:destination, + destination_workload_namespace:{{ namespace }}, + destination_workload:{{ target }} + }.as_count() + ) * 100 +``` + +Reference the template in the canary analysis: + +```yaml + canaryAnalysis: + metrics: + - name: "404s percentage" + templateRef: + name: not-found-percentage + namespace: istio-system + thresholdRange: + max: 5 + interval: 1m +``` From 98ecae93e1c8b01829faefd64be3d80ccd97a4ef Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Thu, 27 Feb 2020 12:06:33 +0200 Subject: [PATCH 05/13] Set API version to v1beta1 in docs examples --- docs/gitbook/dev/release-guide.md | 2 +- docs/gitbook/faq.md | 20 +++++++++---------- docs/gitbook/how-it-works.md | 11 +++++----- .../install/flagger-install-on-kubernetes.md | 2 +- .../tutorials/appmesh-progressive-delivery.md | 2 +- .../tutorials/contour-progressive-delivery.md | 2 +- .../crossover-progressive-delivery.md | 2 +- docs/gitbook/tutorials/flagger-smi-istio.md | 2 +- .../tutorials/gloo-progressive-delivery.md | 2 +- docs/gitbook/tutorials/istio-ab-testing.md | 2 +- .../tutorials/istio-progressive-delivery.md | 4 ++-- .../tutorials/kubernetes-blue-green.md | 2 +- .../tutorials/linkerd-progressive-delivery.md | 4 ++-- 
.../tutorials/nginx-progressive-delivery.md | 2 +- .../tutorials/zero-downtime-deployments.md | 2 +- 15 files changed, 31 insertions(+), 30 deletions(-) diff --git a/docs/gitbook/dev/release-guide.md b/docs/gitbook/dev/release-guide.md index 2140f60a..dc186998 100644 --- a/docs/gitbook/dev/release-guide.md +++ b/docs/gitbook/dev/release-guide.md @@ -1,4 +1,4 @@ -# Flagger Release Guide +# Release Guide This document describes how to release Flagger. diff --git a/docs/gitbook/faq.md b/docs/gitbook/faq.md index 3d13f0a0..024be4bd 100644 --- a/docs/gitbook/faq.md +++ b/docs/gitbook/faq.md @@ -33,7 +33,7 @@ Mirroring should be used for requests that are **idempotent** or capable of bein Assuming the app name is podinfo you can define a canary like: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo @@ -146,7 +146,7 @@ spec: You can enable port discovery so that Prometheus will be able to reach port `9090` over mTLS: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary spec: service: @@ -348,7 +348,7 @@ The following spec exposes the `frontend` workload inside the mesh on `frontend. and outside the mesh on `frontend.example.com`. You'll have to specify an Istio ingress gateway for external hosts. 
```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: frontend @@ -409,7 +409,7 @@ metadata: name: frontend namespace: test ownerReferences: - - apiVersion: flagger.app/v1alpha3 + - apiVersion: flagger.app/v1beta1 blockOwnerDeletion: true controller: true kind: Canary @@ -486,7 +486,7 @@ To expose a workload inside the mesh on `http://backend.test.svc.cluster.local:9 the service spec can contain only the container port and the traffic policy: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: backend @@ -507,7 +507,7 @@ kind: Service metadata: name: backend-primary ownerReferences: - - apiVersion: flagger.app/v1alpha3 + - apiVersion: flagger.app/v1beta1 blockOwnerDeletion: true controller: true kind: Canary @@ -535,7 +535,7 @@ Assuming you have two apps, one that servers the main website and one that serve For each app you can define a canary object as: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: website @@ -552,7 +552,7 @@ spec: rewrite: uri: / --- -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: webapi @@ -583,7 +583,7 @@ Note that host merging only works if the canaries are bounded to a ingress gatew When deploying Istio with global mTLS enabled, you have to set the TLS mode to `ISTIO_MUTUAL`: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary spec: service: @@ -595,7 +595,7 @@ spec: If you run Istio in permissive mode you can disable TLS: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary spec: service: diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 9defb689..5b714717 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -1,9 +1,7 @@ # How it works -[Flagger](https://github.com/weaveworks/flagger) takes a Kubernetes 
deployment and optionally -a horizontal pod autoscaler (HPA) and creates a series of objects -(Kubernetes deployments, ClusterIP services, virtual service, traffic split or ingress) -to drive the canary analysis and promotion. +[Flagger](https://github.com/weaveworks/flagger) can be configured to automate the release process +for Kubernetes workloads with a custom resource named canary. ### Canary custom resource @@ -13,7 +11,7 @@ and is portable across clusters, service meshes and ingress providers. For a deployment named _podinfo_, a canary release with progressive traffic shifting can be defined as: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo @@ -48,6 +46,9 @@ and at the same time, measures the requests success rate as well as the average You can extend the canary analysis with custom metrics, acceptance and load testing to harden the validation process of your app release process. +If you are running multiple service meshes or ingress controllers in the same cluster, +you can override the global provider for a specific canary with `spec.provider`. + ### Canary target A canary resource can target a Kubernetes Deployment or DaemonSet. diff --git a/docs/gitbook/install/flagger-install-on-kubernetes.md b/docs/gitbook/install/flagger-install-on-kubernetes.md index d9dfbb1f..48bdc517 100644 --- a/docs/gitbook/install/flagger-install-on-kubernetes.md +++ b/docs/gitbook/install/flagger-install-on-kubernetes.md @@ -222,7 +222,7 @@ that have the `prometheus.io/scrape: "true"` annotation. 
To target a different provider you can specify it in the canary custom resource: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: app diff --git a/docs/gitbook/tutorials/appmesh-progressive-delivery.md b/docs/gitbook/tutorials/appmesh-progressive-delivery.md index cf2d133c..b622b445 100644 --- a/docs/gitbook/tutorials/appmesh-progressive-delivery.md +++ b/docs/gitbook/tutorials/appmesh-progressive-delivery.md @@ -51,7 +51,7 @@ helm upgrade -i flagger-loadtester flagger/loadtester \ Create a canary custom resource: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/contour-progressive-delivery.md b/docs/gitbook/tutorials/contour-progressive-delivery.md index 59760799..99cb302f 100644 --- a/docs/gitbook/tutorials/contour-progressive-delivery.md +++ b/docs/gitbook/tutorials/contour-progressive-delivery.md @@ -63,7 +63,7 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/podinfo Create a canary custom resource \(replace `app.example.com` with your own domain\): ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/crossover-progressive-delivery.md b/docs/gitbook/tutorials/crossover-progressive-delivery.md index 31ce7098..49eb1cb1 100644 --- a/docs/gitbook/tutorials/crossover-progressive-delivery.md +++ b/docs/gitbook/tutorials/crossover-progressive-delivery.md @@ -82,7 +82,7 @@ helm upgrade -i flagger-loadtester flagger/loadtester \ Create a canary custom resource: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/flagger-smi-istio.md b/docs/gitbook/tutorials/flagger-smi-istio.md index 902e3c16..7add409f 100644 --- a/docs/gitbook/tutorials/flagger-smi-istio.md +++ b/docs/gitbook/tutorials/flagger-smi-istio.md @@ -106,7 
+106,7 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/tester Create a canary custom resource \(replace example.com with your own domain\): ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/gloo-progressive-delivery.md b/docs/gitbook/tutorials/gloo-progressive-delivery.md index 4e3624b6..e5e24774 100644 --- a/docs/gitbook/tutorials/gloo-progressive-delivery.md +++ b/docs/gitbook/tutorials/gloo-progressive-delivery.md @@ -91,7 +91,7 @@ kubectl apply -f ./podinfo-virtualservice.yaml Create a canary custom resource \(replace `app.example.com` with your own domain\): ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/istio-ab-testing.md b/docs/gitbook/tutorials/istio-ab-testing.md index deec896c..fc362b84 100644 --- a/docs/gitbook/tutorials/istio-ab-testing.md +++ b/docs/gitbook/tutorials/istio-ab-testing.md @@ -30,7 +30,7 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/tester Create a canary custom resource \(replace example.com with your own domain\): ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/istio-progressive-delivery.md b/docs/gitbook/tutorials/istio-progressive-delivery.md index 5fc944f0..a3de59b2 100644 --- a/docs/gitbook/tutorials/istio-progressive-delivery.md +++ b/docs/gitbook/tutorials/istio-progressive-delivery.md @@ -30,7 +30,7 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/tester Create a canary custom resource \(replace example.com with your own domain\): ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo @@ -260,7 +260,7 @@ Note that mirroring should be used for requests that are **idempotent** or capab You can enable mirroring by replacing 
`stepWeight/maxWeight` with `iterations` and by setting `canaryAnalysis.mirror` to `true`: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/kubernetes-blue-green.md b/docs/gitbook/tutorials/kubernetes-blue-green.md index be0bcde1..52fe9170 100644 --- a/docs/gitbook/tutorials/kubernetes-blue-green.md +++ b/docs/gitbook/tutorials/kubernetes-blue-green.md @@ -69,7 +69,7 @@ kubectl -n test apply -f ${REPO}/artifacts/loadtester/service.yaml Create a canary custom resource: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/linkerd-progressive-delivery.md b/docs/gitbook/tutorials/linkerd-progressive-delivery.md index 8d32efcb..1bf7a16f 100644 --- a/docs/gitbook/tutorials/linkerd-progressive-delivery.md +++ b/docs/gitbook/tutorials/linkerd-progressive-delivery.md @@ -44,7 +44,7 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/podinfo Create a canary custom resource for the podinfo deployment: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo @@ -366,7 +366,7 @@ Besides weighted routing, Flagger can be configured to route traffic to the cana Edit podinfo canary analysis, set the provider to `nginx`, add the ingress reference, remove the max/step weight and add the match conditions and iterations: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/nginx-progressive-delivery.md b/docs/gitbook/tutorials/nginx-progressive-delivery.md index 8a8ce8c1..b8e80d84 100644 --- a/docs/gitbook/tutorials/nginx-progressive-delivery.md +++ b/docs/gitbook/tutorials/nginx-progressive-delivery.md @@ -96,7 +96,7 @@ kubectl apply -f ./podinfo-ingress.yaml Create a canary custom resource \(replace `app.example.com` with your own 
domain\): ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo diff --git a/docs/gitbook/tutorials/zero-downtime-deployments.md b/docs/gitbook/tutorials/zero-downtime-deployments.md index c85e4442..e72e4f95 100644 --- a/docs/gitbook/tutorials/zero-downtime-deployments.md +++ b/docs/gitbook/tutorials/zero-downtime-deployments.md @@ -166,7 +166,7 @@ The above HPA ensures your app will be scaled up before the pods reach the CPU o To minimise the impact of downscaling operations you can make use of Envoy retry capabilities. ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary spec: service: From a9fba0a1f274395bd7a3dbe21b5678b78b0dcf72 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Fri, 28 Feb 2020 13:24:09 +0200 Subject: [PATCH 06/13] docs: rename canaryAnalysis to analysis --- README.md | 4 ++-- docs/gitbook/faq.md | 4 ++-- docs/gitbook/how-it-works.md | 4 ++-- .../tutorials/appmesh-progressive-delivery.md | 4 ++-- .../tutorials/contour-progressive-delivery.md | 4 ++-- .../crossover-progressive-delivery.md | 2 +- docs/gitbook/tutorials/flagger-smi-istio.md | 2 +- .../tutorials/gloo-progressive-delivery.md | 4 ++-- docs/gitbook/tutorials/istio-ab-testing.md | 2 +- .../tutorials/istio-progressive-delivery.md | 4 ++-- .../gitbook/tutorials/kubernetes-blue-green.md | 6 +++--- .../tutorials/linkerd-progressive-delivery.md | 6 +++--- .../tutorials/nginx-progressive-delivery.md | 6 +++--- docs/gitbook/usage/alerting.md | 2 +- docs/gitbook/usage/deployment-strategies.md | 14 +++++++------- docs/gitbook/usage/metrics.md | 8 ++++---- docs/gitbook/usage/monitoring.md | 2 +- docs/gitbook/usage/webhooks.md | 18 +++++++++--------- 18 files changed, 48 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 2147be3c..2dd3dc88 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ When promoting a workload in production, both code (container images) and config For a 
deployment named _podinfo_, a canary promotion can be defined using Flagger's custom resource: ```yaml -apiVersion: flagger.app/v1alpha3 +apiVersion: flagger.app/v1beta1 kind: Canary metadata: name: podinfo @@ -110,7 +110,7 @@ spec: # promote the canary without analysing it (default false) skipAnalysis: false # define the canary analysis timing and KPIs - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # max number of failed metric checks before rollback diff --git a/docs/gitbook/faq.md b/docs/gitbook/faq.md index 024be4bd..b42e9b00 100644 --- a/docs/gitbook/faq.md +++ b/docs/gitbook/faq.md @@ -232,7 +232,7 @@ Flagger measures the request success rate and duration using Prometheus queries. Spec: ```yaml - canaryAnalysis: + analysis: metrics: - name: request-success-rate # minimum req success rate (non 5xx responses) @@ -294,7 +294,7 @@ sum( Spec: ```yaml - canaryAnalysis: + analysis: metrics: - name: request-duration # maximum req duration P99 diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 5b714717..d4206580 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -22,7 +22,7 @@ spec: name: podinfo service: port: 9898 - canaryAnalysis: + analysis: interval: 1m threshold: 10 maxWeight: 50 @@ -249,7 +249,7 @@ The canary analysis defines: Spec: ```yaml - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: # max number of failed metric checks before rollback diff --git a/docs/gitbook/tutorials/appmesh-progressive-delivery.md b/docs/gitbook/tutorials/appmesh-progressive-delivery.md index b622b445..f118dad4 100644 --- a/docs/gitbook/tutorials/appmesh-progressive-delivery.md +++ b/docs/gitbook/tutorials/appmesh-progressive-delivery.md @@ -92,7 +92,7 @@ spec: perTryTimeout: 5s retryOn: "gateway-error,client-error,stream-error" # define the canary analysis timing and KPIs - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # max number of failed 
metric checks before rollback @@ -329,7 +329,7 @@ Besides weighted routing, Flagger can be configured to route traffic to the cana Edit the canary analysis, remove the max/step weight and add the match conditions and iterations: ```yaml - canaryAnalysis: + analysis: interval: 1m threshold: 5 iterations: 10 diff --git a/docs/gitbook/tutorials/contour-progressive-delivery.md b/docs/gitbook/tutorials/contour-progressive-delivery.md index 99cb302f..de432e32 100644 --- a/docs/gitbook/tutorials/contour-progressive-delivery.md +++ b/docs/gitbook/tutorials/contour-progressive-delivery.md @@ -91,7 +91,7 @@ spec: attempts: 3 perTryTimeout: 5s # define the canary analysis timing and KPIs - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 30s # max number of failed metric checks before rollback @@ -332,7 +332,7 @@ Besides weighted routing, Flagger can be configured to route traffic to the cana Edit the canary analysis, remove the max/step weight and add the match conditions and iterations: ```yaml -canaryAnalysis: +analysis: interval: 1m threshold: 5 iterations: 10 diff --git a/docs/gitbook/tutorials/crossover-progressive-delivery.md b/docs/gitbook/tutorials/crossover-progressive-delivery.md index 49eb1cb1..bf6532e3 100644 --- a/docs/gitbook/tutorials/crossover-progressive-delivery.md +++ b/docs/gitbook/tutorials/crossover-progressive-delivery.md @@ -109,7 +109,7 @@ spec: # container port number or name (optional) targetPort: 9898 # define the canary analysis timing and KPIs - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # max number of failed metric checks before rollback diff --git a/docs/gitbook/tutorials/flagger-smi-istio.md b/docs/gitbook/tutorials/flagger-smi-istio.md index 7add409f..55580a1c 100644 --- a/docs/gitbook/tutorials/flagger-smi-istio.md +++ b/docs/gitbook/tutorials/flagger-smi-istio.md @@ -134,7 +134,7 @@ spec: # Istio virtual service host names (optional) hosts: - app.example.com - canaryAnalysis: + 
analysis: # schedule interval (default 60s) interval: 10s # max number of failed metric checks before rollback diff --git a/docs/gitbook/tutorials/gloo-progressive-delivery.md b/docs/gitbook/tutorials/gloo-progressive-delivery.md index e5e24774..50f31575 100644 --- a/docs/gitbook/tutorials/gloo-progressive-delivery.md +++ b/docs/gitbook/tutorials/gloo-progressive-delivery.md @@ -113,7 +113,7 @@ spec: port: 9898 # container port number or name (optional) targetPort: 9898 - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 10s # max number of failed metric checks before rollback @@ -299,7 +299,7 @@ The demo app is instrumented with Prometheus so you can create a custom check th Edit the canary analysis and add the following metric: ```yaml - canaryAnalysis: + analysis: metrics: - name: "404s percentage" threshold: 5 diff --git a/docs/gitbook/tutorials/istio-ab-testing.md b/docs/gitbook/tutorials/istio-ab-testing.md index fc362b84..8196460d 100644 --- a/docs/gitbook/tutorials/istio-ab-testing.md +++ b/docs/gitbook/tutorials/istio-ab-testing.md @@ -63,7 +63,7 @@ spec: tls: # use ISTIO_MUTUAL when mTLS is enabled mode: DISABLE - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # total number of iterations diff --git a/docs/gitbook/tutorials/istio-progressive-delivery.md b/docs/gitbook/tutorials/istio-progressive-delivery.md index a3de59b2..5116491a 100644 --- a/docs/gitbook/tutorials/istio-progressive-delivery.md +++ b/docs/gitbook/tutorials/istio-progressive-delivery.md @@ -70,7 +70,7 @@ spec: attempts: 3 perTryTimeout: 1s retryOn: "gateway-error,connect-failure,refused-stream" - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # max number of failed metric checks before rollback @@ -266,7 +266,7 @@ metadata: name: podinfo namespace: test spec: - canaryAnalysis: + analysis: # schedule interval interval: 1m # max number of failed metric checks before rollback diff --git 
a/docs/gitbook/tutorials/kubernetes-blue-green.md b/docs/gitbook/tutorials/kubernetes-blue-green.md index 52fe9170..55db917c 100644 --- a/docs/gitbook/tutorials/kubernetes-blue-green.md +++ b/docs/gitbook/tutorials/kubernetes-blue-green.md @@ -93,7 +93,7 @@ spec: service: port: 9898 portDiscovery: true - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 30s # max number of failed checks before rollback @@ -258,7 +258,7 @@ The analysis can be extended with Prometheus queries. The demo app is instrument Edit the canary analysis and add the following metric: ```yaml - canaryAnalysis: + analysis: metrics: - name: "404s percentage" threshold: 5 @@ -333,7 +333,7 @@ When deployed the Helm tester API will be available at `http://flagger-helmteste Add a helm test pre-rollout hook to your chart: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "conformance testing" type: pre-rollout diff --git a/docs/gitbook/tutorials/linkerd-progressive-delivery.md b/docs/gitbook/tutorials/linkerd-progressive-delivery.md index 1bf7a16f..bc446726 100644 --- a/docs/gitbook/tutorials/linkerd-progressive-delivery.md +++ b/docs/gitbook/tutorials/linkerd-progressive-delivery.md @@ -68,7 +68,7 @@ spec: port: 9898 # container port number or name (optional) targetPort: 9898 - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 30s # max number of failed metric checks before rollback @@ -256,7 +256,7 @@ The canary analysis can be extended with Prometheus queries. Let's a define a check for not found errors. 
Edit the canary analysis and add the following metric: ```yaml - canaryAnalysis: + analysis: metrics: - name: "404s percentage" threshold: 3 @@ -389,7 +389,7 @@ spec: service: # container port port: 9898 - canaryAnalysis: + analysis: interval: 1m threshold: 10 iterations: 10 diff --git a/docs/gitbook/tutorials/nginx-progressive-delivery.md b/docs/gitbook/tutorials/nginx-progressive-delivery.md index b8e80d84..cbe824c2 100644 --- a/docs/gitbook/tutorials/nginx-progressive-delivery.md +++ b/docs/gitbook/tutorials/nginx-progressive-delivery.md @@ -126,7 +126,7 @@ spec: port: 80 # container port number or name targetPort: 9898 - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 10s # max number of failed metric checks before rollback @@ -291,7 +291,7 @@ The demo app is instrumented with Prometheus so you can create a custom check th Edit the canary analysis and add the following metric: ```yaml - canaryAnalysis: + analysis: metrics: - name: "latency" threshold: 0.5 @@ -353,7 +353,7 @@ Besides weighted routing, Flagger can be configured to route traffic to the cana Edit the canary analysis, remove the max/step weight and add the match conditions and iterations: ```yaml - canaryAnalysis: + analysis: interval: 1m threshold: 10 iterations: 10 diff --git a/docs/gitbook/usage/alerting.md b/docs/gitbook/usage/alerting.md index f408e404..4671be93 100644 --- a/docs/gitbook/usage/alerting.md +++ b/docs/gitbook/usage/alerting.md @@ -85,7 +85,7 @@ the address in the secret will take precedence over the **address** field in the The canary analysis can have a list of alerts, each alert referencing an alert provider: ```yaml - canaryAnalysis: + analysis: alerts: - name: "on-call Slack" severity: error diff --git a/docs/gitbook/usage/deployment-strategies.md b/docs/gitbook/usage/deployment-strategies.md index 2775f73a..633b9b65 100644 --- a/docs/gitbook/usage/deployment-strategies.md +++ b/docs/gitbook/usage/deployment-strategies.md @@ -32,7 +32,7 @@ The canary 
analysis runs periodically until it reaches the maximum traffic weigh Spec: ```yaml - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # max number of failed metric checks before rollback @@ -122,7 +122,7 @@ If Flagger finds a HTTP match condition, it will ignore the `maxWeight` and `ste Istio example: ```yaml - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # total number of iterations @@ -155,7 +155,7 @@ interval * threshold App Mesh example: ```yaml - canaryAnalysis: + analysis: interval: 1m threshold: 10 iterations: 2 @@ -170,7 +170,7 @@ Note that App Mesh supports a single condition. Contour example: ```yaml - canaryAnalysis: + analysis: interval: 1m threshold: 10 iterations: 2 @@ -185,7 +185,7 @@ Note that Contour does not support regex, you can use prefix, suffix or exact. NGINX example: ```yaml - canaryAnalysis: + analysis: interval: 1m threshold: 10 iterations: 2 @@ -217,7 +217,7 @@ with Kubernetes L4 networking. When using Istio you have the option to mirror tr You can use the blue/green deployment strategy by replacing `stepWeight/maxWeight` with `iterations` in the `canaryAnalysis` spec: ```yaml - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # total number of iterations @@ -268,7 +268,7 @@ To use mirroring, set `spec.canaryAnalysis.mirror` to `true`. Istio example: ```yaml - canaryAnalysis: + analysis: # schedule interval (default 60s) interval: 1m # total number of iterations diff --git a/docs/gitbook/usage/metrics.md b/docs/gitbook/usage/metrics.md index 32984ef4..3c037cf1 100644 --- a/docs/gitbook/usage/metrics.md +++ b/docs/gitbook/usage/metrics.md @@ -10,7 +10,7 @@ the release will be automatically rolled back with minimum impact to end-users. Flagger comes with two builtin metric checks: HTTP request success rate and duration. 
```yaml - canaryAnalysis: + analysis: metrics: - name: request-success-rate interval: 1m @@ -63,7 +63,7 @@ The following variables are available in query templates: A canary analysis metric can reference a template with `templateRef`: ```yaml - canaryAnalysis: + analysis: metrics: - name: "my metric" templateRef: @@ -122,7 +122,7 @@ spec: Reference the template in the canary analysis: ```yaml - canaryAnalysis: + analysis: metrics: - name: "404s percentage" templateRef: @@ -223,7 +223,7 @@ spec: Reference the template in the canary analysis: ```yaml - canaryAnalysis: + analysis: metrics: - name: "404s percentage" templateRef: diff --git a/docs/gitbook/usage/monitoring.md b/docs/gitbook/usage/monitoring.md index 3eb41990..15dc10e7 100644 --- a/docs/gitbook/usage/monitoring.md +++ b/docs/gitbook/usage/monitoring.md @@ -84,7 +84,7 @@ Example: The event webhook can be overwritten at canary level with: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "send to Slack" type: event diff --git a/docs/gitbook/usage/webhooks.md b/docs/gitbook/usage/webhooks.md index 7d574dce..16056db5 100644 --- a/docs/gitbook/usage/webhooks.md +++ b/docs/gitbook/usage/webhooks.md @@ -25,7 +25,7 @@ every action that Flagger takes during a canary deployment will be sent as JSON Spec: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "start gate" type: confirm-rollout @@ -251,7 +251,7 @@ When deployed the Helm tester API will be available at `http://flagger-helmteste Now you can add pre-rollout webhooks to the canary analysis spec: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "smoke test" type: pre-rollout @@ -268,7 +268,7 @@ If the helm test fails, Flagger will retry until the analysis threshold is reach If you are using Helm v3, you'll have to create a dedicated service account and add the release namespace to the test command: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "smoke test" type: pre-rollout @@ -282,7 +282,7 @@ If you are using Helm v3, you'll have 
to create a dedicated service account and As an alternative to Helm you can use the [Bash Automated Testing System](https://github.com/bats-core/bats-core) to run your tests. ```yaml - canaryAnalysis: + analysis: webhooks: - name: "acceptance tests" type: pre-rollout @@ -308,7 +308,7 @@ will shift all traffic back to the primary instance and fail the canary. Manual gating with Flagger's tester: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "gate" type: confirm-rollout @@ -322,7 +322,7 @@ If you have notifications enabled, Flagger will post a message to Slack or MS Te Change the URL to `/gate/approve` to start the canary analysis: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "gate" type: confirm-rollout @@ -332,7 +332,7 @@ Change the URL to `/gate/approve` to start the canary analysis: Manual gating can be driven with Flagger's tester API. Set the confirmation URL to `/gate/check`: ```yaml - canaryAnalysis: + analysis: webhooks: - name: "ask for confirmation" type: confirm-rollout @@ -366,7 +366,7 @@ The `confirm-promotion` hook type can be used to manually approve the canary pro While the promotion is paused, Flagger will continue to run the metrics checks and load tests. ```yaml - canaryAnalysis: + analysis: webhooks: - name: "promotion gate" type: confirm-promotion @@ -377,7 +377,7 @@ The `rollback` hook type can be used to manually rollback the canary promotion. 
with Flagger's tester API by setting the rollback URL to `/rollback/check` ```yaml - canaryAnalysis: + analysis: webhooks: - name: "rollback" type: rollback From be4c67540d1664b4c5c28f598efcb9bb6ba8ae03 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Fri, 28 Feb 2020 18:46:26 +0200 Subject: [PATCH 07/13] build: make release compatible with go mod --- .circleci/config.yml | 2 +- Makefile | 11 +++++++---- test/container-push.sh | 7 +++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 867044a5..a7023529 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -199,7 +199,7 @@ jobs: - run: name: Publish charts command: | - if echo "${CIRCLE_TAG}" | grep -Eq "[0-9]+(\.[0-9]+)*(-[a-z]+)?$"; then + if echo "${CIRCLE_TAG}" | grep v; then REPOSITORY="https://weaveworksbot:${GITHUB_TOKEN}@github.com/weaveworks/flagger.git" git config user.email weaveworksbot@users.noreply.github.com git config user.name weaveworksbot diff --git a/Makefile b/Makefile index 9e2dddf3..02edad79 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,9 @@ test-fmt: gofmt -l -s ./ | grep ".*\.go"; if [ "$$?" = "0" ]; then exit 1; fi goimports -l ./ | grep ".*\.go"; if [ "$$?" 
= "0" ]; then exit 1; fi +codegen: + ./hack/update-codegen.sh + test-codegen: ./hack/verify-codegen.sh @@ -42,16 +45,16 @@ version-set: echo "Version $$next set in code, deployment, chart and kustomize" release: - git tag $(VERSION) - git push origin $(VERSION) + git tag "v$(VERSION)" + git push origin "v$(VERSION)" release-notes: cd /tmp && GH_REL_URL="https://github.com/buchanae/github-release-notes/releases/download/0.2.0/github-release-notes-linux-amd64-0.2.0.tar.gz" && \ curl -sSL $${GH_REL_URL} | tar xz && sudo mv github-release-notes /usr/local/bin/ loadtester-build: - GO111MODULE=on CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o ./bin/loadtester ./cmd/loadtester/* + CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o ./bin/loadtester ./cmd/loadtester/* + docker build -t weaveworks/flagger-loadtester:$(LT_VERSION) . -f Dockerfile.loadtester loadtester-push: - docker build -t weaveworks/flagger-loadtester:$(LT_VERSION) . -f Dockerfile.loadtester docker push weaveworks/flagger-loadtester:$(LT_VERSION) diff --git a/test/container-push.sh b/test/container-push.sh index bf2a7fe1..66dec947 100755 --- a/test/container-push.sh +++ b/test/container-push.sh @@ -2,7 +2,6 @@ set -o errexit - push () { echo $DOCKER_PASS | docker login -u=$DOCKER_USER --password-stdin @@ -11,8 +10,9 @@ push () { docker tag test/flagger:latest weaveworks/flagger:${BRANCH_COMMIT}; docker push weaveworks/flagger:${BRANCH_COMMIT}; else - docker tag test/flagger:latest weaveworks/flagger:${CIRCLE_TAG}; - docker push weaveworks/flagger:${CIRCLE_TAG}; + VER=${CIRCLE_TAG:1} + docker tag test/flagger:latest weaveworks/flagger:${VER}; + docker push weaveworks/flagger:${VER}; fi } @@ -21,4 +21,3 @@ if [[ -z "$DOCKER_PASS" ]]; then else push fi - From 49c088595e1b2d57ede6477eda5b0aac1dfa1336 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Fri, 28 Feb 2020 18:46:52 +0200 Subject: [PATCH 08/13] Add code changes section to dev docs --- docs/gitbook/dev/dev-guide.md | 24 
++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/docs/gitbook/dev/dev-guide.md b/docs/gitbook/dev/dev-guide.md index 43a004f7..9e6e557b 100644 --- a/docs/gitbook/dev/dev-guide.md +++ b/docs/gitbook/dev/dev-guide.md @@ -58,18 +58,22 @@ Download Go modules: go mod download ``` -Build Flagger container image: +Build Flagger binary and container image: ```bash make build ``` -Run unit tests: +Build load tester binary and container image: ```bash -make test +make loadtester-build ``` +### Code changes + +Before submitting a PR, make sure your changes are covered by unit tests. + If you made changes to `go.mod` run: ```bash @@ -79,7 +83,19 @@ go mod tidy If you made changes to `pkg/apis` regenerate Kubernetes client sets with: ```bash -./hack/update-codegen.sh +make codegen +``` + +Run code formatters: + +```bash +make fmt +``` + +Run unit tests: + +```bash +make test ``` ### Manual testing From c4a9712b818eff2751193dbfa9a2afc628c82964 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 29 Feb 2020 11:37:18 +0200 Subject: [PATCH 09/13] docs: add getting started section --- docs/gitbook/README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/docs/gitbook/README.md b/docs/gitbook/README.md index 5d8e60df..8452986a 100644 --- a/docs/gitbook/README.md +++ b/docs/gitbook/README.md @@ -21,3 +21,28 @@ it can be used in **GitOps** pipelines together with Flux CD or JenkinsX. This project is sponsored by [Weaveworks](https://www.weave.works/) +## Getting started + +To get started with Flagger, choose one of the supported routing providers +and [install](install/flagger-install-on-kubernetes.md) Flagger with Helm or Kustomize.
+ +After installing Flagger you can follow one of the tutorials: + +**Service mesh tutorials** + +* [Istio](tutorials/istio-progressive-delivery.md) +* [Linkerd](tutorials/linkerd-progressive-delivery.md) +* [AWS App Mesh](tutorials/appmesh-progressive-delivery.md) + +**Ingress controller tutorials** + +* [Contour](tutorials/contour-progressive-delivery.md) +* [Gloo](tutorials/gloo-progressive-delivery.md) +* [NGINX Ingress](tutorials/nginx-progressive-delivery.md) + +**Hands-on GitOps workshops** + +* [Istio](https://github.com/stefanprodan/gitops-istio) +* [Linkerd](https://helm.workshop.flagger.dev) +* [AWS App Mesh](https://eks.hands-on.flagger.dev) + From bf0499e8a64a5b5a5b56fdbdd3d2853ab6c42c36 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 29 Feb 2020 11:38:04 +0200 Subject: [PATCH 10/13] docs: use metric providers in tutorials --- .gitbook.yaml | 1 + docs/gitbook/SUMMARY.md | 3 +- docs/gitbook/faq.md | 5 + .../tutorials/appmesh-progressive-delivery.md | 1 + .../tutorials/contour-progressive-delivery.md | 47 ++++++--- .../crossover-progressive-delivery.md | 21 ++-- .../tutorials/gloo-progressive-delivery.md | 98 +++++++++++-------- docs/gitbook/tutorials/istio-ab-testing.md | 4 + .../tutorials/istio-progressive-delivery.md | 6 +- .../tutorials/kubernetes-blue-green.md | 89 +++++++++++------ .../tutorials/linkerd-progressive-delivery.md | 48 ++++++--- .../tutorials/nginx-progressive-delivery.md | 73 ++++++++++---- docs/gitbook/{ => usage}/how-it-works.md | 10 +- 13 files changed, 271 insertions(+), 135 deletions(-) rename docs/gitbook/{ => usage}/how-it-works.md (96%) diff --git a/.gitbook.yaml b/.gitbook.yaml index ef0ea31a..bdd2977a 100644 --- a/.gitbook.yaml +++ b/.gitbook.yaml @@ -1,6 +1,7 @@ root: ./docs/gitbook redirects: + how-it-works: usage/how-it-works.md usage/progressive-delivery: tutorials/istio-progressive-delivery.md usage/ab-testing: tutorials/istio-ab-testing.md usage/blue-green: tutorials/kubernetes-blue-green.md diff --git
a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index d51ea045..5267e07a 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -1,7 +1,6 @@ # Table of contents * [Introduction](README.md) -* [How it works](how-it-works.md) * [FAQ](faq.md) ## Install @@ -9,10 +8,10 @@ * [Flagger Install on Kubernetes](install/flagger-install-on-kubernetes.md) * [Flagger Install on GKE Istio](install/flagger-install-on-google-cloud.md) * [Flagger Install on EKS App Mesh](install/flagger-install-on-eks-appmesh.md) -* [Flagger Install with SuperGloo](install/flagger-install-with-supergloo.md) ## Usage +* [How it works](usage/how-it-works.md) * [Deployment Strategies](usage/deployment-strategies.md) * [Metrics Analysis](usage/metrics.md) * [Webhooks](usage/webhooks.md) diff --git a/docs/gitbook/faq.md b/docs/gitbook/faq.md index b42e9b00..1e75588d 100644 --- a/docs/gitbook/faq.md +++ b/docs/gitbook/faq.md @@ -336,6 +336,11 @@ histogram_quantile(0.99, > **Note** that the metric interval should be lower or equal to the control loop interval. +**Can I use custom metrics?** + +The analysis can be extended with metrics provided by Prometheus, Datadog and AWS CloudWatch. For more details +on how custom metrics can be used please read the [metrics docs](usage/metrics.md). + ### Istio routing **How does Flagger interact with Istio?** diff --git a/docs/gitbook/tutorials/appmesh-progressive-delivery.md b/docs/gitbook/tutorials/appmesh-progressive-delivery.md index f118dad4..b1201e6c 100644 --- a/docs/gitbook/tutorials/appmesh-progressive-delivery.md +++ b/docs/gitbook/tutorials/appmesh-progressive-delivery.md @@ -389,3 +389,4 @@ Routing all traffic to primary Promotion completed! Scaling down podinfo.test ``` +For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md). 
diff --git a/docs/gitbook/tutorials/contour-progressive-delivery.md b/docs/gitbook/tutorials/contour-progressive-delivery.md index de432e32..78119bd7 100644 --- a/docs/gitbook/tutorials/contour-progressive-delivery.md +++ b/docs/gitbook/tutorials/contour-progressive-delivery.md @@ -22,7 +22,7 @@ Install Flagger using Kustomize \(kubectl 1.14\) in the `projectcontour` namespa kubectl apply -k github.com/weaveworks/flagger//kustomize/contour ``` -The above command will deploy Flagger and Prometheus configured to scrape the Contour's Envoy instances. You can also enable Slack or MS Teams notifications, see the Kustomize install [docs](https://docs.flagger.app/install/flagger-install-on-kubernetes#install-flagger-with-kustomize). +The above command will deploy Flagger and Prometheus configured to scrape the Contour's Envoy instances. Or you can install Flagger using Helm: @@ -32,15 +32,17 @@ helm repo add flagger https://flagger.app helm upgrade -i flagger flagger/flagger \ --namespace projectcontour \ --set meshProvider=contour \ ---set prometheus.install=true \ ---set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \ ---set slack.channel=general \ ---set slack.user=flagger +--set prometheus.install=true ``` +You can also enable Slack, Discord, Rocket or MS Teams notifications, +see the alerting [docs](../usage/alerting.md). + ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and Contour HTTPProxy\). These objects expose the application in the cluster and drive the canary analysis and promotion. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployments, ClusterIP services and Contour HTTPProxy). +These objects expose the application in the cluster and drive the canary analysis and promotion. 
Create a test namespace: @@ -60,7 +62,7 @@ Create a deployment and a horizontal pod autoscaler: kubectl apply -k github.com/weaveworks/flagger//kustomize/podinfo ``` -Create a canary custom resource \(replace `app.example.com` with your own domain\): +Create a canary custom resource (replace `app.example.com` with your own domain): ```yaml apiVersion: flagger.app/v1beta1 @@ -155,7 +157,9 @@ service/podinfo-primary httpproxy.projectcontour.io/podinfo ``` -After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods. +After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` +will be routed to the primary pods. +During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods. ## Expose the app outside the cluster @@ -167,9 +171,11 @@ export ADDRESS="$(kubectl -n projectcontour get svc/envoy -ojson \ echo $ADDRESS ``` -Configure your DNS server with a CNAME record \(AWS\) or A record \(GKE/AKS/DOKS\) and point a domain e.g. `app.example.com` to the LB address. +Configure your DNS server with a CNAME record (AWS) or A record (GKE/AKS/DOKS) +and point a domain e.g. `app.example.com` to the LB address. -Create a HTTPProxy definition and include the podinfo proxy generated by Flagger \(replace `app.example.com` with your own domain\): +Create a HTTPProxy definition and include the podinfo proxy generated by Flagger +(replace `app.example.com` with your own domain): ```yaml apiVersion: projectcontour.io/v1 @@ -205,17 +211,21 @@ podinfo-ingress app.example.com valid Now you can access podinfo UI using your domain address. -Note that you should be using HTTPS when exposing production workloads on internet.
You can obtain free TLS certs from Let's Encrypt, read this [guide](https://github.com/stefanprodan/eks-contour-ingress) on how to configure cert-manager to secure Contour with TLS certificates. +Note that you should be using HTTPS when exposing production workloads on internet. +You can obtain free TLS certs from Let's Encrypt, read this [guide](https://github.com/stefanprodan/eks-contour-ingress) +on how to configure cert-manager to secure Contour with TLS certificates. ## Automated canary promotion -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring +key performance indicators like HTTP requests success rate, requests average duration and pod health. +Based on analysis of the KPIs a canary is promoted or aborted. ![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) A canary deployment is triggered by changes in any of the following objects: -* Deployment PodSpec \(container image, command, ports, env, resources, etc\) +* Deployment PodSpec (container image, command, ports, env, resources, etc) * ConfigMaps and Secrets mounted as volumes or mapped to environment variables Trigger a canary deployment by updating the container image: @@ -300,7 +310,8 @@ Generate latency: watch -n 1 curl http://app.example.com/delay/1 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. 
```text kubectl -n projectcontour logs deploy/flagger -f | jq .msg @@ -319,13 +330,16 @@ Rolling back podinfo.test failed checks threshold reached 5 Canary failed! Scaling down podinfo.test ``` -If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded, or if the analysis reached the maximum number of failed checks: +If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded, +or if the analysis reached the maximum number of failed checks: ![Flagger Slack Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/slack-canary-failed.png) ## A/B Testing -Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity. +Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. +In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. +This is particularly useful for frontend applications that require session affinity. ![Flagger A/B Testing Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-abtest-steps.png) @@ -421,3 +435,4 @@ match: suffix: "Firefox/71.0" ``` +For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md). 
diff --git a/docs/gitbook/tutorials/crossover-progressive-delivery.md b/docs/gitbook/tutorials/crossover-progressive-delivery.md index bf6532e3..d2323dd5 100644 --- a/docs/gitbook/tutorials/crossover-progressive-delivery.md +++ b/docs/gitbook/tutorials/crossover-progressive-delivery.md @@ -64,7 +64,10 @@ helm upgrade -i flagger flagger/flagger \ ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services, SMI traffic splits\). These objects expose the application on the mesh and drive the canary analysis and promotion. There's no SMI object you need to create by yourself. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployments, ClusterIP services, SMI traffic splits). +These objects expose the application on the mesh and drive the canary analysis and promotion. +There's no SMI object you need to create by yourself. Create a deployment and a horizontal pod autoscaler: @@ -171,17 +174,21 @@ service/podinfo-primary trafficsplits.split.smi-spec.io/podinfo ``` -After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods. +After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` +will be routed to the primary pods. During the canary analysis, +the `podinfo-canary.test` address can be used to target directly the canary pods. ## Automated canary promotion -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health.
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring +key performance indicators like HTTP requests success rate, requests average duration and pod health. +Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. ![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) A canary deployment is triggered by changes in any of the following objects: -* Deployment PodSpec \(container image, command, ports, env, resources, etc\) +* Deployment PodSpec (container image, command, ports, env, resources, etc) * ConfigMaps and Secrets mounted as volumes or mapped to environment variables Trigger a canary deployment by updating the container image: @@ -288,7 +295,8 @@ Generate latency: watch -n 1 curl -H 'Host: podinfo.test' http://envoy.test:10000/delay/1 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. ```text kubectl -n test logs deploy/flagger -f | jq .msg @@ -307,7 +315,8 @@ Rolling back podinfo.test failed checks threshold reached 5 Canary failed! 
Scaling down podinfo.test ``` -If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded, or if the analysis reached the maximum number of failed checks: +If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded, +or if the analysis reached the maximum number of failed checks: ![Flagger Slack Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/slack-canary-failed.png) diff --git a/docs/gitbook/tutorials/gloo-progressive-delivery.md b/docs/gitbook/tutorials/gloo-progressive-delivery.md index 50f31575..9ff5011e 100644 --- a/docs/gitbook/tutorials/gloo-progressive-delivery.md +++ b/docs/gitbook/tutorials/gloo-progressive-delivery.md @@ -28,20 +28,11 @@ helm upgrade -i flagger flagger/flagger \ --set meshProvider=gloo ``` -Optionally you can enable Slack notifications: - -```bash -helm upgrade -i flagger flagger/flagger \ ---reuse-values \ ---namespace gloo-system \ ---set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \ ---set slack.channel=general \ ---set slack.user=flagger -``` - ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and Gloo upstream groups\). These objects expose the application outside the cluster and drive the canary analysis and promotion. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployments, ClusterIP services and Gloo upstream groups). +These objects expose the application outside the cluster and drive the canary analysis and promotion. 
Create a test namespace: @@ -61,7 +52,8 @@ Deploy the load testing service to generate traffic during the canary analysis: kubectl -n test apply -k github.com/weaveworks/flagger//kustomize/tester ``` -Create an virtual service definition that references an upstream group that will be generated by Flagger \(replace `app.example.com` with your own domain\): +Create a virtual service definition that references an upstream group that will be generated by Flagger +(replace `app.example.com` with your own domain): ```yaml apiVersion: gateway.solo.io/v1 @@ -88,7 +80,7 @@ Save the above resource as podinfo-virtualservice.yaml and then apply it: kubectl apply -f ./podinfo-virtualservice.yaml ``` -Create a canary custom resource \(replace `app.example.com` with your own domain\): +Create a canary custom resource (replace `app.example.com` with your own domain): ```yaml apiVersion: flagger.app/v1beta1 @@ -188,7 +180,9 @@ podinfo Initialized 0 2019-05-17T08:09:51Z ## Automated canary promotion -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring +key performance indicators like HTTP requests success rate, requests average duration and pod health. +Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) @@ -265,7 +259,8 @@ Generate high latency: watch curl -H 'Host: app.example.com' http://gateway-proxy-v2.gloo-system/delay/2 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. ```text kubectl -n test describe canary/podinfo @@ -294,37 +289,57 @@ Events: The canary analysis can be extended with Prometheus queries. -The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request duration histogram to validate the canary. +The demo app is instrumented with Prometheus so you can create a custom check that will use +the HTTP request duration histogram to validate the canary. 
+ +Create a metric template and apply it on the cluster: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: not-found-percentage + namespace: test +spec: + provider: + type: prometheus + address: http://flagger-prometheus.gloo-system:9090 + query: | + 100 - sum( + rate( + http_request_duration_seconds_count{ + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)", + status!="404" + }[{{ interval }}] + ) + ) + / + sum( + rate( + http_request_duration_seconds_count{ + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" + }[{{ interval }}] + ) + ) * 100 +``` Edit the canary analysis and add the following metric: ```yaml analysis: metrics: - - name: "404s percentage" - threshold: 5 - query: | - 100 - sum( - rate( - http_request_duration_seconds_count{ - kubernetes_namespace="test", - kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" - status!="404" - }[1m] - ) - ) - / - sum( - rate( - http_request_duration_seconds_count{ - kubernetes_namespace="test", - kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" - }[1m] - ) - ) * 100 + - name: "404s percentage" + templateRef: + name: not-found-percentage + thresholdRange: + max: 5 + interval: 1m ``` -The above configuration validates the canary by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails. +The above configuration validates the canary by checking if the HTTP 404 req/sec percentage is +below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails. Trigger a canary deployment by updating the container image: @@ -357,5 +372,8 @@ Rolling back podinfo.test failed checks threshold reached 5 Canary failed! Scaling down podinfo.test ``` -If you have Slack configured, Flagger will send a notification with the reason why the canary failed.
+If you have [alerting](../usage/alerting.md) configured, +Flagger will send a notification with the reason why the canary failed. + +For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md). diff --git a/docs/gitbook/tutorials/istio-ab-testing.md b/docs/gitbook/tutorials/istio-ab-testing.md index 8196460d..484f42f0 100644 --- a/docs/gitbook/tutorials/istio-ab-testing.md +++ b/docs/gitbook/tutorials/istio-ab-testing.md @@ -213,3 +213,7 @@ Events: Warning Synced 1m flagger Canary failed! Scaling down podinfo.test ``` +The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, +[webhooks](../usage/webhooks.md), +[manual promotion](../usage/webhooks.md#manual-gating) approval and +[Slack or MS Teams](../usage/alerting.md) notifications. \ No newline at end of file diff --git a/docs/gitbook/tutorials/istio-progressive-delivery.md b/docs/gitbook/tutorials/istio-progressive-delivery.md index 5116491a..12dedf43 100644 --- a/docs/gitbook/tutorials/istio-progressive-delivery.md +++ b/docs/gitbook/tutorials/istio-progressive-delivery.md @@ -319,5 +319,7 @@ With the above configuration, Flagger will run a canary release with the followi * scale to zero the canary * send notification with the canary analysis result -The above procedure can be extended with [custom metrics](https://docs.flagger.app/how-it-works#custom-metrics) checks, [webhooks](https://docs.flagger.app/how-it-works#webhooks), [manual promotion](https://docs.flagger.app/how-it-works#manual-gating) approval and [Slack or MS Teams](https://docs.flagger.app/usage/alerting) notifications. - +The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, +[webhooks](../usage/webhooks.md), +[manual promotion](../usage/webhooks.md#manual-gating) approval and +[Slack or MS Teams](../usage/alerting.md) notifications. 
diff --git a/docs/gitbook/tutorials/kubernetes-blue-green.md b/docs/gitbook/tutorials/kubernetes-blue-green.md index 55db917c..836cbc1e 100644 --- a/docs/gitbook/tutorials/kubernetes-blue-green.md +++ b/docs/gitbook/tutorials/kubernetes-blue-green.md @@ -2,7 +2,9 @@ This guide shows you how to automate Blue/Green deployments with Flagger and Kubernetes. -For applications that are not deployed on a service mesh, Flagger can orchestrate Blue/Green style deployments with Kubernetes L4 networking. When using a service mesh blue/green can be used as specified [here](https://docs.flagger.app/how-it-works#blue-green-deployments). +For applications that are not deployed on a service mesh, Flagger can orchestrate Blue/Green style deployments +with Kubernetes L4 networking. When using a service mesh blue/green can be used as +specified [here](../usage/deployment-strategies.md). ![Flagger Blue/Green Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-bluegreen-steps.png) @@ -42,7 +44,9 @@ helm upgrade -i flagger flagger/flagger \ ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployment and ClusterIP services\). These objects expose the application inside the cluster and drive the canary analysis and Blue/Green promotion. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployment and ClusterIP services). +These objects expose the application inside the cluster and drive the canary analysis and Blue/Green promotion. 
Create a test namespace: @@ -156,8 +160,10 @@ service/podinfo-primary Blue/Green scenario: -* on bootstrap, Flagger will create three ClusterIP services \(`app-primary`,`app-canary`, `app`\) and a shadow deployment named `app-primary` that represents the blue version -* when a new version is detected, Flagger would scale up the green version and run the conformance tests \(the tests should target the `app-canary` ClusterIP service to reach the green version\) +* on bootstrap, Flagger will create three ClusterIP services (`app-primary`,`app-canary`, `app`) +and a shadow deployment named `app-primary` that represents the blue version +* when a new version is detected, Flagger would scale up the green version and run the conformance tests +(the tests should target the `app-canary` ClusterIP service to reach the green version) * if the conformance tests are passing, Flagger would start the load tests and validate them with custom Prometheus queries * if the load test analysis is successful, Flagger will promote the new version to `app-primary` and scale down the green version @@ -253,37 +259,58 @@ Events: ## Custom metrics -The analysis can be extended with Prometheus queries. The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request duration histogram to validate the canary \(green version\). +The analysis can be extended with Prometheus queries. The demo app is instrumented with Prometheus so you can +create a custom check that will use the HTTP request duration histogram to validate the canary (green version). 
+ + +Create a metric template and apply it on the cluster: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: not-found-percentage + namespace: test +spec: + provider: + type: prometheus + address: http://flagger-prometheus.flagger:9090 + query: | + 100 - sum( + rate( + http_request_duration_seconds_count{ + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)", + status!="404" + }[{{ interval }}] + ) + ) + / + sum( + rate( + http_request_duration_seconds_count{ + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" + }[{{ interval }}] + ) + ) * 100 +``` Edit the canary analysis and add the following metric: ```yaml analysis: metrics: - - name: "404s percentage" - threshold: 5 - query: | - 100 - sum( - rate( - http_request_duration_seconds_count{ - kubernetes_namespace="test", - kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" - status!="404" - }[1m] - ) - ) - / - sum( - rate( - http_request_duration_seconds_count{ - kubernetes_namespace="test", - kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" - }[1m] - ) - ) * 100 + - name: "404s percentage" + templateRef: + name: not-found-percentage + thresholdRange: + max: 5 + interval: 1m ``` -The above configuration validates the canary \(green version\) by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the rollout is rolled back. +The above configuration validates the canary (green version) by checking if the HTTP 404 req/sec percentage is +below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the rollout is rolled back. Trigger a deployment by updating the container image: @@ -312,7 +339,8 @@ Rolling back podinfo.test failed checks threshold reached 2 Canary failed!
Scaling down podinfo.test ``` -If you have Slack configured, Flagger will send a notification with the reason why the canary failed. +If you have [alerting](../usage/alerting.md) configured, +Flagger will send a notification with the reason why the canary failed. ## Conformance Testing with Helm @@ -344,5 +372,8 @@ Add a helm test pre-rollout hook to your chart: cmd: "test {{ .Release.Name }} --cleanup" ``` -When the canary analysis starts, Flagger will call the pre-rollout webhooks. If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back. +When the canary analysis starts, Flagger will call the pre-rollout webhooks. +If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back. + +For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md). diff --git a/docs/gitbook/tutorials/linkerd-progressive-delivery.md b/docs/gitbook/tutorials/linkerd-progressive-delivery.md index bc446726..9721efb4 100644 --- a/docs/gitbook/tutorials/linkerd-progressive-delivery.md +++ b/docs/gitbook/tutorials/linkerd-progressive-delivery.md @@ -16,11 +16,11 @@ kubectl apply -k github.com/weaveworks/flagger//kustomize/linkerd Note that you'll need kubectl 1.14 or newer to run the above command. -To enable Slack or MS Teams notifications, see Flagger's [install docs](https://docs.flagger.app/install/flagger-install-on-kubernetes) for Kustomize or Helm options. - ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and SMI traffic split\). These objects expose the application inside the mesh and drive the canary analysis and promotion. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployments, ClusterIP services and SMI traffic split). 
+These objects expose the application inside the mesh and drive the canary analysis and promotion. Create a test namespace and enable Linkerd proxy injection: @@ -113,7 +113,8 @@ Save the above resource as podinfo-canary.yaml and then apply it: kubectl apply -f ./podinfo-canary.yaml ``` -When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute. +When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. +The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute. After a couple of seconds Flagger will create the canary objects: @@ -133,11 +134,15 @@ service/podinfo-primary trafficsplits.split.smi-spec.io/podinfo ``` -After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods. +After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` +will be routed to the primary pods. +During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods. ## Automated canary promotion -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring +key performance indicators like HTTP requests success rate, requests average duration and pod health.
+Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. ![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) @@ -198,7 +203,8 @@ prod backend Failed 0 2019-06-30T17:05:07Z ## Automated rollback -During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version. +During the canary analysis you can generate HTTP 500 errors and high latency to +test if Flagger pauses and rolls back the faulted version. Trigger another canary deployment: @@ -225,7 +231,8 @@ Generate latency: watch -n 1 curl http://podinfo-canary.test:9898/delay/1 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. ```text kubectl -n test describe canary/podinfo @@ -284,7 +291,9 @@ Let's a define a check for not found errors. Edit the canary analysis and add th * 100 ``` -The above configuration validates the canary version by checking if the HTTP 404 req/sec percentage is below three percent of the total traffic. If the 404s rate reaches the 3% threshold, then the analysis is aborted and the canary is marked as failed. +The above configuration validates the canary version by checking if the HTTP 404 req/sec percentage +is below three percent of the total traffic. +If the 404s rate reaches the 3% threshold, then the analysis is aborted and the canary is marked as failed. 
Trigger a canary deployment by updating the container image: @@ -329,7 +338,8 @@ helm upgrade -i nginx-ingress stable/nginx-ingress \ --namespace ingress-nginx ``` -Create an ingress definition for podinfo that rewrites the incoming header to the internal service name \(required by Linkerd\): +Create an ingress definition for podinfo that rewrites the incoming header +to the internal service name (required by Linkerd): ```yaml apiVersion: extensions/v1beta1 @@ -355,15 +365,20 @@ spec: servicePort: 9898 ``` -When using an ingress controller, the Linkerd traffic split does not apply to incoming traffic since NGINX in running outside of the mesh. In order to run a canary analysis for a frontend app, Flagger creates a shadow ingress and sets the NGINX specific annotations. +When using an ingress controller, the Linkerd traffic split does not apply to incoming traffic +since NGINX is running outside of the mesh. In order to run a canary analysis for a frontend app, +Flagger creates a shadow ingress and sets the NGINX specific annotations. ## A/B Testing -Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity. +Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. +In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. +This is particularly useful for frontend applications that require session affinity.
![Flagger Linkerd Ingress](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-nginx-linkerd.png) -Edit podinfo canary analysis, set the provider to `nginx`, add the ingress reference, remove the max/step weight and add the match conditions and iterations: +Edit podinfo canary analysis, set the provider to `nginx`, add the ingress reference, +remove the max/step weight and add the match conditions and iterations: ```yaml apiVersion: flagger.app/v1beta1 @@ -425,7 +440,8 @@ spec: cmd: "hey -z 2m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com" ``` -The above configuration will run an analysis for ten minutes targeting users that have a `canary` cookie set to `always` or those that call the service using the `X-Canary: always` header. +The above configuration will run an analysis for ten minutes targeting users that have +a `canary` cookie set to `always` or those that call the service using the `X-Canary: always` header. **Note** that the load test now targets the external address and uses the canary cookie. @@ -459,3 +475,7 @@ Events: Promotion completed! Scaling down podinfo.test ``` +The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, +[webhooks](../usage/webhooks.md), +[manual promotion](../usage/webhooks.md#manual-gating) approval and +[Slack or MS Teams](../usage/alerting.md) notifications. \ No newline at end of file diff --git a/docs/gitbook/tutorials/nginx-progressive-delivery.md b/docs/gitbook/tutorials/nginx-progressive-delivery.md index cbe824c2..4a53c961 100644 --- a/docs/gitbook/tutorials/nginx-progressive-delivery.md +++ b/docs/gitbook/tutorials/nginx-progressive-delivery.md @@ -44,7 +44,9 @@ helm upgrade -i flagger flagger/flagger \ ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and canary ingress\). 
These objects expose the application outside the cluster and drive the canary analysis and promotion. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployments, ClusterIP services and canary ingress). +These objects expose the application outside the cluster and drive the canary analysis and promotion. Create a test namespace: @@ -186,7 +188,9 @@ ingresses.extensions/podinfo-canary ## Automated canary promotion -Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. +Flagger implements a control loop that gradually shifts traffic to the canary while measuring +key performance indicators like HTTP requests success rate, requests average duration and pod health. +Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams. ![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) @@ -257,7 +261,8 @@ Generate HTTP 500 errors: watch curl http://app.example.com/status/500 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. ```text kubectl -n test describe canary/podinfo @@ -286,30 +291,49 @@ Events: The canary analysis can be extended with Prometheus queries. 
-The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request duration histogram to validate the canary. +The demo app is instrumented with Prometheus so you can create a custom check that will use the +HTTP request duration histogram to validate the canary. -Edit the canary analysis and add the following metric: +Create a metric template and apply it on the cluster: + +```yaml +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: latency + namespace: test +spec: + provider: + type: prometheus + address: http://flagger-prometheus.ingress-nginx:9090 + query: | + histogram_quantile(0.99, + sum( + rate( + http_request_duration_seconds_bucket{ + kubernetes_namespace="{{ namespace }}", + kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" + }[1m] + ) + ) by (le) + ) +``` + +Edit the canary analysis and add the latency check: ```yaml analysis: metrics: - name: "latency" - threshold: 0.5 + templateRef: + name: latency + thresholdRange: + max: 0.5 interval: 1m - query: | - histogram_quantile(0.99, - sum( - rate( - http_request_duration_seconds_bucket{ - kubernetes_namespace="test", - kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" - }[1m] - ) - ) by (le) - ) ``` -The threshold is set to 500ms so if the average request duration in the last minute goes over half a second then the analysis will fail and the canary will not be promoted. +The threshold is set to 500ms so if the P99 request duration in the last minute goes over half a second +then the analysis will fail and the canary will not be promoted. Trigger a canary deployment by updating the container image: @@ -342,11 +366,13 @@ Rolling back podinfo.test failed checks threshold reached 5 Canary failed! Scaling down podinfo.test ``` -If you have Slack configured, Flagger will send a notification with the reason why the canary failed.
+If you have alerting configured, Flagger will send a notification with the reason why the canary failed. ## A/B Testing -Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity. +Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. +In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. +This is particularly useful for frontend applications that require session affinity. ![Flagger A/B Testing Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-abtest-steps.png) @@ -378,7 +404,8 @@ Edit the canary analysis, remove the max/step weight and add the match condition cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com/" ``` -The above configuration will run an analysis for ten minutes targeting users that have a `canary` cookie set to `always` or those that call the service using the `X-Canary: insider` header. +The above configuration will run an analysis for ten minutes targeting users that have a `canary` cookie +set to `always` or those that call the service using the `X-Canary: insider` header. Trigger a canary deployment by updating the container image: @@ -416,3 +443,7 @@ Events: Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test ``` +The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, +[webhooks](../usage/webhooks.md), +[manual promotion](../usage/webhooks.md#manual-gating) approval and +[Slack or MS Teams](../usage/alerting.md) notifications. 
diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/usage/how-it-works.md similarity index 96% rename from docs/gitbook/how-it-works.md rename to docs/gitbook/usage/how-it-works.md index d4206580..9f83efd0 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/usage/how-it-works.md @@ -168,7 +168,7 @@ spec: When using **Istio** as the mesh provider, you can also specify HTTP header operations, CORS and traffic policies, Istio gateways and hosts. -The Istio routing configuration can be found [here](faq.md#istio-routing). +The Istio routing configuration can be found [here](../faq.md#istio-routing). ### Canary status @@ -241,10 +241,10 @@ kubectl get canary/podinfo | grep Succeeded ### Canary analysis The canary analysis defines: -* the type of [deployment strategy](usage/deployment-strategies.md) -* the [metrics](usage/metrics.md) used to validate the canary version -* the [webhooks](usage/webhooks.md) used for conformance testing, load testing and manual gating -* the [alerting settings](usage/alerting.md) +* the type of [deployment strategy](deployment-strategies.md) +* the [metrics](metrics.md) used to validate the canary version +* the [webhooks](webhooks.md) used for conformance testing, load testing and manual gating +* the [alerting settings](alerting.md) Spec: From 0c998c36cfe5758f3a3c3463694affa069fbb4e0 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sun, 1 Mar 2020 12:13:55 +0200 Subject: [PATCH 11/13] docs: add upgrade guide for v1beta1 --- docs/gitbook/SUMMARY.md | 3 ++- docs/gitbook/dev/upgrade-guide.md | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 docs/gitbook/dev/upgrade-guide.md diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 5267e07a..d85459fc 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -36,4 +36,5 @@ ## Dev * [Development Guide](dev/dev-guide.md) -* [Release Guide](dev/release-guide.md) \ No newline at end of file +* [Release 
Guide](dev/release-guide.md) +* [Upgrade Guide](dev/upgrade-guide.md) \ No newline at end of file diff --git a/docs/gitbook/dev/upgrade-guide.md b/docs/gitbook/dev/upgrade-guide.md new file mode 100644 index 00000000..2b3aa151 --- /dev/null +++ b/docs/gitbook/dev/upgrade-guide.md @@ -0,0 +1,25 @@ +# Upgrade Guide + +This document describes how to upgrade Flagger. + +### Upgrade canaries v1alpha3 to v1beta1 + +Canary CRD changes in `canaries.flagger.app/v1beta1`: +* the `spec.canaryAnalysis` field has been deprecated and replaced with `spec.analysis` +* the `spec.analysis.interval` and `spec.analysis.threshold` fields are required +* the `status.lastAppliedSpec` and `status.lastPromotedSpec` hashing algorithm changed to `hash/fnv` +* the `spec.analysis.alerts` array can reference `alertproviders.flagger.app/v1beta1` resources +* the `spec.analysis.metrics[].templateRef` can reference a `metrictemplates.flagger.app/v1beta1` resource +* the `metric.threshold` field has been deprecated and replaced with `metric.thresholdRange` +* the `spec.targetRef` can reference `DaemonSet` kind + +Upgrade procedure: +* install the `v1beta1` CRDs +* update Flagger deployment +* replace `apiVersion: flagger.app/v1alpha3` with `apiVersion: flagger.app/v1beta1` in all canary manifests +* replace `spec.canaryAnalysis` with `spec.analysis` in all canary manifests +* update canary manifests in cluster + +**Note** that after upgrading Flagger, all canaries will be triggered as the hash value used for tracking changes +is computed differently. You can set `spec.skipAnalysis: true` in all canary manifests before upgrading Flagger, +do the upgrade, wait for Flagger to finish the no-op promotions and finally set `skipAnalysis` to `false`.
From b6958733e19e9d1ec348613f4d86d58e5cac6e88 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sun, 1 Mar 2020 22:36:08 +0200 Subject: [PATCH 12/13] docs: replace threshold with thresholdRange --- docs/gitbook/dev/upgrade-guide.md | 16 ++++++++++++++++ docs/gitbook/faq.md | 6 ++++-- .../tutorials/appmesh-progressive-delivery.md | 6 ++++-- .../tutorials/contour-progressive-delivery.md | 6 ++++-- .../tutorials/crossover-progressive-delivery.md | 6 ++++-- docs/gitbook/tutorials/flagger-smi-istio.md | 6 ++++-- .../tutorials/gloo-progressive-delivery.md | 6 ++++-- docs/gitbook/tutorials/istio-ab-testing.md | 6 ++++-- .../tutorials/istio-progressive-delivery.md | 12 ++++++++---- docs/gitbook/tutorials/kubernetes-blue-green.md | 6 ++++-- .../tutorials/linkerd-progressive-delivery.md | 12 ++++++++---- .../tutorials/nginx-progressive-delivery.md | 6 ++++-- docs/gitbook/usage/how-it-works.md | 6 ++++-- 13 files changed, 72 insertions(+), 28 deletions(-) diff --git a/docs/gitbook/dev/upgrade-guide.md b/docs/gitbook/dev/upgrade-guide.md index 2b3aa151..c5f2b503 100644 --- a/docs/gitbook/dev/upgrade-guide.md +++ b/docs/gitbook/dev/upgrade-guide.md @@ -23,3 +23,19 @@ Upgrade procedure: **Note** that after upgrading Flagger, all canaries will be triggered as the hash value used for tracking changes is computed differently. You can set `spec.skipAnalysis: true` in all canary manifests before upgrading Flagger, do the upgrade, wait for Flagger to finish the no-op promotions and finally set `skipAnalysis` to `false`. 
+ +Update builtin metrics: +* replace `threshold` with `thresholdRange.min` for request-success-rate +* replace `threshold` with `thresholdRange.max` for request-duration + +```yaml +metrics: +- name: request-success-rate + thresholdRange: + min: 99 + interval: 1m +- name: request-duration + thresholdRange: + max: 500 + interval: 1m +``` diff --git a/docs/gitbook/faq.md b/docs/gitbook/faq.md index 1e75588d..ecc807c4 100644 --- a/docs/gitbook/faq.md +++ b/docs/gitbook/faq.md @@ -237,7 +237,8 @@ Spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m ``` @@ -299,7 +300,8 @@ Spec: - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 1m ``` diff --git a/docs/gitbook/tutorials/appmesh-progressive-delivery.md b/docs/gitbook/tutorials/appmesh-progressive-delivery.md index b1201e6c..4ee1fcb2 100644 --- a/docs/gitbook/tutorials/appmesh-progressive-delivery.md +++ b/docs/gitbook/tutorials/appmesh-progressive-delivery.md @@ -108,12 +108,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # testing (optional) webhooks: diff --git a/docs/gitbook/tutorials/contour-progressive-delivery.md b/docs/gitbook/tutorials/contour-progressive-delivery.md index 78119bd7..5930c1ca 100644 --- a/docs/gitbook/tutorials/contour-progressive-delivery.md +++ b/docs/gitbook/tutorials/contour-progressive-delivery.md @@ -109,11 +109,13 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 in milliseconds - threshold: 500 + 
thresholdRange: + max: 500 interval: 30s # testing webhooks: diff --git a/docs/gitbook/tutorials/crossover-progressive-delivery.md b/docs/gitbook/tutorials/crossover-progressive-delivery.md index d2323dd5..8f92bfa0 100644 --- a/docs/gitbook/tutorials/crossover-progressive-delivery.md +++ b/docs/gitbook/tutorials/crossover-progressive-delivery.md @@ -128,12 +128,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # testing (optional) webhooks: diff --git a/docs/gitbook/tutorials/flagger-smi-istio.md b/docs/gitbook/tutorials/flagger-smi-istio.md index 55580a1c..90291e48 100644 --- a/docs/gitbook/tutorials/flagger-smi-istio.md +++ b/docs/gitbook/tutorials/flagger-smi-istio.md @@ -149,12 +149,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # generate traffic during analysis webhooks: diff --git a/docs/gitbook/tutorials/gloo-progressive-delivery.md b/docs/gitbook/tutorials/gloo-progressive-delivery.md index 9ff5011e..d95e434c 100644 --- a/docs/gitbook/tutorials/gloo-progressive-delivery.md +++ b/docs/gitbook/tutorials/gloo-progressive-delivery.md @@ -121,12 +121,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # testing (optional) webhooks: diff --git a/docs/gitbook/tutorials/istio-ab-testing.md b/docs/gitbook/tutorials/istio-ab-testing.md index 
484f42f0..857cb962 100644 --- a/docs/gitbook/tutorials/istio-ab-testing.md +++ b/docs/gitbook/tutorials/istio-ab-testing.md @@ -82,12 +82,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # generate traffic during analysis webhooks: diff --git a/docs/gitbook/tutorials/istio-progressive-delivery.md b/docs/gitbook/tutorials/istio-progressive-delivery.md index 12dedf43..bb9e318e 100644 --- a/docs/gitbook/tutorials/istio-progressive-delivery.md +++ b/docs/gitbook/tutorials/istio-progressive-delivery.md @@ -85,12 +85,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # testing (optional) webhooks: @@ -277,10 +279,12 @@ spec: mirror: true metrics: - name: request-success-rate - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration - threshold: 500 + thresholdRange: + max: 500 interval: 1m webhooks: - name: acceptance-test diff --git a/docs/gitbook/tutorials/kubernetes-blue-green.md b/docs/gitbook/tutorials/kubernetes-blue-green.md index 836cbc1e..8598b8ba 100644 --- a/docs/gitbook/tutorials/kubernetes-blue-green.md +++ b/docs/gitbook/tutorials/kubernetes-blue-green.md @@ -110,12 +110,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # acceptance/load testing hooks webhooks: diff --git 
a/docs/gitbook/tutorials/linkerd-progressive-delivery.md b/docs/gitbook/tutorials/linkerd-progressive-delivery.md index 9721efb4..87c161d1 100644 --- a/docs/gitbook/tutorials/linkerd-progressive-delivery.md +++ b/docs/gitbook/tutorials/linkerd-progressive-delivery.md @@ -84,12 +84,14 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s # testing (optional) webhooks: @@ -420,10 +422,12 @@ spec: # Linkerd Prometheus checks metrics: - name: request-success-rate - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration - threshold: 500 + thresholdRange: + max: 500 interval: 30s webhooks: - name: acceptance-test diff --git a/docs/gitbook/tutorials/nginx-progressive-delivery.md b/docs/gitbook/tutorials/nginx-progressive-delivery.md index 4a53c961..f6d9de2f 100644 --- a/docs/gitbook/tutorials/nginx-progressive-delivery.md +++ b/docs/gitbook/tutorials/nginx-progressive-delivery.md @@ -144,7 +144,8 @@ spec: - name: request-success-rate # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m # testing (optional) webhooks: @@ -394,7 +395,8 @@ Edit the canary analysis, remove the max/step weight and add the match condition exact: "canary" metrics: - name: request-success-rate - threshold: 99 + thresholdRange: + min: 99 interval: 1m webhooks: - name: load-test diff --git a/docs/gitbook/usage/how-it-works.md b/docs/gitbook/usage/how-it-works.md index 9f83efd0..4b0cea47 100644 --- a/docs/gitbook/usage/how-it-works.md +++ b/docs/gitbook/usage/how-it-works.md @@ -29,10 +29,12 @@ spec: stepWeight: 5 metrics: - name: request-success-rate - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration - threshold: 99 + thresholdRange: + max: 
500 interval: 1m webhooks: - name: load-test From 34c9fecf8c065c793de19f2039c51718556dd4fa Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Mon, 2 Mar 2020 13:05:33 +0200 Subject: [PATCH 13/13] docs: add prerequisites to tutorials --- README.md | 6 +- .../tutorials/contour-progressive-delivery.md | 2 +- docs/gitbook/tutorials/istio-ab-testing.md | 53 +++++++++++++--- .../tutorials/istio-progressive-delivery.md | 62 ++++++++++++++++--- .../tutorials/kubernetes-blue-green.md | 8 +-- 5 files changed, 106 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 2dd3dc88..e8222f73 100644 --- a/README.md +++ b/README.md @@ -127,13 +127,15 @@ spec: # builtin Prometheus check # minimum req success rate (non 5xx responses) # percentage (0-100) - threshold: 99 + thresholdRange: + min: 99 interval: 1m - name: request-duration # builtin Prometheus check # maximum req duration P99 # milliseconds - threshold: 500 + thresholdRange: + max: 500 interval: 30s - name: "database connections" # custom Prometheus check diff --git a/docs/gitbook/tutorials/contour-progressive-delivery.md b/docs/gitbook/tutorials/contour-progressive-delivery.md index 5930c1ca..1e61ff95 100644 --- a/docs/gitbook/tutorials/contour-progressive-delivery.md +++ b/docs/gitbook/tutorials/contour-progressive-delivery.md @@ -16,7 +16,7 @@ kubectl apply -f https://projectcontour.io/quickstart/contour.yaml The above command will deploy Contour and an Envoy daemonset in the `projectcontour` namespace. 
-Install Flagger using Kustomize \(kubectl 1.14\) in the `projectcontour` namespace: +Install Flagger using Kustomize (kubectl 1.14) in the `projectcontour` namespace: ```bash kubectl apply -k github.com/weaveworks/flagger//kustomize/contour diff --git a/docs/gitbook/tutorials/istio-ab-testing.md b/docs/gitbook/tutorials/istio-ab-testing.md index 857cb962..e47afce8 100644 --- a/docs/gitbook/tutorials/istio-ab-testing.md +++ b/docs/gitbook/tutorials/istio-ab-testing.md @@ -2,10 +2,48 @@ This guide shows you how to automate A/B testing with Istio and Flagger. -Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity. +Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. +In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. +This is particularly useful for frontend applications that require session affinity. ![Flagger A/B Testing Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-abtest-steps.png) +## Prerequisites + +Flagger requires a Kubernetes cluster **v1.11** or newer and Istio **v1.0** or newer. 
+ +Install Istio with telemetry support and Prometheus: + +```bash +istioctl manifest apply --set profile=default +``` + +Install Flagger using Kustomize (kubectl 1.14) in the `istio-system` namespace: + +```bash +kubectl apply -k github.com/weaveworks/flagger//kustomize/istio +``` + +Create an ingress gateway to expose the demo app outside of the mesh: + +```yaml +apiVersion: networking.istio.io/v1alpha3 +kind: Gateway +metadata: + name: public-gateway + namespace: istio-system +spec: + selector: + istio: ingressgateway + servers: + - port: + number: 80 + name: http + protocol: HTTP + hosts: + - "*" +``` + ## Bootstrap Create a test namespace with Istio sidecar injection enabled: @@ -132,7 +170,7 @@ virtualservice.networking.istio.io/podinfo Trigger a canary deployment by updating the container image: ```bash -kubectl -n test set image deployment/abtest \ +kubectl -n test set image deployment/podinfo \ podinfod=stefanprodan/podinfo:3.1.1 ``` @@ -160,8 +198,8 @@ Events: Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10 Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10 Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10 - Normal Synced 25s flagger Copying podinfo.test template spec to abtest-primary.test - Warning Synced 15s flagger Waiting for abtest-primary.test rollout to finish: 1 of 2 updated replicas are available + Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test + Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available Normal Synced 5s flagger Promotion completed! 
Scaling down podinfo.test ``` @@ -173,7 +211,7 @@ You can monitor all canaries with: watch kubectl get canaries --all-namespaces NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME -test abtest Progressing 100 2019-03-16T14:05:07Z +test podinfo Progressing 100 2019-03-16T14:05:07Z prod frontend Succeeded 0 2019-03-15T16:15:07Z prod backend Failed 0 2019-03-14T17:05:07Z ``` @@ -194,10 +232,11 @@ Generate latency: watch curl -b 'type=insider' http://app.example.com/delay/1 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. ```text -kubectl -n test describe canary/abtest +kubectl -n test describe canary/podinfo Status: Failed Checks: 2 diff --git a/docs/gitbook/tutorials/istio-progressive-delivery.md b/docs/gitbook/tutorials/istio-progressive-delivery.md index bb9e318e..413f4b40 100644 --- a/docs/gitbook/tutorials/istio-progressive-delivery.md +++ b/docs/gitbook/tutorials/istio-progressive-delivery.md @@ -4,9 +4,48 @@ This guide shows you how to use Istio and Flagger to automate canary deployments ![Flagger Canary Stages](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) +## Prerequisites + +Flagger requires a Kubernetes cluster **v1.11** or newer and Istio **v1.0** or newer. 
+ +Install Istio with telemetry support and Prometheus: + +```bash +istioctl manifest apply --set profile=default +``` + +Install Flagger using Kustomize (kubectl 1.14) in the `istio-system` namespace: + +```bash +kubectl apply -k github.com/weaveworks/flagger//kustomize/istio +``` + +Create an ingress gateway to expose the demo app outside of the mesh: + +```yaml +apiVersion: networking.istio.io/v1alpha3 +kind: Gateway +metadata: + name: public-gateway + namespace: istio-system +spec: + selector: + istio: ingressgateway + servers: + - port: + number: 80 + name: http + protocol: HTTP + hosts: + - "*" +``` + ## Bootstrap -Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services, Istio destination rules and virtual services\). These objects expose the application inside the mesh and drive the canary analysis and promotion. +Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA), +then creates a series of objects (Kubernetes deployments, ClusterIP services, +Istio destination rules and virtual services). +These objects expose the application inside the mesh and drive the canary analysis and promotion. Create a test namespace with Istio sidecar injection enabled: @@ -27,7 +66,7 @@ Deploy the load testing service to generate traffic during the canary analysis: kubectl apply -k github.com/weaveworks/flagger//kustomize/tester ``` -Create a canary custom resource \(replace example.com with your own domain\): +Create a canary custom resource (replace example.com with your own domain): ```yaml apiVersion: flagger.app/v1beta1 @@ -116,7 +155,8 @@ Save the above resource as podinfo-canary.yaml and then apply it: kubectl apply -f ./podinfo-canary.yaml ``` -When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. 
The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every minute. +When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. +The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every minute. ![Flagger Canary Process](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-hpa.png) @@ -226,7 +266,8 @@ Generate latency: watch curl http://podinfo-canary:9898/delay/1 ``` -When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed. +When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, +the canary is scaled to zero and the rollout is marked as failed. ```text kubectl -n test describe canary/podinfo @@ -255,11 +296,14 @@ Events: ![Flagger Canary Traffic Shadowing](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-traffic-mirroring.png) -For applications that perform read operations, Flagger can be configured to drive canary releases with traffic mirroring. Istio traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service. The response from the primary is sent back to the user and the response from the canary is discarded. Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are within the threshold values. +For applications that perform read operations, Flagger can be configured to drive canary releases with traffic mirroring. +Istio traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service. +The response from the primary is sent back to the user and the response from the canary is discarded. 
+Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are within the threshold values. -Note that mirroring should be used for requests that are **idempotent** or capable of being processed twice \(once by the primary and once by the canary\). +Note that mirroring should be used for requests that are **idempotent** or capable of being processed twice (once by the primary and once by the canary). -You can enable mirroring by replacing `stepWeight/maxWeight` with `iterations` and by setting `canaryAnalysis.mirror` to `true`: +You can enable mirroring by replacing `stepWeight/maxWeight` with `iterations` and by setting `analysis.mirror` to `true`: ```yaml apiVersion: flagger.app/v1beta1 @@ -303,7 +347,7 @@ spec: With the above configuration, Flagger will run a canary release with the following steps: -* detect new revision \(deployment spec, secrets or configmaps changes\) +* detect new revision (deployment spec, secrets or configmaps changes) * scale from zero the canary deployment * wait for the HPA to set the canary minimum replicas * check canary pods health @@ -315,7 +359,7 @@ With the above configuration, Flagger will run a canary release with the followi * abort the canary release if the metrics check failure threshold is reached * stop traffic mirroring after the number of iterations is reached * route live traffic to the canary pods -* promote the canary \(update the primary secrets, configmaps and deployment spec\) +* promote the canary (update the primary secrets, configmaps and deployment spec) * wait for the primary deployment rollout to finish * wait for the HPA to set the primary minimum replicas * check primary pods health diff --git a/docs/gitbook/tutorials/kubernetes-blue-green.md b/docs/gitbook/tutorials/kubernetes-blue-green.md index 8598b8ba..0b9cac29 100644 --- a/docs/gitbook/tutorials/kubernetes-blue-green.md +++ b/docs/gitbook/tutorials/kubernetes-blue-green.md @@ -57,17 +57,13 @@ kubectl create 
ns test Create a deployment and a horizontal pod autoscaler: ```bash -export REPO=https://raw.githubusercontent.com/weaveworks/flagger/master - -kubectl apply -f ${REPO}/artifacts/canaries/deployment.yaml -kubectl apply -f ${REPO}/artifacts/canaries/hpa.yaml +kubectl apply -k github.com/weaveworks/flagger//kustomize/podinfo ``` Deploy the load testing service to generate traffic during the analysis: ```bash -kubectl -n test apply -f ${REPO}/artifacts/loadtester/deployment.yaml -kubectl -n test apply -f ${REPO}/artifacts/loadtester/service.yaml +kubectl apply -k github.com/weaveworks/flagger//kustomize/tester ``` Create a canary custom resource: