Compare commits

...

16 Commits
0.6.0 ... 0.7.0

Author SHA1 Message Date
Stefan Prodan
1140af8dc7 Merge pull request #63 from stefanprodan/release-0.7.0
Release v0.7.0
2019-02-28 17:12:27 +02:00
stefanprodan
a2688c3910 Add link to custom metrics docs 2019-02-28 16:58:26 +02:00
stefanprodan
75b27ab3f3 Add change log for v0.7.0 2019-02-28 16:56:49 +02:00
stefanprodan
59d3f55fb2 Release v0.7.0 2019-02-28 16:05:48 +02:00
Stefan Prodan
f34739f334 Merge pull request #62 from stefanprodan/retries
Add timeout and retries
2019-02-28 15:36:46 +02:00
stefanprodan
90c71ec18f Update roadmap with alternatives to Istio 2019-02-28 15:09:24 +02:00
stefanprodan
395234d7c8 Add promql custom check to readme 2019-02-28 00:33:47 +02:00
stefanprodan
e322ba0065 Add timeout and retries to router 2019-02-28 00:05:40 +02:00
stefanprodan
6db8b96f72 Add timeout and retries example to docs 2019-02-28 00:02:48 +02:00
stefanprodan
44d7e96e96 Add timeout and retries fields to Canary CRD 2019-02-28 00:02:01 +02:00
Stefan Prodan
1662479c8d Merge pull request #60 from stefanprodan/custom-metrics
Add support for custom metrics
2019-02-27 23:31:05 +02:00
stefanprodan
2e351fcf0d Add a custom metric example to docs 2019-02-27 16:37:42 +02:00
stefanprodan
5d81876d07 Make the metric interval optional
- set default value to 1m
2019-02-27 16:03:56 +02:00
stefanprodan
c81e6989ec Add e2e tests for custom metrics 2019-02-27 15:49:09 +02:00
stefanprodan
4d61a896c3 Add custom promql queries support 2019-02-27 15:48:31 +02:00
stefanprodan
d148933ab3 Add metric query field to Canary CRD 2019-02-27 15:46:09 +02:00
15 changed files with 200 additions and 20 deletions

View File

@@ -2,6 +2,15 @@
All notable changes to this project are documented in this file.
## 0.7.0 (2019-02-28)
Adds support for custom metric checks, HTTP timeouts and HTTP retries
#### Features
- Allow custom promql queries in the canary analysis spec [#60](https://github.com/stefanprodan/flagger/pull/60)
- Add HTTP timeout and retries to canary service spec [#62](https://github.com/stefanprodan/flagger/pull/62)
## 0.6.0 (2019-02-25)
Allows for [HTTPMatchRequests](https://istio.io/docs/reference/config/istio.networking.v1alpha3/#HTTPMatchRequest)
@@ -20,7 +29,6 @@ to be customized in the service spec of the canary custom resource.
- Run e2e testing on [Kubernetes Kind](https://github.com/kubernetes-sigs/kind) for canary promotion
[#53](https://github.com/stefanprodan/flagger/pull/53)
## 0.5.1 (2019-02-14)
Allows skipping the analysis phase to ship changes directly to production

View File

@@ -30,6 +30,7 @@ Flagger documentation can be found at [docs.flagger.app](https://docs.flagger.ap
* [Canary deployment stages](https://docs.flagger.app/how-it-works#canary-deployment)
* [Canary analysis](https://docs.flagger.app/how-it-works#canary-analysis)
* [HTTP metrics](https://docs.flagger.app/how-it-works#http-metrics)
* [Custom metrics](https://docs.flagger.app/how-it-works#custom-metrics)
* [Webhooks](https://docs.flagger.app/how-it-works#webhooks)
* [Load testing](https://docs.flagger.app/how-it-works#load-testing)
* Usage
@@ -98,16 +99,21 @@ spec:
# Istio virtual service host names (optional)
hosts:
- podinfo.example.com
# Istio virtual service HTTP match conditions (optional)
# HTTP match conditions (optional)
match:
- uri:
prefix: /
# Istio virtual service HTTP rewrite (optional)
# HTTP rewrite (optional)
rewrite:
uri: /
# for emergency cases when you want to ship changes
# in production without analysing the canary (default false)
# timeout for HTTP requests (optional)
timeout: 5s
# retry policy when an HTTP request fails (optional)
retries:
attempts: 3
# promote the canary without analysing it (default false)
skipAnalysis: false
# define the canary analysis timing and KPIs
canaryAnalysis:
# schedule interval (default 60s)
interval: 1m
@@ -121,6 +127,7 @@ spec:
stepWeight: 5
# Istio Prometheus checks
metrics:
# builtin Istio checks
- name: istio_requests_total
# minimum req success rate (non 5xx responses)
# percentage (0-100)
@@ -131,6 +138,16 @@ spec:
# milliseconds
threshold: 500
interval: 30s
# custom check
- name: "kafka lag"
threshold: 100
query: |
avg_over_time(
kafka_consumergroup_lag{
consumergroup=~"podinfo-consumer-.*",
topic="podinfo"
}[1m]
)
# external checks (optional)
webhooks:
- name: load-test
@@ -144,8 +161,8 @@ For more details on how the canary analysis and promotion works please [read the
### Roadmap
* Extend the validation mechanism to support other metrics than HTTP success rate and latency
* Add A/B testing capabilities using fixed routing based on HTTP headers and cookies match conditions
* Integrate with other service mesh technologies like AWS AppMesh and Linkerd v2
* Add support for comparing the canary metrics to the primary ones and do the validation based on the deviation between the two
### Contributing

View File

@@ -73,6 +73,8 @@ spec:
properties:
port:
type: number
timeout:
type: string
skipAnalysis:
type: boolean
canaryAnalysis:
@@ -91,7 +93,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
@@ -100,6 +102,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:

View File

@@ -22,7 +22,7 @@ spec:
serviceAccountName: flagger
containers:
- name: flagger
image: quay.io/stefanprodan/flagger:0.6.0
image: quay.io/stefanprodan/flagger:0.7.0
imagePullPolicy: IfNotPresent
ports:
- name: http

View File

@@ -1,7 +1,7 @@
apiVersion: v1
name: flagger
version: 0.6.0
appVersion: 0.6.0
version: 0.7.0
appVersion: 0.7.0
kubeVersion: ">=1.11.0-0"
engine: gotpl
description: Flagger is a Kubernetes operator that automates the promotion of canary deployments using Istio routing for traffic shifting and Prometheus metrics for canary analysis.

View File

@@ -74,6 +74,8 @@ spec:
properties:
port:
type: number
timeout:
type: string
skipAnalysis:
type: boolean
canaryAnalysis:
@@ -92,7 +94,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
@@ -101,6 +103,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:

View File

@@ -2,7 +2,7 @@
image:
repository: quay.io/stefanprodan/flagger
tag: 0.6.0
tag: 0.7.0
pullPolicy: IfNotPresent
metricsServer: "http://prometheus.istio-system.svc.cluster.local:9090"

View File

@@ -39,16 +39,22 @@ spec:
# Istio virtual service host names (optional)
hosts:
- podinfo.example.com
# Istio virtual service HTTP match conditions (optional)
# HTTP match conditions (optional)
match:
- uri:
prefix: /
# Istio virtual service HTTP rewrite (optional)
# HTTP rewrite (optional)
rewrite:
uri: /
# for emergency cases when you want to ship changes
# in production without analysing the canary
# timeout for HTTP requests (optional)
timeout: 5s
# retry policy when an HTTP request fails (optional)
retries:
attempts: 3
perTryTimeout: 3s
# promote the canary without analysing it (default false)
skipAnalysis: false
# define the canary analysis timing and KPIs
canaryAnalysis:
# schedule interval (default 60s)
interval: 1m
@@ -377,6 +383,49 @@ histogram_quantile(0.99,
> **Note** that the metric interval should be lower than or equal to the control loop interval.
### Custom Metrics
The canary analysis can be extended with custom Prometheus queries.
```yaml
canaryAnalysis:
threshold: 1
maxWeight: 50
stepWeight: 5
metrics:
- name: "404s percentage"
threshold: 5
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo"
}[1m]
)
) * 100
```
The above configuration validates the canary by checking
if the HTTP 404 req/sec percentage is below 5 percent of the total traffic.
If the 404s rate exceeds the 5% threshold, then the canary fails.
When specifying a query, Flagger will run the promql query and convert the result to float64.
Then it compares the query result value with the metric threshold value.
### Webhooks
The canary analysis can be extended with webhooks.

View File

@@ -27,6 +27,7 @@ const (
CanaryKind = "Canary"
ProgressDeadlineSeconds = 600
AnalysisInterval = 60 * time.Second
MetricInterval = "1m"
)
// +genclient
@@ -113,6 +114,8 @@ type CanaryService struct {
Hosts []string `json:"hosts"`
Match []istiov1alpha3.HTTPMatchRequest `json:"match,omitempty"`
Rewrite *istiov1alpha3.HTTPRewrite `json:"rewrite,omitempty"`
Timeout string `json:"timeout,omitempty"`
Retries *istiov1alpha3.HTTPRetry `json:"retries,omitempty"`
}
// CanaryAnalysis is used to describe how the analysis should be done
@@ -127,9 +130,11 @@ type CanaryAnalysis struct {
// CanaryMetric holds the reference to Istio metrics used for canary analysis
type CanaryMetric struct {
Name string `json:"name"`
Interval string `json:"interval"`
Threshold int `json:"threshold"`
Name string `json:"name"`
Interval string `json:"interval,omitempty"`
Threshold float64 `json:"threshold"`
// +optional
Query string `json:"query,omitempty"`
}
// CanaryWebhook holds the reference to external checks used for canary analysis
@@ -170,3 +175,8 @@ func (c *Canary) GetAnalysisInterval() time.Duration {
return interval
}
// GetMetricInterval returns the default metric interval, i.e. the value of
// the MetricInterval constant ("1m"). It does not read anything from the
// canary itself; the controller uses it as the fallback when a metric in the
// canary analysis leaves its interval empty.
func (c *Canary) GetMetricInterval() string {
return MetricInterval
}

View File

@@ -156,6 +156,11 @@ func (in *CanaryService) DeepCopyInto(out *CanaryService) {
*out = new(istiov1alpha3.HTTPRewrite)
**out = **in
}
if in.Retries != nil {
in, out := &in.Retries, &out.Retries
*out = new(istiov1alpha3.HTTPRetry)
**out = **in
}
return
}

View File

@@ -8,6 +8,7 @@ import (
"net/http"
"net/url"
"strconv"
"strings"
"time"
)
@@ -73,6 +74,38 @@ func (c *CanaryObserver) queryMetric(query string) (*vectorQueryResponse, error)
return &values, nil
}
// GetScalar runs the promql query and returns the value of the last sample in
// the result whose value is a string that parses as a float64.
//
// All newlines and spaces are stripped from the query before it is sent. When
// the metrics server is set to "fake" no query is made and 100 is returned.
//
// An error is returned when the query itself fails, when a sample value cannot
// be parsed as a float, or when the result contains no usable value. The last
// case uses the "no values found" message that callers match on, so that text
// must not change.
func (c *CanaryObserver) GetScalar(query string) (float64, error) {
	if c.metricsServer == "fake" {
		return 100, nil
	}

	query = strings.Replace(query, "\n", "", -1)
	query = strings.Replace(query, " ", "", -1)

	result, err := c.queryMetric(query)
	if err != nil {
		return 0, err
	}

	var (
		value float64
		found bool
	)
	for _, v := range result.Data.Result {
		// Index 1 is read as the sample value below; skip entries that are
		// too short to have one instead of panicking on the index.
		if len(v.Value) < 2 {
			continue
		}
		// Non-string values are ignored, as before.
		raw, ok := v.Value[1].(string)
		if !ok {
			continue
		}
		f, err := strconv.ParseFloat(raw, 64)
		if err != nil {
			return 0, err
		}
		// Later samples overwrite earlier ones: the last one wins.
		value, found = f, true
	}

	if !found {
		return 0, fmt.Errorf("no values found for query %s", query)
	}
	return value, nil
}
// GetDeploymentCounter returns the requests success rate using istio_requests_total metric
func (c *CanaryObserver) GetDeploymentCounter(name string, namespace string, metric string, interval string) (float64, error) {
if c.metricsServer == "fake" {

View File

@@ -195,6 +195,8 @@ func (c *CanaryRouter) syncVirtualService(cd *flaggerv1.Canary) error {
{
Match: cd.Spec.Service.Match,
Rewrite: cd.Spec.Service.Rewrite,
Timeout: cd.Spec.Service.Timeout,
Retries: cd.Spec.Service.Retries,
Route: route,
},
},
@@ -309,6 +311,8 @@ func (c *CanaryRouter) SetRoutes(
{
Match: cd.Spec.Service.Match,
Rewrite: cd.Spec.Service.Rewrite,
Timeout: cd.Spec.Service.Timeout,
Retries: cd.Spec.Service.Retries,
Route: []istiov1alpha3.DestinationWeight{primary, canary},
},
}

View File

@@ -405,6 +405,10 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
// run metrics checks
for _, metric := range r.Spec.CanaryAnalysis.Metrics {
if metric.Interval == "" {
metric.Interval = r.GetMetricInterval()
}
if metric.Name == "istio_requests_total" {
val, err := c.observer.GetDeploymentCounter(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
@@ -436,6 +440,24 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
return false
}
}
if metric.Query != "" {
val, err := c.observer.GetScalar(metric.Query)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.metricsServer, err)
}
return false
}
if val > float64(metric.Threshold) {
c.recordEventWarningf(r, "Halt %s.%s advancement %s %.2f > %v",
r.Name, r.Namespace, metric.Name, val, metric.Threshold)
return false
}
}
}
return true

View File

@@ -1,4 +1,4 @@
package version
var VERSION = "0.6.0"
var VERSION = "0.7.0"
var REVISION = "unknown"

View File

@@ -45,6 +45,30 @@ spec:
- name: istio_request_duration_seconds_bucket
threshold: 500
interval: 30s
- name: "404s percentage"
threshold: 5
interval: 1m
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo"
}[1m]
)
) * 100
webhooks:
- name: load-test
url: http://flagger-loadtester.test/