mirror of
https://github.com/fluxcd/flagger.git
synced 2026-02-16 19:09:55 +00:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1140af8dc7 | ||
|
|
a2688c3910 | ||
|
|
75b27ab3f3 | ||
|
|
59d3f55fb2 | ||
|
|
f34739f334 | ||
|
|
90c71ec18f | ||
|
|
395234d7c8 | ||
|
|
e322ba0065 | ||
|
|
6db8b96f72 | ||
|
|
44d7e96e96 | ||
|
|
1662479c8d | ||
|
|
2e351fcf0d | ||
|
|
5d81876d07 | ||
|
|
c81e6989ec | ||
|
|
4d61a896c3 | ||
|
|
d148933ab3 |
10
CHANGELOG.md
10
CHANGELOG.md
@@ -2,6 +2,15 @@
|
||||
|
||||
All notable changes to this project are documented in this file.
|
||||
|
||||
## 0.7.0 (2019-02-28)
|
||||
|
||||
Adds support for custom metric checks, HTTP timeouts and HTTP retries
|
||||
|
||||
#### Features
|
||||
|
||||
- Allow custom promql queries in the canary analysis spec [##60](https://github.com/stefanprodan/flagger/pull/#60)
|
||||
- Add HTTP timeout and retries to canary service spec [##62](https://github.com/stefanprodan/flagger/pull/#62)
|
||||
|
||||
## 0.6.0 (2019-02-25)
|
||||
|
||||
Allows for [HTTPMatchRequests](https://istio.io/docs/reference/config/istio.networking.v1alpha3/#HTTPMatchRequest)
|
||||
@@ -20,7 +29,6 @@ to be customized in the service spec of the canary custom resource.
|
||||
- Run e2e testing on [Kubernetes Kind](https://github.com/kubernetes-sigs/kind) for canary promotion
|
||||
[#53](https://github.com/stefanprodan/flagger/pull/53)
|
||||
|
||||
|
||||
## 0.5.1 (2019-02-14)
|
||||
|
||||
Allows skipping the analysis phase to ship changes directly to production
|
||||
|
||||
27
README.md
27
README.md
@@ -30,6 +30,7 @@ Flagger documentation can be found at [docs.flagger.app](https://docs.flagger.ap
|
||||
* [Canary deployment stages](https://docs.flagger.app/how-it-works#canary-deployment)
|
||||
* [Canary analysis](https://docs.flagger.app/how-it-works#canary-analysis)
|
||||
* [HTTP metrics](https://docs.flagger.app/how-it-works#http-metrics)
|
||||
* [Custom metrics](https://docs.flagger.app/how-it-works#custom-metrics)
|
||||
* [Webhooks](https://docs.flagger.app/how-it-works#webhooks)
|
||||
* [Load testing](https://docs.flagger.app/how-it-works#load-testing)
|
||||
* Usage
|
||||
@@ -98,16 +99,21 @@ spec:
|
||||
# Istio virtual service host names (optional)
|
||||
hosts:
|
||||
- podinfo.example.com
|
||||
# Istio virtual service HTTP match conditions (optional)
|
||||
# HTTP match conditions (optional)
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
# Istio virtual service HTTP rewrite (optional)
|
||||
# HTTP rewrite (optional)
|
||||
rewrite:
|
||||
uri: /
|
||||
# for emergency cases when you want to ship changes
|
||||
# in production without analysing the canary (default false)
|
||||
# timeout for HTTP requests (optional)
|
||||
timeout: 5s
|
||||
# retry policy when a HTTP request fails (optional)
|
||||
retries:
|
||||
attempts: 3
|
||||
# promote the canary without analysing it (default false)
|
||||
skipAnalysis: false
|
||||
# define the canary analysis timing and KPIs
|
||||
canaryAnalysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
@@ -121,6 +127,7 @@ spec:
|
||||
stepWeight: 5
|
||||
# Istio Prometheus checks
|
||||
metrics:
|
||||
# builtin Istio checks
|
||||
- name: istio_requests_total
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
@@ -131,6 +138,16 @@ spec:
|
||||
# milliseconds
|
||||
threshold: 500
|
||||
interval: 30s
|
||||
# custom check
|
||||
- name: "kafka lag"
|
||||
threshold: 100
|
||||
query: |
|
||||
avg_over_time(
|
||||
kafka_consumergroup_lag{
|
||||
consumergroup=~"podinfo-consumer-.*",
|
||||
topic="podinfo"
|
||||
}[1m]
|
||||
)
|
||||
# external checks (optional)
|
||||
webhooks:
|
||||
- name: load-test
|
||||
@@ -144,8 +161,8 @@ For more details on how the canary analysis and promotion works please [read the
|
||||
|
||||
### Roadmap
|
||||
|
||||
* Extend the validation mechanism to support other metrics than HTTP success rate and latency
|
||||
* Add A/B testing capabilities using fixed routing based on HTTP headers and cookies match conditions
|
||||
* Integrate with other service mesh technologies like AWS AppMesh and Linkerd v2
|
||||
* Add support for comparing the canary metrics to the primary ones and do the validation based on the derivation between the two
|
||||
|
||||
### Contributing
|
||||
|
||||
@@ -73,6 +73,8 @@ spec:
|
||||
properties:
|
||||
port:
|
||||
type: number
|
||||
timeout:
|
||||
type: string
|
||||
skipAnalysis:
|
||||
type: boolean
|
||||
canaryAnalysis:
|
||||
@@ -91,7 +93,7 @@ spec:
|
||||
properties:
|
||||
items:
|
||||
type: object
|
||||
required: ['name', 'interval', 'threshold']
|
||||
required: ['name', 'threshold']
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
@@ -100,6 +102,8 @@ spec:
|
||||
pattern: "^[0-9]+(m|s)"
|
||||
threshold:
|
||||
type: number
|
||||
query:
|
||||
type: string
|
||||
webhooks:
|
||||
type: array
|
||||
properties:
|
||||
|
||||
@@ -22,7 +22,7 @@ spec:
|
||||
serviceAccountName: flagger
|
||||
containers:
|
||||
- name: flagger
|
||||
image: quay.io/stefanprodan/flagger:0.6.0
|
||||
image: quay.io/stefanprodan/flagger:0.7.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- name: http
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
apiVersion: v1
|
||||
name: flagger
|
||||
version: 0.6.0
|
||||
appVersion: 0.6.0
|
||||
version: 0.7.0
|
||||
appVersion: 0.7.0
|
||||
kubeVersion: ">=1.11.0-0"
|
||||
engine: gotpl
|
||||
description: Flagger is a Kubernetes operator that automates the promotion of canary deployments using Istio routing for traffic shifting and Prometheus metrics for canary analysis.
|
||||
|
||||
@@ -74,6 +74,8 @@ spec:
|
||||
properties:
|
||||
port:
|
||||
type: number
|
||||
timeout:
|
||||
type: string
|
||||
skipAnalysis:
|
||||
type: boolean
|
||||
canaryAnalysis:
|
||||
@@ -92,7 +94,7 @@ spec:
|
||||
properties:
|
||||
items:
|
||||
type: object
|
||||
required: ['name', 'interval', 'threshold']
|
||||
required: ['name', 'threshold']
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
@@ -101,6 +103,8 @@ spec:
|
||||
pattern: "^[0-9]+(m|s)"
|
||||
threshold:
|
||||
type: number
|
||||
query:
|
||||
type: string
|
||||
webhooks:
|
||||
type: array
|
||||
properties:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
image:
|
||||
repository: quay.io/stefanprodan/flagger
|
||||
tag: 0.6.0
|
||||
tag: 0.7.0
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
metricsServer: "http://prometheus.istio-system.svc.cluster.local:9090"
|
||||
|
||||
@@ -39,16 +39,22 @@ spec:
|
||||
# Istio virtual service host names (optional)
|
||||
hosts:
|
||||
- podinfo.example.com
|
||||
# Istio virtual service HTTP match conditions (optional)
|
||||
# HTTP match conditions (optional)
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
# Istio virtual service HTTP rewrite (optional)
|
||||
# HTTP rewrite (optional)
|
||||
rewrite:
|
||||
uri: /
|
||||
# for emergency cases when you want to ship changes
|
||||
# in production without analysing the canary
|
||||
# timeout for HTTP requests (optional)
|
||||
timeout: 5s
|
||||
# retry policy when a HTTP request fails (optional)
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 3s
|
||||
# promote the canary without analysing it (default false)
|
||||
skipAnalysis: false
|
||||
# define the canary analysis timing and KPIs
|
||||
canaryAnalysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
@@ -377,6 +383,49 @@ histogram_quantile(0.99,
|
||||
|
||||
> **Note** that the metric interval should be lower or equal to the control loop interval.
|
||||
|
||||
### Custom Metrics
|
||||
|
||||
The canary analysis can be extended with custom Prometheus queries.
|
||||
|
||||
```yaml
|
||||
canaryAnalysis:
|
||||
threshold: 1
|
||||
maxWeight: 50
|
||||
stepWeight: 5
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
threshold: 5
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace="test",
|
||||
destination_workload="podinfo",
|
||||
response_code!="404"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace="test",
|
||||
destination_workload="podinfo"
|
||||
}[1m]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
The above configuration validates the canary by checking
|
||||
if the HTTP 404 req/sec percentage is below 5 percent of the total traffic.
|
||||
If the 404s rate reaches the 5% threshold, then the canary fails.
|
||||
|
||||
When specifying a query, Flagger will run the promql query and convert the result to float64.
|
||||
Then it compares the query result value with the metric threshold value.
|
||||
|
||||
|
||||
### Webhooks
|
||||
|
||||
The canary analysis can be extended with webhooks.
|
||||
|
||||
@@ -27,6 +27,7 @@ const (
|
||||
CanaryKind = "Canary"
|
||||
ProgressDeadlineSeconds = 600
|
||||
AnalysisInterval = 60 * time.Second
|
||||
MetricInterval = "1m"
|
||||
)
|
||||
|
||||
// +genclient
|
||||
@@ -113,6 +114,8 @@ type CanaryService struct {
|
||||
Hosts []string `json:"hosts"`
|
||||
Match []istiov1alpha3.HTTPMatchRequest `json:"match,omitempty"`
|
||||
Rewrite *istiov1alpha3.HTTPRewrite `json:"rewrite,omitempty"`
|
||||
Timeout string `json:"timeout,omitempty"`
|
||||
Retries *istiov1alpha3.HTTPRetry `json:"retries,omitempty"`
|
||||
}
|
||||
|
||||
// CanaryAnalysis is used to describe how the analysis should be done
|
||||
@@ -127,9 +130,11 @@ type CanaryAnalysis struct {
|
||||
|
||||
// CanaryMetric holds the reference to Istio metrics used for canary analysis
|
||||
type CanaryMetric struct {
|
||||
Name string `json:"name"`
|
||||
Interval string `json:"interval"`
|
||||
Threshold int `json:"threshold"`
|
||||
Name string `json:"name"`
|
||||
Interval string `json:"interval,omitempty"`
|
||||
Threshold float64 `json:"threshold"`
|
||||
// +optional
|
||||
Query string `json:"query,omitempty"`
|
||||
}
|
||||
|
||||
// CanaryWebhook holds the reference to external checks used for canary analysis
|
||||
@@ -170,3 +175,8 @@ func (c *Canary) GetAnalysisInterval() time.Duration {
|
||||
|
||||
return interval
|
||||
}
|
||||
|
||||
// GetMetricInterval returns the metric interval default value (1m)
|
||||
func (c *Canary) GetMetricInterval() string {
|
||||
return MetricInterval
|
||||
}
|
||||
|
||||
@@ -156,6 +156,11 @@ func (in *CanaryService) DeepCopyInto(out *CanaryService) {
|
||||
*out = new(istiov1alpha3.HTTPRewrite)
|
||||
**out = **in
|
||||
}
|
||||
if in.Retries != nil {
|
||||
in, out := &in.Retries, &out.Retries
|
||||
*out = new(istiov1alpha3.HTTPRetry)
|
||||
**out = **in
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -73,6 +74,38 @@ func (c *CanaryObserver) queryMetric(query string) (*vectorQueryResponse, error)
|
||||
return &values, nil
|
||||
}
|
||||
|
||||
// GetScalar runs the promql query and returns the first value found
|
||||
func (c *CanaryObserver) GetScalar(query string) (float64, error) {
|
||||
if c.metricsServer == "fake" {
|
||||
return 100, nil
|
||||
}
|
||||
|
||||
query = strings.Replace(query, "\n", "", -1)
|
||||
query = strings.Replace(query, " ", "", -1)
|
||||
|
||||
var value *float64
|
||||
result, err := c.queryMetric(query)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
for _, v := range result.Data.Result {
|
||||
metricValue := v.Value[1]
|
||||
switch metricValue.(type) {
|
||||
case string:
|
||||
f, err := strconv.ParseFloat(metricValue.(string), 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
value = &f
|
||||
}
|
||||
}
|
||||
if value == nil {
|
||||
return 0, fmt.Errorf("no values found for query %s", query)
|
||||
}
|
||||
return *value, nil
|
||||
}
|
||||
|
||||
// GetDeploymentCounter returns the requests success rate using istio_requests_total metric
|
||||
func (c *CanaryObserver) GetDeploymentCounter(name string, namespace string, metric string, interval string) (float64, error) {
|
||||
if c.metricsServer == "fake" {
|
||||
|
||||
@@ -195,6 +195,8 @@ func (c *CanaryRouter) syncVirtualService(cd *flaggerv1.Canary) error {
|
||||
{
|
||||
Match: cd.Spec.Service.Match,
|
||||
Rewrite: cd.Spec.Service.Rewrite,
|
||||
Timeout: cd.Spec.Service.Timeout,
|
||||
Retries: cd.Spec.Service.Retries,
|
||||
Route: route,
|
||||
},
|
||||
},
|
||||
@@ -309,6 +311,8 @@ func (c *CanaryRouter) SetRoutes(
|
||||
{
|
||||
Match: cd.Spec.Service.Match,
|
||||
Rewrite: cd.Spec.Service.Rewrite,
|
||||
Timeout: cd.Spec.Service.Timeout,
|
||||
Retries: cd.Spec.Service.Retries,
|
||||
Route: []istiov1alpha3.DestinationWeight{primary, canary},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -405,6 +405,10 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
|
||||
|
||||
// run metrics checks
|
||||
for _, metric := range r.Spec.CanaryAnalysis.Metrics {
|
||||
if metric.Interval == "" {
|
||||
metric.Interval = r.GetMetricInterval()
|
||||
}
|
||||
|
||||
if metric.Name == "istio_requests_total" {
|
||||
val, err := c.observer.GetDeploymentCounter(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
|
||||
if err != nil {
|
||||
@@ -436,6 +440,24 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if metric.Query != "" {
|
||||
val, err := c.observer.GetScalar(metric.Query)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "no values found") {
|
||||
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
|
||||
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
|
||||
} else {
|
||||
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.metricsServer, err)
|
||||
}
|
||||
return false
|
||||
}
|
||||
if val > float64(metric.Threshold) {
|
||||
c.recordEventWarningf(r, "Halt %s.%s advancement %s %.2f > %v",
|
||||
r.Name, r.Namespace, metric.Name, val, metric.Threshold)
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package version
|
||||
|
||||
var VERSION = "0.6.0"
|
||||
var VERSION = "0.7.0"
|
||||
var REVISION = "unknown"
|
||||
|
||||
@@ -45,6 +45,30 @@ spec:
|
||||
- name: istio_request_duration_seconds_bucket
|
||||
threshold: 500
|
||||
interval: 30s
|
||||
- name: "404s percentage"
|
||||
threshold: 5
|
||||
interval: 1m
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace=~"test",
|
||||
destination_workload=~"podinfo",
|
||||
response_code!="404"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace=~"test",
|
||||
destination_workload=~"podinfo"
|
||||
}[1m]
|
||||
)
|
||||
) * 100
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
|
||||
Reference in New Issue
Block a user