Add support for custom metrics #60

Merged: 5 commits merged on Feb 27, 2019
4 changes: 3 additions & 1 deletion artifacts/flagger/crd.yaml
@@ -91,7 +91,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
@@ -100,6 +100,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:
4 changes: 3 additions & 1 deletion charts/flagger/templates/crd.yaml
@@ -92,7 +92,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
@@ -101,6 +101,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:
43 changes: 43 additions & 0 deletions docs/gitbook/how-it-works.md
@@ -377,6 +377,49 @@ histogram_quantile(0.99,

> **Note** that the metric interval should be lower than or equal to the control loop interval.

### Custom Metrics

The canary analysis can be extended with custom Prometheus queries.

```yaml
canaryAnalysis:
threshold: 1
maxWeight: 50
stepWeight: 5
metrics:
- name: "404s percentage"
threshold: 5
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo"
}[1m]
)
) * 100
```

The above configuration validates the canary by checking that the HTTP 404 req/sec percentage
stays below 5 percent of the total traffic.
If the 404 rate reaches the 5% threshold, the canary analysis fails.

When a custom query is specified, Flagger runs the PromQL query and converts the result to float64,
then compares the result against the metric threshold value.
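
For illustration, the comparison works roughly like the following Go sketch (the `checkCustomMetric` helper and its signature are hypothetical, not part of Flagger's API):

```go
package main

import (
	"fmt"
	"strconv"
)

// checkCustomMetric mirrors the behaviour described above: the PromQL result
// (returned by Prometheus as a string) is parsed into a float64 and the check
// fails once the value exceeds the configured threshold.
func checkCustomMetric(queryResult string, threshold float64) (bool, error) {
	val, err := strconv.ParseFloat(queryResult, 64)
	if err != nil {
		return false, fmt.Errorf("parsing query result: %v", err)
	}
	return val <= threshold, nil
}

func main() {
	// e.g. a 404 percentage of 3.42 against a threshold of 5 passes the check
	ok, err := checkCustomMetric("3.42", 5)
	fmt.Println(ok, err) // true <nil>
}
```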


### Webhooks

The canary analysis can be extended with webhooks.
14 changes: 11 additions & 3 deletions pkg/apis/flagger/v1alpha3/types.go
@@ -27,6 +27,7 @@ const (
CanaryKind = "Canary"
ProgressDeadlineSeconds = 600
AnalysisInterval = 60 * time.Second
MetricInterval = "1m"
)

// +genclient
@@ -127,9 +128,11 @@ type CanaryAnalysis struct {

// CanaryMetric holds the reference to Istio metrics used for canary analysis
type CanaryMetric struct {
Name string `json:"name"`
Interval string `json:"interval"`
Threshold int `json:"threshold"`
Name string `json:"name"`
Interval string `json:"interval,omitempty"`
Threshold float64 `json:"threshold"`
// +optional
Query string `json:"query,omitempty"`
}

// CanaryWebhook holds the reference to external checks used for canary analysis
@@ -170,3 +173,8 @@ func (c *Canary) GetAnalysisInterval() time.Duration {

return interval
}

// GetMetricInterval returns the metric interval default value (1m)
func (c *Canary) GetMetricInterval() string {
return MetricInterval
}
33 changes: 33 additions & 0 deletions pkg/controller/observer.go
@@ -8,6 +8,7 @@ import (
"net/http"
"net/url"
"strconv"
"strings"
"time"
)

@@ -73,6 +74,38 @@ func (c *CanaryObserver) queryMetric(query string) (*vectorQueryResponse, error)
return &values, nil
}

// GetScalar runs the promql query and returns the first value found
func (c *CanaryObserver) GetScalar(query string) (float64, error) {
if c.metricsServer == "fake" {
return 100, nil
}

// strip newlines and spaces so the multi-line query from the canary spec
// is sent to Prometheus as a single-line PromQL expression
query = strings.Replace(query, "\n", "", -1)
query = strings.Replace(query, " ", "", -1)

var value *float64
result, err := c.queryMetric(query)
if err != nil {
return 0, err
}

for _, v := range result.Data.Result {
metricValue := v.Value[1]
switch metricValue.(type) {
case string:
f, err := strconv.ParseFloat(metricValue.(string), 64)
if err != nil {
return 0, err
}
value = &f
}
}
if value == nil {
return 0, fmt.Errorf("no values found for query %s", query)
}
return *value, nil
}

// GetDeploymentCounter returns the requests success rate using istio_requests_total metric
func (c *CanaryObserver) GetDeploymentCounter(name string, namespace string, metric string, interval string) (float64, error) {
if c.metricsServer == "fake" {
22 changes: 22 additions & 0 deletions pkg/controller/scheduler.go
@@ -405,6 +405,10 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {

// run metrics checks
for _, metric := range r.Spec.CanaryAnalysis.Metrics {
if metric.Interval == "" {
metric.Interval = r.GetMetricInterval()
}

if metric.Name == "istio_requests_total" {
val, err := c.observer.GetDeploymentCounter(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
@@ -436,6 +440,24 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
return false
}
}

if metric.Query != "" {
val, err := c.observer.GetScalar(metric.Query)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.metricsServer, err)
}
return false
}
if val > metric.Threshold {
c.recordEventWarningf(r, "Halt %s.%s advancement %s %.2f > %v",
r.Name, r.Namespace, metric.Name, val, metric.Threshold)
return false
}
}
}

return true
24 changes: 24 additions & 0 deletions test/e2e-tests.sh
@@ -45,6 +45,30 @@ spec:
- name: istio_request_duration_seconds_bucket
threshold: 500
interval: 30s
- name: "404s percentage"
threshold: 5
interval: 1m
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo"
}[1m]
)
) * 100
webhooks:
- name: load-test
url: http://flagger-loadtester.test/