Skip to content

Commit

Permalink
Merge pull request #60 from stefanprodan/custom-metrics
Browse files Browse the repository at this point in the history
 Add support for custom metrics
  • Loading branch information
stefanprodan committed Feb 27, 2019
2 parents 04a56a3 + 2e351fc commit 1662479
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 5 deletions.
4 changes: 3 additions & 1 deletion artifacts/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
Expand All @@ -100,6 +100,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:
Expand Down
4 changes: 3 additions & 1 deletion charts/flagger/templates/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
Expand All @@ -101,6 +101,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:
Expand Down
43 changes: 43 additions & 0 deletions docs/gitbook/how-it-works.md
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,49 @@ histogram_quantile(0.99,

> **Note** that the metric interval should be lower than or equal to the control loop interval.

### Custom Metrics

The canary analysis can be extended with custom Prometheus queries.

```yaml
canaryAnalysis:
threshold: 1
maxWeight: 50
stepWeight: 5
metrics:
- name: "404s percentage"
threshold: 5
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo"
}[1m]
)
) * 100
```

The above configuration validates the canary by checking
that the percentage of HTTP 404 responses does not exceed 5 percent of the total traffic.
If the 404s percentage goes above the 5% threshold, then the canary fails.

When specifying a query, Flagger will run the PromQL query and convert the result to float64.
Then it compares the query result with the metric threshold; the canary advancement is halted when the result is greater than the threshold.


### Webhooks

The canary analysis can be extended with webhooks.
Expand Down
14 changes: 11 additions & 3 deletions pkg/apis/flagger/v1alpha3/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const (
CanaryKind = "Canary"
ProgressDeadlineSeconds = 600
AnalysisInterval = 60 * time.Second
MetricInterval = "1m"
)

// +genclient
Expand Down Expand Up @@ -127,9 +128,11 @@ type CanaryAnalysis struct {

// CanaryMetric holds the reference to Istio metrics used for canary analysis.
// A metric may also carry a custom query; the scheduler then compares the
// query result with Threshold.
type CanaryMetric struct {
Name string `json:"name"`
Interval string `json:"interval"`
Threshold int `json:"threshold"`
Name string `json:"name"`
// Interval may be left empty; the scheduler then falls back to GetMetricInterval.
Interval string `json:"interval,omitempty"`
Threshold float64 `json:"threshold"`
// Query, when non-empty, is a custom query evaluated for this metric.
// +optional
Query string `json:"query,omitempty"`
}

// CanaryWebhook holds the reference to external checks used for canary analysis
Expand Down Expand Up @@ -170,3 +173,8 @@ func (c *Canary) GetAnalysisInterval() time.Duration {

return interval
}

// GetMetricInterval reports the interval used for a metric that does not
// specify one. It is always the package default, MetricInterval (1m).
func (c *Canary) GetMetricInterval() string {
	interval := MetricInterval
	return interval
}
33 changes: 33 additions & 0 deletions pkg/controller/observer.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net/http"
"net/url"
"strconv"
"strings"
"time"
)

Expand Down Expand Up @@ -73,6 +74,38 @@ func (c *CanaryObserver) queryMetric(query string) (*vectorQueryResponse, error)
return &values, nil
}

// GetScalar runs the promql query and returns the result as a float64.
//
// Newlines and spaces are stripped from the query before it is sent, so a
// multi-line query can be supplied as-is.
// NOTE(review): stripping every space also mangles PromQL that depends on
// whitespace (e.g. `a and b`, `offset 5m`) and label values containing
// spaces; confirm how queryMetric encodes the query before relaxing this.
//
// When the response holds several samples, the value of the last sample
// carrying a string value is returned. Samples that have fewer than two
// entries, or whose second entry is not a string, are ignored instead of
// causing a panic.
//
// An error is returned when the request fails, when a sample value cannot be
// parsed as a float, or when no value is found at all; the latter error
// contains the text "no values found".
//
// If the metrics server is set to "fake", no request is made and 100 is
// returned.
func (c *CanaryObserver) GetScalar(query string) (float64, error) {
	if c.metricsServer == "fake" {
		return 100, nil
	}

	// single pass equivalent of removing all "\n" and then all " "
	query = strings.NewReplacer("\n", "", " ", "").Replace(query)

	result, err := c.queryMetric(query)
	if err != nil {
		return 0, err
	}

	var (
		value float64
		found bool
	)
	for _, v := range result.Data.Result {
		// the sample value is read from index 1; guard against short samples
		if len(v.Value) < 2 {
			continue
		}
		raw, ok := v.Value[1].(string)
		if !ok {
			continue
		}
		parsed, err := strconv.ParseFloat(raw, 64)
		if err != nil {
			return 0, err
		}
		value, found = parsed, true
	}
	if !found {
		return 0, fmt.Errorf("no values found for query %s", query)
	}
	return value, nil
}

// GetDeploymentCounter returns the requests success rate using istio_requests_total metric
func (c *CanaryObserver) GetDeploymentCounter(name string, namespace string, metric string, interval string) (float64, error) {
if c.metricsServer == "fake" {
Expand Down
22 changes: 22 additions & 0 deletions pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,10 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {

// run metrics checks
for _, metric := range r.Spec.CanaryAnalysis.Metrics {
if metric.Interval == "" {
metric.Interval = r.GetMetricInterval()
}

if metric.Name == "istio_requests_total" {
val, err := c.observer.GetDeploymentCounter(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
Expand Down Expand Up @@ -436,6 +440,24 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
return false
}
}

if metric.Query != "" {
val, err := c.observer.GetScalar(metric.Query)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.metricsServer, err)
}
return false
}
if val > float64(metric.Threshold) {
c.recordEventWarningf(r, "Halt %s.%s advancement %s %.2f > %v",
r.Name, r.Namespace, metric.Name, val, metric.Threshold)
return false
}
}
}

return true
Expand Down
24 changes: 24 additions & 0 deletions test/e2e-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,30 @@ spec:
- name: istio_request_duration_seconds_bucket
threshold: 500
interval: 30s
- name: "404s percentage"
threshold: 5
interval: 1m
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo"
}[1m]
)
) * 100
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
Expand Down

0 comments on commit 1662479

Please sign in to comment.