Add support for custom metrics #60

Merged: 5 commits merged on Feb 27, 2019
4 changes: 3 additions & 1 deletion artifacts/flagger/crd.yaml
@@ -91,7 +91,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
@@ -100,6 +100,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:
4 changes: 3 additions & 1 deletion charts/flagger/templates/crd.yaml
@@ -92,7 +92,7 @@ spec:
properties:
items:
type: object
required: ['name', 'interval', 'threshold']
required: ['name', 'threshold']
properties:
name:
type: string
@@ -101,6 +101,8 @@ spec:
pattern: "^[0-9]+(m|s)"
threshold:
type: number
query:
type: string
webhooks:
type: array
properties:
43 changes: 43 additions & 0 deletions docs/gitbook/how-it-works.md
@@ -377,6 +377,49 @@ histogram_quantile(0.99,

> **Note** that the metric interval should be lower than or equal to the control loop interval.

### Custom Metrics

The canary analysis can be extended with custom Prometheus queries.

```yaml
canaryAnalysis:
threshold: 1
maxWeight: 50
stepWeight: 5
metrics:
- name: "404s percentage"
threshold: 5
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="test",
destination_workload="podinfo"
}[1m]
)
) * 100
```

The above configuration validates the canary by checking that the HTTP 404 req/sec percentage
stays below 5 percent of the total traffic.
If the 404 rate reaches the 5% threshold, the canary analysis fails.

When a custom query is specified, Flagger runs the PromQL query and converts the result to float64,
then compares the result against the metric threshold value.
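
For illustration, the comparison works roughly like the following Go sketch (the `checkCustomMetric` helper and its signature are hypothetical, not part of Flagger's API):

```go
package main

import (
	"fmt"
	"strconv"
)

// checkCustomMetric mirrors the behaviour described above: the PromQL result
// (returned by Prometheus as a string) is parsed into a float64 and the check
// fails once the value exceeds the configured threshold.
func checkCustomMetric(queryResult string, threshold float64) (bool, error) {
	val, err := strconv.ParseFloat(queryResult, 64)
	if err != nil {
		return false, fmt.Errorf("parsing query result: %v", err)
	}
	return val <= threshold, nil
}

func main() {
	// e.g. a 404 percentage of 3.42 against a threshold of 5 passes the check
	ok, err := checkCustomMetric("3.42", 5)
	fmt.Println(ok, err) // true <nil>
}
```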


### Webhooks

The canary analysis can be extended with webhooks.
14 changes: 11 additions & 3 deletions pkg/apis/flagger/v1alpha3/types.go
@@ -27,6 +27,7 @@ const (
CanaryKind = "Canary"
ProgressDeadlineSeconds = 600
AnalysisInterval = 60 * time.Second
MetricInterval = "1m"
)

// +genclient
@@ -127,9 +128,11 @@ type CanaryAnalysis struct {

// CanaryMetric holds the reference to Istio metrics used for canary analysis
type CanaryMetric struct {
Name string `json:"name"`
Interval string `json:"interval"`
Threshold int `json:"threshold"`
Name string `json:"name"`
Interval string `json:"interval,omitempty"`
Threshold float64 `json:"threshold"`
// +optional
Query string `json:"query,omitempty"`
}

// CanaryWebhook holds the reference to external checks used for canary analysis
@@ -170,3 +173,8 @@ func (c *Canary) GetAnalysisInterval() time.Duration {

return interval
}

// GetMetricInterval returns the metric interval default value (1m)
func (c *Canary) GetMetricInterval() string {
return MetricInterval
}
33 changes: 33 additions & 0 deletions pkg/controller/observer.go
@@ -8,6 +8,7 @@ import (
"net/http"
"net/url"
"strconv"
"strings"
"time"
)

@@ -73,6 +74,38 @@ func (c *CanaryObserver) queryMetric(query string) (*vectorQueryResponse, error)
return &values, nil
}

// GetScalar runs the promql query and returns the first value found
func (c *CanaryObserver) GetScalar(query string) (float64, error) {
if c.metricsServer == "fake" {
return 100, nil
}

// strip newlines and spaces so the multi-line query from the canary spec
// is sent to Prometheus as a single-line PromQL expression
query = strings.Replace(query, "\n", "", -1)
query = strings.Replace(query, " ", "", -1)

var value *float64
result, err := c.queryMetric(query)
if err != nil {
return 0, err
}

for _, v := range result.Data.Result {
metricValue := v.Value[1]
switch metricValue.(type) {
case string:
f, err := strconv.ParseFloat(metricValue.(string), 64)
if err != nil {
return 0, err
}
value = &f
}
}
if value == nil {
return 0, fmt.Errorf("no values found for query %s", query)
}
return *value, nil
}

// GetDeploymentCounter returns the requests success rate using istio_requests_total metric
func (c *CanaryObserver) GetDeploymentCounter(name string, namespace string, metric string, interval string) (float64, error) {
if c.metricsServer == "fake" {
22 changes: 22 additions & 0 deletions pkg/controller/scheduler.go
@@ -405,6 +405,10 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {

// run metrics checks
for _, metric := range r.Spec.CanaryAnalysis.Metrics {
if metric.Interval == "" {
metric.Interval = r.GetMetricInterval()
}

if metric.Name == "istio_requests_total" {
val, err := c.observer.GetDeploymentCounter(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
@@ -436,6 +440,24 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
return false
}
}

if metric.Query != "" {
val, err := c.observer.GetScalar(metric.Query)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.metricsServer, err)
}
return false
}
if val > metric.Threshold {
c.recordEventWarningf(r, "Halt %s.%s advancement %s %.2f > %v",
r.Name, r.Namespace, metric.Name, val, metric.Threshold)
return false
}
}
}

return true
24 changes: 24 additions & 0 deletions test/e2e-tests.sh
@@ -45,6 +45,30 @@ spec:
- name: istio_request_duration_seconds_bucket
threshold: 500
interval: 30s
- name: "404s percentage"
threshold: 5
interval: 1m
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"test",
destination_workload=~"podinfo"
}[1m]
)
) * 100
webhooks:
- name: load-test
url: http://flagger-loadtester.test/