Skip to content

Commit

Permalink
Rollback canary based on the deployment progress deadline check
Browse files Browse the repository at this point in the history
- determine if the canary deployment is stuck by checking if there is a minimum replicas unavailable condition and if the last update time exceeds the deadline
- set progress deadline default value to 60 seconds
  • Loading branch information
stefanprodan committed Nov 27, 2018
1 parent 5a79402 commit 4cb5ceb
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 43 deletions.
102 changes: 72 additions & 30 deletions pkg/controller/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
Expand Down Expand Up @@ -62,37 +63,58 @@ func (c *CanaryDeployer) Promote(cd *flaggerv1.Canary) error {
return nil
}

// IsReady checks the primary and canary deployment status and returns an error if
// the deployments are in the middle of a rolling update or if the pods are unhealthy
func (c *CanaryDeployer) IsReady(cd *flaggerv1.Canary) error {
canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{})
// IsPrimaryReady checks the primary deployment status and returns an error if
// the deployment is in the middle of a rolling update or if the pods are unhealthy
// it will return a non-retriable error if the rolling update is stuck
func (c *CanaryDeployer) IsPrimaryReady(cd *flaggerv1.Canary) (bool, error) {
primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name)
primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
return fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace)
return true, fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace)
}
return fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err)
return true, fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err)
}
if msg, healthy := c.getDeploymentStatus(canary); !healthy {
return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg)

retriable, err := c.isDeploymentReady(primary, cd.GetProgressDeadlineSeconds())
if err != nil {
if retriable {
return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error())
} else {
return retriable, err
}
}

primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name)
primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{})
if primary.Spec.Replicas == int32p(0) {
return true, fmt.Errorf("halt %s.%s advancement primary deployment is scaled to zero",
cd.Name, cd.Namespace)
}
return true, nil
}

// IsCanaryReady checks the canary deployment status and returns an error if
// the deployment is in the middle of a rolling update or if the pods are unhealthy
// it will return a non-retriable error if the rolling update is stuck
func (c *CanaryDeployer) IsCanaryReady(cd *flaggerv1.Canary) (bool, error) {
canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
return fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace)
return true, fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace)
}
return fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err)
}
if msg, healthy := c.getDeploymentStatus(primary); !healthy {
return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg)
return true, fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err)
}

if primary.Spec.Replicas == int32p(0) {
return fmt.Errorf("halt %s.%s advancement %s",
cd.Name, cd.Namespace, "primary deployment is scaled to zero")
retriable, err := c.isDeploymentReady(canary, cd.GetProgressDeadlineSeconds())
if err != nil {
if retriable {
return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error())
} else {
return retriable, fmt.Errorf("deployment does not have minimum availability for more than %vs",
cd.GetProgressDeadlineSeconds())
}
}
return nil

return true, nil
}

// IsNewSpec returns true if the canary deployment pod spec has changed
Expand Down Expand Up @@ -140,7 +162,7 @@ func (c *CanaryDeployer) SetFailedChecks(cd *flaggerv1.Canary, val int) error {
}

// SetState updates the canary status state
func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state string) error {
func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state flaggerv1.CanaryState) error {
cd.Status.State = state
cd.Status.LastTransitionTime = metav1.Now()
cd, err := c.flaggerClient.FlaggerV1alpha1().Canaries(cd.Namespace).Update(cd)
Expand Down Expand Up @@ -324,26 +346,46 @@ func (c *CanaryDeployer) createPrimaryHpa(cd *flaggerv1.Canary) error {
return nil
}

func (c *CanaryDeployer) getDeploymentStatus(deployment *appsv1.Deployment) (string, bool) {
// isDeploymentReady determines if a deployment is ready by checking the status conditions
// if a deployment has exceeded the progress deadline it returns a non retriable error
func (c *CanaryDeployer) isDeploymentReady(deployment *appsv1.Deployment, deadline int) (bool, error) {
retriable := true

// Determine if the deployment is stuck by checking if there is a minimum replicas unavailable condition
// and if the last update time exceeds the deadline
if deployment.Generation <= deployment.Status.ObservedGeneration {
progress := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing)
if progress != nil {
available := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentAvailable)
if available != nil && available.Status == "False" && available.Reason == "MinimumReplicasUnavailable" {
from := available.LastUpdateTime
delta := time.Duration(deadline) * time.Second
retriable = !from.Add(delta).Before(time.Now())
}
}
}

if deployment.Generation <= deployment.Status.ObservedGeneration {
cond := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing)

if cond != nil && cond.Reason == "ProgressDeadlineExceeded" {
return fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.GetName()), false
return false, fmt.Errorf("deployment %q exceeded its progress deadline", deployment.GetName())
} else if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
return fmt.Sprintf("waiting for rollout to finish: %d out of %d new replicas have been updated",
deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas), false
return retriable, fmt.Errorf("waiting for rollout to finish: %d out of %d new replicas have been updated",
deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas)
} else if deployment.Status.Replicas > deployment.Status.UpdatedReplicas {
return fmt.Sprintf("waiting for rollout to finish: %d old replicas are pending termination",
deployment.Status.Replicas-deployment.Status.UpdatedReplicas), false
return retriable, fmt.Errorf("waiting for rollout to finish: %d old replicas are pending termination",
deployment.Status.Replicas-deployment.Status.UpdatedReplicas)
} else if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas {
return fmt.Sprintf("waiting for rollout to finish: %d of %d updated replicas are available",
deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas), false
return retriable, fmt.Errorf("waiting for rollout to finish: %d of %d updated replicas are available",
deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas)
}

} else {
return "waiting for rollout to finish: observed deployment generation less then desired generation", false
return true, fmt.Errorf("waiting for rollout to finish: observed deployment generation less then desired generation")
}

return "ready", true
return true, nil
}

func (c *CanaryDeployer) getDeploymentCondition(
Expand Down
8 changes: 4 additions & 4 deletions pkg/controller/deployer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ func TestCanaryDeployer_SetState(t *testing.T) {
t.Fatal(err.Error())
}

err = deployer.SetState(canary, "running")
err = deployer.SetState(canary, v1alpha1.CanaryRunning)
if err != nil {
t.Fatal(err.Error())
}
Expand All @@ -392,8 +392,8 @@ func TestCanaryDeployer_SetState(t *testing.T) {
t.Fatal(err.Error())
}

if res.Status.State != "running" {
t.Errorf("Got %v wanted %v", res.Status.State, "running")
if res.Status.State != v1alpha1.CanaryRunning {
t.Errorf("Got %v wanted %v", res.Status.State, v1alpha1.CanaryRunning)
}
}

Expand All @@ -419,7 +419,7 @@ func TestCanaryDeployer_SyncStatus(t *testing.T) {
}

status := v1alpha1.CanaryStatus{
State: "running",
State: v1alpha1.CanaryRunning,
FailedChecks: 2,
}
err = deployer.SyncStatus(canary, status)
Expand Down
35 changes: 26 additions & 9 deletions pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ func (c *Controller) advanceCanary(name string, namespace string) {
maxWeight = cd.Spec.CanaryAnalysis.MaxWeight
}

// check primary and canary deployments status
if err := c.deployer.IsReady(cd); err != nil {
// check primary deployment status
if _, err := c.deployer.IsPrimaryReady(cd); err != nil {
c.recordEventWarningf(cd, "%v", err)
return
}
Expand All @@ -81,10 +81,26 @@ func (c *Controller) advanceCanary(name string, namespace string) {
c.recorder.SetDuration(cd, time.Since(begin))
}()

// check canary deployment status
retriable, err := c.deployer.IsCanaryReady(cd)
if err != nil && retriable {
c.recordEventWarningf(cd, "%v", err)
return
}

// check if the number of failed checks reached the threshold
if cd.Status.State == "running" && cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold {
c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v",
cd.Name, cd.Namespace, cd.Status.FailedChecks)
if cd.Status.State == flaggerv1.CanaryRunning &&
(!retriable || cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold) {

if cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold {
c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v",
cd.Name, cd.Namespace, cd.Status.FailedChecks)
}

if !retriable {
c.recordEventWarningf(cd, "Rolling back %s.%s progress deadline exceeded %v",
cd.Name, cd.Namespace, err)
}

// route all traffic back to primary
primaryRoute.Weight = 100
Expand All @@ -105,10 +121,11 @@ func (c *Controller) advanceCanary(name string, namespace string) {
}

// mark canary as failed
if err := c.deployer.SetState(cd, "failed"); err != nil {
if err := c.deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryFailed}); err != nil {
c.logger.Errorf("%v", err)
return
}

c.recorder.SetStatus(cd)
c.sendNotification(cd.Spec.TargetRef.Name, cd.Namespace,
"Canary analysis failed, rollback finished.", true)
Expand Down Expand Up @@ -177,7 +194,7 @@ func (c *Controller) advanceCanary(name string, namespace string) {
}

// update status
if err := c.deployer.SetState(cd, "finished"); err != nil {
if err := c.deployer.SetState(cd, flaggerv1.CanaryFinished); err != nil {
c.recordEventWarningf(cd, "%v", err)
return
}
Expand All @@ -194,7 +211,7 @@ func (c *Controller) checkCanaryStatus(cd *flaggerv1.Canary, deployer CanaryDepl
}

if cd.Status.State == "" {
if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: "initialized"}); err != nil {
if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryInitialized}); err != nil {
c.logger.Errorf("%v", err)
return false
}
Expand All @@ -213,7 +230,7 @@ func (c *Controller) checkCanaryStatus(cd *flaggerv1.Canary, deployer CanaryDepl
c.recordEventErrorf(cd, "%v", err)
return false
}
if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: "running"}); err != nil {
if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryRunning}); err != nil {
c.logger.Errorf("%v", err)
return false
}
Expand Down

0 comments on commit 4cb5ceb

Please sign in to comment.