diff --git a/pkg/controller/deployer.go b/pkg/controller/deployer.go index 3130b6f0b..2f608f7e5 100644 --- a/pkg/controller/deployer.go +++ b/pkg/controller/deployer.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -62,37 +63,58 @@ func (c *CanaryDeployer) Promote(cd *flaggerv1.Canary) error { return nil } -// IsReady checks the primary and canary deployment status and returns an error if -// the deployments are in the middle of a rolling update or if the pods are unhealthy -func (c *CanaryDeployer) IsReady(cd *flaggerv1.Canary) error { - canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{}) +// IsPrimaryReady checks the primary deployment status and returns an error if +// the deployment is in the middle of a rolling update or if the pods are unhealthy +// it will return a non retriable error if the rolling update is stuck +func (c *CanaryDeployer) IsPrimaryReady(cd *flaggerv1.Canary) (bool, error) { + primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name) + primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{}) if err != nil { if errors.IsNotFound(err) { - return fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace) + return true, fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace) } - return fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err) + return true, fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err) } - if msg, healthy := c.getDeploymentStatus(canary); !healthy { - return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg) + + retriable, err := c.isDeploymentReady(primary, cd.GetProgressDeadlineSeconds()) + if err != nil { + if retriable { + return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error()) + } else { + return retriable, err + } } - primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name) - primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{}) + if primary.Spec.Replicas == int32p(0) { + return true, fmt.Errorf("halt %s.%s advancement primary deployment is scaled to zero", + cd.Name, cd.Namespace) + } + return true, nil +} + +// IsCanaryReady checks the primary deployment status and returns an error if +// the deployment is in the middle of a rolling update or if the pods are unhealthy +// it will return a non retriable error if the rolling update is stuck +func (c *CanaryDeployer) IsCanaryReady(cd *flaggerv1.Canary) (bool, error) { + canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{}) if err != nil { if errors.IsNotFound(err) { - return fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace) + return true, fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace) } - return fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err) - } - if msg, healthy := c.getDeploymentStatus(primary); !healthy { - return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg) + return true, fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err) } - if primary.Spec.Replicas == int32p(0) { - return fmt.Errorf("halt %s.%s advancement %s", - cd.Name, cd.Namespace, "primary deployment is scaled to zero") + retriable, err := c.isDeploymentReady(canary, cd.GetProgressDeadlineSeconds()) + if err != nil { + if retriable { + return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error()) + } else { + return retriable, fmt.Errorf("deployment does not have minimum availability for more than %vs", + cd.GetProgressDeadlineSeconds()) + } } - return nil + + return true, nil } // IsNewSpec returns true if the canary deployment pod spec has changed @@ -140,7 +162,7 @@ func (c *CanaryDeployer) SetFailedChecks(cd *flaggerv1.Canary, val int) error { } // SetState updates the canary status state -func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state string) error { +func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state flaggerv1.CanaryState) error { cd.Status.State = state cd.Status.LastTransitionTime = metav1.Now() cd, err := c.flaggerClient.FlaggerV1alpha1().Canaries(cd.Namespace).Update(cd) @@ -324,26 +346,46 @@ func (c *CanaryDeployer) createPrimaryHpa(cd *flaggerv1.Canary) error { return nil } -func (c *CanaryDeployer) getDeploymentStatus(deployment *appsv1.Deployment) (string, bool) { +// isDeploymentReady determines if a deployment is ready by checking the status conditions +// if a deployment has exceeded the progress deadline it returns a non retriable error +func (c *CanaryDeployer) isDeploymentReady(deployment *appsv1.Deployment, deadline int) (bool, error) { + retriable := true + + // Determine if the deployment is stuck by checking if there is a minimum replicas unavailable condition + // and if the last update time exceeds the deadline + if deployment.Generation <= deployment.Status.ObservedGeneration { + progress := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing) + if progress != nil { + available := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentAvailable) + if available != nil && available.Status == "False" && available.Reason == "MinimumReplicasUnavailable" { + from := available.LastUpdateTime + delta := time.Duration(deadline) * time.Second + retriable = !from.Add(delta).Before(time.Now()) + } + } + } + if deployment.Generation <= deployment.Status.ObservedGeneration { cond := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing) + if cond != nil && cond.Reason == "ProgressDeadlineExceeded" { - return fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.GetName()), false + return false, fmt.Errorf("deployment %q exceeded its progress deadline", deployment.GetName()) } else if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas { - return fmt.Sprintf("waiting for rollout to finish: %d out of %d new replicas have been updated", - deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas), false + return retriable, fmt.Errorf("waiting for rollout to finish: %d out of %d new replicas have been updated", + deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas) } else if deployment.Status.Replicas > deployment.Status.UpdatedReplicas { - return fmt.Sprintf("waiting for rollout to finish: %d old replicas are pending termination", - deployment.Status.Replicas-deployment.Status.UpdatedReplicas), false + return retriable, fmt.Errorf("waiting for rollout to finish: %d old replicas are pending termination", + deployment.Status.Replicas-deployment.Status.UpdatedReplicas) } else if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas { - return fmt.Sprintf("waiting for rollout to finish: %d of %d updated replicas are available", - deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas), false + return retriable, fmt.Errorf("waiting for rollout to finish: %d of %d updated replicas are available", + deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas) } + } else { - return "waiting for rollout to finish: observed deployment generation less then desired generation", false + return true, fmt.Errorf("waiting for rollout to finish: observed deployment generation less then desired generation") } - return "ready", true + return true, nil } func (c *CanaryDeployer) getDeploymentCondition( diff --git a/pkg/controller/deployer_test.go b/pkg/controller/deployer_test.go index 9171057cd..639a3b5d3 100644 --- a/pkg/controller/deployer_test.go +++ b/pkg/controller/deployer_test.go @@ -382,7 +382,7 @@ func TestCanaryDeployer_SetState(t *testing.T) { t.Fatal(err.Error()) } - err = deployer.SetState(canary, "running") + err = deployer.SetState(canary, v1alpha1.CanaryRunning) if err != nil { t.Fatal(err.Error()) } @@ -392,8 +392,8 @@ func TestCanaryDeployer_SetState(t *testing.T) { t.Fatal(err.Error()) } - if res.Status.State != "running" { - t.Errorf("Got %v wanted %v", res.Status.State, "running") + if res.Status.State != v1alpha1.CanaryRunning { + t.Errorf("Got %v wanted %v", res.Status.State, v1alpha1.CanaryRunning) } } @@ -419,7 +419,7 @@ func TestCanaryDeployer_SyncStatus(t *testing.T) { } status := v1alpha1.CanaryStatus{ - State: "running", + State: v1alpha1.CanaryRunning, FailedChecks: 2, } err = deployer.SyncStatus(canary, status) diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index ff5e7b4ad..f2f2a6096 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -56,8 +56,8 @@ func (c *Controller) advanceCanary(name string, namespace string) { maxWeight = cd.Spec.CanaryAnalysis.MaxWeight } - // check primary and canary deployments status - if err := c.deployer.IsReady(cd); err != nil { + // check primary deployment status + if _, err := c.deployer.IsPrimaryReady(cd); err != nil { c.recordEventWarningf(cd, "%v", err) return } @@ -81,10 +81,26 @@ func (c *Controller) advanceCanary(name string, namespace string) { c.recorder.SetDuration(cd, time.Since(begin)) }() + // check canary deployment status + retriable, err := c.deployer.IsCanaryReady(cd) + if err != nil && retriable { + c.recordEventWarningf(cd, "%v", err) + return + } + // check if the number of failed checks reached the threshold - if cd.Status.State == "running" && cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold { - c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v", - cd.Name, cd.Namespace, cd.Status.FailedChecks) + if cd.Status.State == flaggerv1.CanaryRunning && + (!retriable || cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold) { + + if cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold { + c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v", + cd.Name, cd.Namespace, cd.Status.FailedChecks) + } + + if !retriable { + c.recordEventWarningf(cd, "Rolling back %s.%s progress deadline exceeded %v", + cd.Name, cd.Namespace, err) + } // route all traffic back to primary primaryRoute.Weight = 100 @@ -105,10 +121,11 @@ func (c *Controller) advanceCanary(name string, namespace string) { } // mark canary as failed - if err := c.deployer.SetState(cd, "failed"); err != nil { + if err := c.deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryFailed}); err != nil { c.logger.Errorf("%v", err) return } + c.recorder.SetStatus(cd) c.sendNotification(cd.Spec.TargetRef.Name, cd.Namespace, "Canary analysis failed, rollback finished.", true) @@ -177,7 +194,7 @@ func (c *Controller) advanceCanary(name string, namespace string) { } // update status - if err := c.deployer.SetState(cd, "finished"); err != nil { + if err := c.deployer.SetState(cd, flaggerv1.CanaryFinished); err != nil { c.recordEventWarningf(cd, "%v", err) return } @@ -194,7 +211,7 @@ func (c *Controller) checkCanaryStatus(cd *flaggerv1.Canary, deployer CanaryDepl } if cd.Status.State == "" { - if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: "initialized"}); err != nil { + if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryInitialized}); err != nil { c.logger.Errorf("%v", err) return false } @@ -213,7 +230,7 @@ func (c *Controller) checkCanaryStatus(cd *flaggerv1.Canary, deployer CanaryDepl c.recordEventErrorf(cd, "%v", err) return false } - if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: "running"}); err != nil { + if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryRunning}); err != nil { c.logger.Errorf("%v", err) return false }