Skip to content

Commit

Permalink
fix: incorrect task status update by deployer
Browse files Browse the repository at this point in the history
After deployer uninstall pods, it doesn't stop monitoring.
Thus, the deployer thinks that pods' status is unknown and updates
tasks' status as fail. This bug is fixed.
  • Loading branch information
myungjin committed Jun 2, 2023
1 parent 675439f commit 4039aca
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 2 deletions.
7 changes: 7 additions & 0 deletions cmd/deployer/app/deployer/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type TaskHealthDetails struct {
Status v1.PodStatus `json:"status"`
JobName string `json:"job_name"`
CreationTime time.Time `json:"creation_time"`
UnknownCount int
}

type K8sDeployer struct {
Expand Down Expand Up @@ -178,6 +179,7 @@ func (deployer *K8sDeployer) GetMonitoredPodStatuses() (map[string]TaskHealthDet
return nil, err
} else if len(out.Items) == 0 {
pod.Status = v1.PodStatus{Phase: v1.PodUnknown}
pod.UnknownCount++
} else {
if len(out.Items) > 1 {
sort.Slice(out.Items, func(i, j int) bool {
Expand All @@ -196,8 +198,12 @@ func (deployer *K8sDeployer) GetMonitoredPodStatuses() (map[string]TaskHealthDet

if lastPod.Status.Phase == v1.PodPending || lastPod.Status.Phase == v1.PodRunning {
pod.Status = lastPod.Status
// reset unknown count
pod.UnknownCount = 0
} else if lastPod.Status.Phase == v1.PodSucceeded {
deployer.DeleteTaskFromMonitoring(pod.TaskID)
} else if lastPod.Status.Phase == v1.PodUnknown {
pod.UnknownCount++
} else {
pod.Status = lastPod.Status
}
Expand All @@ -206,6 +212,7 @@ func (deployer *K8sDeployer) GetMonitoredPodStatuses() (map[string]TaskHealthDet

// update the pod status
deployer.DeployedTasks[pod.TaskID] = pod
zap.S().Infof("Pod status of task %s = %s", pod.TaskID, pod.Status.Phase)
}

return deployer.DeployedTasks, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ func (r *resourceHandler) monitorPods() {
} else if len(taskHealthDetails) > 0 {
for _, pod := range taskHealthDetails {
if pod.Status.Phase == v1.PodPending ||
pod.Status.Phase == v1.PodRunning {
pod.Status.Phase == v1.PodRunning ||
pod.UnknownCount <= 3 {
// we will tolerate up to three times of unknown phase
continue
}

Expand All @@ -56,7 +58,7 @@ func (r *resourceHandler) monitorPods() {
}

r.dplyr.DeleteTaskFromMonitoring(pod.TaskID)
zap.S().Info("task %s failed; pod status: %s", pod.TaskID, pod.Status.Phase)
zap.S().Infof("task %s failed; pod status: %s", pod.TaskID, pod.Status.Phase)
}
}

Expand Down
4 changes: 4 additions & 0 deletions cmd/deployer/app/resource_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,10 @@ func (r *resourceHandler) revokeResource(jobId string) (err error) {
continue
}
taskStatuses[taskId] = openapi.AGENT_REVOKE_SUCCESS

// stop monitoring of task
r.dplyr.DeleteTaskFromMonitoring(taskId)

// 2.delete all the task resource specification files
deploymentChartPath := filepath.Join(r.deploymentDirPath, jobId, taskId)
removeErr := os.RemoveAll(deploymentChartPath)
Expand Down

0 comments on commit 4039aca

Please sign in to comment.