From 7525f99d7ddfffc0e2cafa7c4452566238140392 Mon Sep 17 00:00:00 2001 From: Wen Zhou Date: Tue, 21 Nov 2023 13:49:48 +0100 Subject: [PATCH] [backport]: changes from rhods_2.4 to rhods_2.5 (#129) * [cherry-pick]: split workbenches image into 2 params.env files Signed-off-by: Wen Zhou * Update opendatahub label (cherry picked from commit 3e975f9188527b980f32a240020cb657389e4617) (cherry picked from commit 9f8b6492a2250e131289cd9b5e26a27c46441aff) * Update Codeflare manifests path (cherry picked from commit 014396cc70dfb4a59b53000227db9a4ff6177aaf) (cherry picked from commit 5f1c0d47b267d834d16d6eaa920e7caebdbefc3e) * Move creation of default DSC (cherry picked from commit ab3310987325e089b7f928808a7f31024665fc1e) (cherry picked from commit 00ddd6c771b98be8fafa3d342caf2c1ef2b68623) * update(manifests): enable kserve, modelmesh and workbenches - dashboard and modelmesh-monitoring still come from odh-manifests Signed-off-by: Wen Zhou * Fix cherry-pick for dsci * fix(mm): set the new logic for modelmesh Signed-off-by: Wen Zhou * Fix the KF deployment: * fix(monitoring): add the switch for dev mode to not send alerts Signed-off-by: Wen Zhou (cherry picked from commit 001cad1018c0b07c2be6aa5692a4d0815f2efd6a) * refactor: reduce alert level for codeflare operator * Update(manifests): for monitoring - remove https:// for dashboard target - add nwp from odh-deployer - fix: wrong service name for operator, this is defined in CSV - port: do not use https but 8080 Signed-off-by: Wen Zhou * Fix manifests for monitoring (cherry picked from commit 85883f102bc15f2343c0f6afe253a29a4ff3f64f) * Revert changes to prometheus port Changes to the prometheus port make the route inaccessible * fix rebase * fix(dsci): missing label on namespaces (#98) - add SM which is in modelmesh-monitoring into operator monitoring - add roles which are in modelmesh-monitoring into ours too - apply 3 labels to both monitoring and application namespace (which is what v1 does) Signed-off-by: Wen Zhou * fix(monitoring): typo (#101) Signed-off-by: Wen Zhou * update(monitoring) - remove hardcoded app. namespace in segment manifests - remove hardcoded monitoring namespace in base manifests - add placeholder to inject monitoring namespace in Servicemonitor Signed-off-by: Wen Zhou * uplift: package version - github.com/operator-framework/operator-lifecycle-manager/releases/tag/v0.26.0 - github.com/openshift/api to latest v0.0.0 Signed-off-by: Wen Zhou * Remove odh csv * fix(crd): do not set ownerreference on CRD (#725) - we covered the case when a component is set from Managed to Removed - this is to cover the case when a component is Managed and the DSC CR is deleted - so if we do not set it at first, it won't get deleted Signed-off-by: Wen Zhou (cherry picked from commit e9461e0348c577329a3c596c9d63e74888eb079e) * Fix DSCI Patch * update(monitoring): metrics (#107) * update(monitoring): - add log in pod for QE to see it is a dev mode cluster - add two metrics: I do not think they are used in this config but they are present in the v1 config, so I add them back - move workbench recording rules to the correct rule file - remove operator-alerting.rules, it is not used in v1, to keep it simple - fix: openshift-monitoring is using web as the port name, so use it as our port - add more comments for the config and comment out config that is not needed - add egress for odh monitoring and add cluster monitoring NS for ingress - keep rhods_aggregate_availability from prometheusrules along with the 2 user metrics; the reason for this is: PSI does not get non openshift-* or kube-* NS metrics into cluster-monitoring prometheus,
as cluster-monitoring prometheus-k8s only uses prometheusrule, not servicemonitor? - from test result: if our monitoring ns is not set in cluster-monitoring, there are no targets on federation2 and no rhods_aggregated metrics - fix(monitoring): removed duplicated dashboard alerts in workbenches - add UWM ns for operator ingress - according to the doc: when UWM is enabled we should not have a custom Prometheus, this might be the conflict why we cannot see metrics from odh monitoring in cluster-monitoring prometheus? Signed-off-by: Wen Zhou * Remove DSCI explicit naming * Fix regression in Prometheus Deployment * Remove os.exit for custom functions * Delete legacy blackbox exporter * fix(monitoring): add missing role and rolebinding for prometheus (#112) Signed-off-by: Wen Zhou * fix(monitoring): add missing new files into kustomization (#113) Signed-off-by: Wen Zhou * cleanup(monitoring): after previous 2 commits this is not needed/useful (#114) Signed-off-by: Wen Zhou * fix(monitoring): do not set odh monitoring namespace when applying manifests in "monitoring/base" (#115) * fix(monitoring): do not set our monitoring namespace when applying to the monitoring/base folder - hardcode our monitoring namespace for all needed manifests Signed-off-by: Wen Zhou * revert: label changes made in upgrade PR Signed-off-by: Wen Zhou * fix(monitoring): cannot load dashboard record rules (#123) Signed-off-by: Wen Zhou * fix(monitoring): when DSC is removed, the entry in rule_files should be cleaned up - match does not work with * in the string, need to use (.*) - add (-) in front to differentiate the rule_file from the real rules Signed-off-by: Wen Zhou * cherry-pick: Edson's RHODS-12939 from odh + debug + timeout tuning; comment out ExponentialBackoffWithContext for now to test; do not add v2 into markedDeletion list Signed-off-by: Wen Zhou * fix(upgrade): modelmesh monitoring deployment needs deletion as well Signed-off-by: Wen Zhou * fix: add statefulset Signed-off-by: Wen Zhou * cherry-pick: upstream 748, fix no reconcile when no error is returned Signed-off-by: Wen Zhou * RHODS-12956: removing CR update from the operator reconciliation loop to avoid infinite loop (#128) * chore Signed-off-by: Wen Zhou --------- Signed-off-by: Wen Zhou Co-authored-by: Vaishnavi Hire Co-authored-by: Dimitri Saridakis Co-authored-by: Edson Tirelli (cherry picked from commit 81ebc87bb232fa24aee4d9c11a10b8ead5eac9b8) --- Makefile | 2 +- .../rhods-operator.clusterserviceversion.yaml | 5 +- components/codeflare/codeflare.go | 2 +- components/component.go | 2 +- .../modelmeshserving/modelmeshserving.go | 2 +- components/workbenches/workbenches.go | 1 + .../alertmanager/alertmanager-configs.yaml | 1 + .../alertmanager/alertmanager-service.yaml | 2 +- config/monitoring/base/kustomization.yaml | 2 + .../base/rhods-prometheus-role.yaml | 17 ++ .../base/rhods-prometheus-rolebinding.yaml | 14 ++ .../base/rhods-prometheusrules.yaml | 14 ++ .../monitoring/base/rhods-servicemonitor.yaml | 77 +++++- .../networkpolicy/monitoring/monitoring.yaml | 16 +- .../networkpolicy/operator/operator.yaml | 6 + .../prometheus/apps/prometheus-configs.yaml | 180 ++++++++----- .../segment/segment-key-config.yaml | 1 - .../segment/segment-key-secret.yaml | 1 - config/rbac/role.yaml | 1 + .../datasciencecluster_controller.go | 31 +-- .../datasciencecluster/kubebuilder_rbac.go | 14 +- .../dscinitialization_controller.go | 1 + controllers/dscinitialization/monitoring.go | 11 +- controllers/dscinitialization/utils.go | 4 +- get_all_manifests.sh | 32 +-- go.mod | 10 +- go.sum | 19 +- main.go | 11 +-
pkg/plugins/addLabelsplugin.go | 6 + pkg/upgrade/upgrade.go | 238 +++++++++++++++--- 30 files changed, 533 insertions(+), 190 deletions(-) create mode 100644 config/monitoring/base/rhods-prometheus-role.yaml create mode 100644 config/monitoring/base/rhods-prometheus-rolebinding.yaml diff --git a/Makefile b/Makefile index 0a4a97d17b1..e2644ba9323 100644 --- a/Makefile +++ b/Makefile @@ -164,7 +164,7 @@ run: manifests generate fmt vet ## Run a controller from your host. go run ./main.go .PHONY: image-build -image-build: unit-test ## Build image with the manager. +image-build: # unit-test ## Build image with the manager. $(IMAGE_BUILDER) build --no-cache -f Dockerfiles/Dockerfile ${IMAGE_BUILD_FLAGS} -t $(IMG) . .PHONY: image-push diff --git a/bundle/manifests/rhods-operator.clusterserviceversion.yaml b/bundle/manifests/rhods-operator.clusterserviceversion.yaml index af666c848bb..55b5cf01ffe 100644 --- a/bundle/manifests/rhods-operator.clusterserviceversion.yaml +++ b/bundle/manifests/rhods-operator.clusterserviceversion.yaml @@ -73,9 +73,9 @@ metadata: "metadata": { "labels": { "app.kubernetes.io/created-by": "opendatahub-operator", - "app.kubernetes.io/instance": "default-feature", + "app.kubernetes.io/instance": "default", "app.kubernetes.io/managed-by": "kustomize", - "app.kubernetes.io/name": "featuretracker", + "app.kubernetes.io/name": "default-feature", "app.kubernetes.io/part-of": "opendatahub-operator" }, "name": "default-feature" @@ -607,6 +607,7 @@ spec: verbs: - create - delete + - get - list - update - watch diff --git a/components/codeflare/codeflare.go b/components/codeflare/codeflare.go index 00edab78a24..6e1435c524a 100644 --- a/components/codeflare/codeflare.go +++ b/components/codeflare/codeflare.go @@ -77,7 +77,7 @@ func (c *CodeFlare) ReconcileComponent(cli client.Client, owner metav1.Object, d } if found, err := deploy.OperatorExists(cli, dependentOperator); err != nil { - return err + return fmt.Errorf("operator exists throws error %v", err) } else if found { return fmt.Errorf("operator %s found. 
Please uninstall the operator before enabling %s component", dependentOperator, ComponentName) diff --git a/components/component.go b/components/component.go index 332db804f57..20761572fa3 100644 --- a/components/component.go +++ b/components/component.go @@ -106,7 +106,7 @@ func (c *Component) UpdatePrometheusConfig(cli client.Client, enable bool, compo DeadManSnitchRules string `yaml:"deadmanssnitch-alerting.rules"` CFRRules string `yaml:"codeflare-recording.rules"` CRARules string `yaml:"codeflare-alerting.rules"` - DashboardRRules string `yaml:"rhods-dashboard-recording.rule"` + DashboardRRules string `yaml:"rhods-dashboard-recording.rules"` DashboardARules string `yaml:"rhods-dashboard-alerting.rules"` DSPRRules string `yaml:"data-science-pipelines-operator-recording.rules"` DSPARules string `yaml:"data-science-pipelines-operator-alerting.rules"` diff --git a/components/modelmeshserving/modelmeshserving.go b/components/modelmeshserving/modelmeshserving.go index 37826fabe11..5320f552fa2 100644 --- a/components/modelmeshserving/modelmeshserving.go +++ b/components/modelmeshserving/modelmeshserving.go @@ -117,7 +117,7 @@ func (m *ModelMeshServing) ReconcileComponent(cli client.Client, owner metav1.Ob // For odh-model-controller if enabled { - err := cluster.UpdatePodSecurityRolebinding(cli, dscispec.ApplicationsNamespace, "odh-model-controller") + err := cluster.UpdatePodSecurityRolebinding(cli, "odh-model-controller", dscispec.ApplicationsNamespace) if err != nil { return err } diff --git a/components/workbenches/workbenches.go b/components/workbenches/workbenches.go index f65eb03aba5..57d8db32afe 100644 --- a/components/workbenches/workbenches.go +++ b/components/workbenches/workbenches.go @@ -138,6 +138,7 @@ func (w *Workbenches) ReconcileComponent(cli client.Client, owner metav1.Object, if enabled { if dscispec.DevFlags.ManifestsUri == "" && len(w.DevFlags.Manifests) == 0 { if platform == deploy.ManagedRhods || platform == deploy.SelfManagedRhods { + // for kf-notebook-controller image if err := deploy.ApplyParams(notebookControllerPath, w.SetImageParamsMap(imageParamMap), false); err != nil { return err } diff --git a/config/monitoring/alertmanager/alertmanager-configs.yaml b/config/monitoring/alertmanager/alertmanager-configs.yaml index 07ac6cda0e0..545f2d51555 100644 --- a/config/monitoring/alertmanager/alertmanager-configs.yaml +++ b/config/monitoring/alertmanager/alertmanager-configs.yaml @@ -629,6 +629,7 @@ data: smtp_require_tls: true # The root route on which each incoming alert enters. 
+ # TODO: check why email_to is needed route: group_by: ['alertname', 'cluster', 'service', 'job', 'email_to'] diff --git a/config/monitoring/alertmanager/alertmanager-service.yaml b/config/monitoring/alertmanager/alertmanager-service.yaml index ecceda2fd21..416f2c9b5f8 100644 --- a/config/monitoring/alertmanager/alertmanager-service.yaml +++ b/config/monitoring/alertmanager/alertmanager-service.yaml @@ -6,7 +6,7 @@ metadata: labels: name: alertmanager name: alertmanager - namespace: "redhat-ods-monitoring" + namespace: redhat-ods-monitoring spec: ports: - name: alertmanager diff --git a/config/monitoring/base/kustomization.yaml b/config/monitoring/base/kustomization.yaml index 6b64f364653..105add29843 100644 --- a/config/monitoring/base/kustomization.yaml +++ b/config/monitoring/base/kustomization.yaml @@ -4,3 +4,5 @@ resources: - cluster-monitor-rolebinding.yaml - rhods-prometheusrules.yaml - rhods-servicemonitor.yaml +- rhods-prometheus-role.yaml +- rhods-prometheus-rolebinding.yaml diff --git a/config/monitoring/base/rhods-prometheus-role.yaml b/config/monitoring/base/rhods-prometheus-role.yaml new file mode 100644 index 00000000000..c0b4f68def1 --- /dev/null +++ b/config/monitoring/base/rhods-prometheus-role.yaml @@ -0,0 +1,17 @@ +# this is the role for cluster-monitoring to read the rhods prometheus service via the cluster-monitoring service account +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: rhods-prometheus-cluster-monitoring-viewer + namespace: redhat-ods-monitoring +rules: + - verbs: + - get + - watch + - list + apiGroups: + - '' + resources: + - pods + - services + - endpoints diff --git a/config/monitoring/base/rhods-prometheus-rolebinding.yaml b/config/monitoring/base/rhods-prometheus-rolebinding.yaml new file mode 100644 index 00000000000..a1268de5fd9 --- /dev/null +++ b/config/monitoring/base/rhods-prometheus-rolebinding.yaml @@ -0,0 +1,14 @@ +# this is the rolebinding to rhods-prometheus-cluster-monitoring-viewer for cluster-monitoring to read the rhods prometheus service +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: rhods-prometheus-cluster-monitoring-viewer-binding + namespace: redhat-ods-monitoring +subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: openshift-monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: rhods-prometheus-cluster-monitoring-viewer \ No newline at end of file diff --git a/config/monitoring/base/rhods-prometheusrules.yaml b/config/monitoring/base/rhods-prometheusrules.yaml index 2b7fd310ed6..2464c395f3e 100644 --- a/config/monitoring/base/rhods-prometheusrules.yaml +++ b/config/monitoring/base/rhods-prometheusrules.yaml @@ -1,3 +1,8 @@ +# rhods_aggregate_availability, rhods_total_users, rhods_active_users should not be needed +# they should come not from the traditional prometheus pod but from the prometheus operator +# but to get PSI to work with some, put them here +# TODO: revisit when we decommission the customized prometheus instance +--- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -6,19 +11,28 @@ metadata: role: recording-rules app: rhods name: rhods-rules + namespace: redhat-ods-monitoring spec: groups: - name: rhods-usage.rules rules: - record: cluster:usage:consumption:rhods:cpu:seconds:rate1h expr: sum(rate(container_cpu_usage_seconds_total{container="",pod=~"jupyter-nb.*",namespace="rhods-notebooks"}[1h])) + labels: + instance: jupyter-notebooks - record: cluster:usage:consumption:rhods:pod:up expr:
count(kube_pod_container_status_ready{namespace="rhods-notebooks", pod=~"jupyter-nb.*",container=~"jupyter-nb-.*"}==1) + labels: + instance: jupyter-notebooks - record: cluster:usage:consumption:rhods:active_users expr: count(kube_statefulset_replicas{namespace=~"rhods-notebooks", statefulset=~"jupyter-nb-.*"} ==1) labels: instance: jupyter-notebooks - record: cluster:usage:consumption:rhods:cpu_requests_runtime expr: sum(kube_pod_container_resource_requests{namespace="rhods-notebooks",resource="cpu", container=~"jupyter-nb-.*"} * on(pod) kube_pod_status_phase{phase="Running", namespace="rhods-notebooks"}) + labels: + instance: jupyter-notebooks - record: cluster:usage:consumption:rhods:cpu_limits_runtime expr: sum(kube_pod_container_resource_limits{namespace="rhods-notebooks",resource="cpu", container=~"jupyter-nb-.*"} * on(pod) kube_pod_status_phase{phase="Running", namespace="rhods-notebooks"}) + labels: + instance: jupyter-notebooks \ No newline at end of file diff --git a/config/monitoring/base/rhods-servicemonitor.yaml b/config/monitoring/base/rhods-servicemonitor.yaml index 0e18b565de6..7449ef02150 100644 --- a/config/monitoring/base/rhods-servicemonitor.yaml +++ b/config/monitoring/base/rhods-servicemonitor.yaml @@ -2,6 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: rhods-monitor-federation + namespace: redhat-ods-monitoring labels: monitor-component: rhods-resources team: rhods @@ -27,32 +28,86 @@ spec: interval: 30s namespaceSelector: matchNames: - - redhat-ods-monitoring + - selector: matchLabels: app: prometheus --- # servicemonitoring for rhods operator +# this is not in use, we need to implement operator metrics in logic first +# apiVersion: monitoring.coreos.com/v1 +# kind: ServiceMonitor +# metadata: +# labels: +# control-plane: controller-manager +# name: rhods-controller-manager-metrics-monitor +# namespace: redhat-ods-operator +# spec: +# endpoints: +# - path: /metrics +# port: metrics +# scheme: https +# bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +# tlsConfig: +# insecureSkipVerify: true +# params: +# 'match[]': +# - '{__name__= "redhat-ods-operator-controller-manager-metrics-service"}' +# namespaceSelector: +# matchNames: +# - redhat-ods-operator +# selector: +# matchLabels: +# control-plane: controller-manager + +--- +# servicemonitoring for openshift-monitoring scrap +# move from modelmesh-monitoring +# this one is duplicated as the old modelmesh-federated-metrics +# in order to keep metrics there if user set modelmesh to Removed apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: + name: rhods-monitor-federation2 + namespace: redhat-ods-monitoring labels: - control-plane: controller-manager - name: rhods-controller-manager-metrics-monitor - namespace: redhat-ods-operator + monitor-component: rhods-resources + team: rhods spec: endpoints: - - path: /metrics - port: '8080' - scheme: https + - interval: 30s + params: + 'match[]': + - '{__name__= "haproxy_backend_http_average_response_latency_milliseconds"}' + - '{__name__= "haproxy_backend_http_responses_total"}' + - '{__name__= "container_cpu_usage_seconds_total"}' + - '{__name__= "container_memory_working_set_bytes"}' + - '{__name__= "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate"}' + - '{__name__= "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits"}' + - '{__name__= "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests"}' + - '{__name__= 
"cluster:namespace:pod_memory:active:kube_pod_container_resource_requests"}' + - '{__name__= "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits"}' + - '{__name__= "kube_persistentvolumeclaim_resource_requests_storage_bytes"}' + - '{__name__= "kubelet_volume_stats_used_bytes"}' + - '{__name__= "kubelet_volume_stats_capacity_bytes"}' + honorLabels: true + scrapeTimeout: 10s bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + bearerTokenSecret: + key: "" + path: /federate + port: web + scheme: https tlsConfig: + ca: {} + cert: {} insecureSkipVerify: true namespaceSelector: matchNames: - - redhat-ods-operator + - openshift-monitoring selector: matchLabels: - control-plane: controller-manager - - + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: openshift-monitoring \ No newline at end of file diff --git a/config/monitoring/networkpolicy/monitoring/monitoring.yaml b/config/monitoring/networkpolicy/monitoring/monitoring.yaml index 0563981b620..ba39672e7e0 100644 --- a/config/monitoring/networkpolicy/monitoring/monitoring.yaml +++ b/config/monitoring/networkpolicy/monitoring/monitoring.yaml @@ -2,6 +2,8 @@ # the services residing in redhat-ods-monitoring. namespaceSelector # ensures that traffic from only the desired namespaces is allowed # 9114 for blackbox or user_facing_endpoints* all down +# 9115 for blackbox health +# 10443 and 9091 for web --- kind: NetworkPolicy apiVersion: networking.k8s.io/v1 @@ -30,5 +32,17 @@ spec: - namespaceSelector: matchLabels: kubernetes.io/metadata.name: openshift-monitoring + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: openshift-user-workload-monitoring + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: redhat-ods-operator + - namespaceSelector: + matchLabels: + opendatahub.io/generated-namespace: "true" + egress: + - {} policyTypes: - - Ingress \ No newline at end of file + - Ingress + - Egress \ No newline at end of file diff --git a/config/monitoring/networkpolicy/operator/operator.yaml b/config/monitoring/networkpolicy/operator/operator.yaml index 8c9f8d9e122..a61b5a46f2a 100644 --- a/config/monitoring/networkpolicy/operator/operator.yaml +++ b/config/monitoring/networkpolicy/operator/operator.yaml @@ -16,5 +16,11 @@ spec: - namespaceSelector: matchLabels: kubernetes.io/metadata.name: openshift-monitoring + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: openshift-user-workload-monitoring + - namespaceSelector: + matchLabels: + opendatahub.io/generated-namespace: "true" policyTypes: - Ingress \ No newline at end of file diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index 5ebb0e3b266..5f29fa98910 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -7,8 +7,8 @@ metadata: data: prometheus.yml: | rule_files: - - 'operator-recording.rules' - - 'deadmanssnitch-alerting.rules' + - operator-recording.rules + - deadmanssnitch-alerting.rules global: scrape_interval: 10s @@ -232,6 +232,9 @@ data: scheme: http kubernetes_sd_configs: - role: endpoints + namespaces: + names: + - relabel_configs: - source_labels: [__meta_kubernetes_service_name] regex: ^(ds-pipeline-.*)$ @@ -273,7 +276,7 @@ data: - role: endpoints namespaces: names: - - + - relabel_configs: - source_labels: [__meta_kubernetes_service_name] regex: 
^(codeflare-operator-manager-metrics)$ @@ -330,14 +333,14 @@ data: - name: DeadManSnitch interval: 1m rules: - - alert: DeadManSnitch - expr: vector(1) - labels: - severity: critical - annotations: - description: This is a DeadManSnitch to ensure RHODS monitoring and alerting pipeline is online. - summary: Alerting DeadManSnitch - + - alert: DeadManSnitch + expr: vector(1) + labels: + severity: critical + namespace: redhat-ods-monitoring + annotations: + description: This is a DeadManSnitch to ensure RHODS monitoring and alerting pipeline is online. + summary: Alerting DeadManSnitch codeflare-recording.rules: | groups: @@ -419,7 +422,7 @@ data: codeflare-alerting.rules: | groups: - - name: SLOs-probe_success + - name: SLOs-probe_success_codeflare rules: - alert: CodeFlare Operator Probe Success Burn Rate annotations: @@ -433,6 +436,7 @@ data: for: 2m labels: severity: info + namespace: redhat-ods-applications - alert: CodeFlare Operator Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -445,6 +449,7 @@ data: for: 15m labels: severity: info + namespace: redhat-ods-applications - alert: CodeFlare Operator Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -457,6 +462,7 @@ data: for: 1h labels: severity: info + namespace: redhat-ods-applications - name: Distributed Workloads CodeFlare interval: 1m @@ -464,7 +470,8 @@ data: - alert: CodeFlare Operator is not running expr: absent(up{job=~'CodeFlare Operator'}) or up{job=~'CodeFlare Operator'} != 1 labels: - severity: info + severity: info + namespace: redhat-ods-applications annotations: description: This alert fires when the CodeFlare Operator is not running. triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' @@ -472,7 +479,8 @@ data: - alert: CodeFlare Operator taking too long to be up expr: absent_over_time(up{job="CodeFlare Operator"}[2m]) == 1 labels: - severity: info + severity: info + namespace: redhat-ods-applications annotations: description: This alert fires when the CodeFlare Operator takes over 2 min. to come back online. Either CodeFlare Operator is not running and failing to become ready, is misconfigured, or the metrics endpoint is not responding. triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md' @@ -583,6 +591,7 @@ data: for: 2m labels: severity: critical + namespace: redhat-ods-applications - alert: RHODS Dashboard Route Error Burn Rate annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' @@ -595,6 +604,7 @@ data: for: 15m labels: severity: critical + namespace: redhat-ods-applications - alert: RHODS Dashboard Route Error Burn Rate annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' @@ -607,6 +617,7 @@ data: for: 1h labels: severity: warning + namespace: redhat-ods-applications - alert: RHODS Dashboard Route Error Burn Rate annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
@@ -619,7 +630,8 @@ data: for: 3h labels: severity: warning - - name: SLOs-probe_success + namespace: redhat-ods-applications + - name: SLOs-probe_success_dashboard rules: - alert: RHODS Dashboard Probe Success Burn Rate annotations: @@ -633,6 +645,7 @@ data: for: 2m labels: severity: critical + namespace: redhat-ods-applications - alert: RHODS Dashboard Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -645,6 +658,7 @@ data: for: 15m labels: severity: critical + namespace: redhat-ods-applications - alert: RHODS Dashboard Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -657,6 +671,7 @@ data: for: 1h labels: severity: warning + namespace: redhat-ods-applications - alert: RHODS Dashboard Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -669,6 +684,7 @@ data: for: 3h labels: severity: warning + namespace: redhat-ods-applications data-science-pipelines-operator-recording.rules: | groups: @@ -778,6 +794,7 @@ data: for: 2m labels: severity: info + namespace: redhat-ods-applications - alert: Data Science Pipelines Application Route Error Burn Rate annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' @@ -790,6 +807,7 @@ data: for: 15m labels: severity: info + namespace: redhat-ods-applications - alert: Data Science Pipelines Application Route Error Burn Rate annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' @@ -802,6 +820,7 @@ data: for: 1h labels: severity: info + namespace: redhat-ods-applications - alert: Data Science Pipelines Application Route Error Burn Rate annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' @@ -814,7 +833,8 @@ data: for: 3h labels: severity: info - - name: SLOs-probe_success + namespace: redhat-ods-applications + - name: SLOs-probe_success_dsp rules: - alert: Data Science Pipelines Operator Probe Success Burn Rate annotations: @@ -828,6 +848,7 @@ data: for: 2m labels: severity: critical + namespace: redhat-ods-applications - alert: Data Science Pipelines Operator Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -840,6 +861,7 @@ data: for: 15m labels: severity: critical + namespace: redhat-ods-applications - alert: Data Science Pipelines Operator Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -852,6 +874,7 @@ data: for: 1h labels: severity: warning + namespace: redhat-ods-applications - name: RHODS Data Science Pipelines rules: - alert: Data Science Pipeline Application Unavailable @@ -863,6 +886,7 @@ data: for: 2m labels: severity: info + namespace: redhat-ods-applications - alert: Data Science Pipeline APIServer Unavailable annotations: message: 'Data Science Pipelines APIServer component is down!' @@ -872,6 +896,7 @@ data: for: 2m labels: severity: info + namespace: redhat-ods-applications - alert: Data Science Pipeline PersistenceAgent Unavailable annotations: message: 'Data Science Pipelines PersistenceAgent component is down!' 
@@ -881,6 +906,7 @@ data: for: 2m labels: severity: info + namespace: redhat-ods-applications - alert: Data Science Pipeline ScheduledWorkflows Unavailable annotations: message: 'Data Science Pipelines ScheduledWorkflows component is down!' @@ -890,6 +916,7 @@ data: for: 2m labels: severity: info + namespace: redhat-ods-applications model-mesh-recording.rules: | groups: @@ -938,7 +965,7 @@ data: model-mesh-alerting.rules: | groups: - - name: SLOs-probe_success + - name: SLOs-probe_success_modelmesh rules: - alert: Modelmesh Controller Probe Success Burn Rate annotations: @@ -952,6 +979,7 @@ data: for: 2m labels: severity: critical + namespace: redhat-ods-applications - alert: Modelmesh Controller Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -964,6 +992,7 @@ data: for: 15m labels: severity: critical + namespace: redhat-ods-applications - alert: Modelmesh Controller Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -976,6 +1005,7 @@ data: for: 1h labels: severity: warning + namespace: redhat-ods-applications odh-model-controller-recording.rules: | groups: @@ -1024,7 +1054,7 @@ data: odh-model-controller-alerting.rules: | groups: - - name: SLOs-probe_success + - name: SLOs-probe_success_model_controller rules: - alert: ODH Model Controller Probe Success Burn Rate annotations: @@ -1038,6 +1068,7 @@ data: for: 2m labels: severity: critical + namespace: redhat-ods-applications - alert: ODH Model Controller Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -1050,6 +1081,7 @@ data: for: 15m labels: severity: critical + namespace: redhat-ods-applications - alert: ODH Model Controller Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
@@ -1062,6 +1094,7 @@ data: for: 1h labels: severity: warning + namespace: redhat-ods-applications ray-alerting.rules: | groups: @@ -1117,6 +1150,44 @@ data: instance: notebook-spawner record: probe_success:burnrate6h + - name: Usage Metrics + rules: + - expr: count(kube_statefulset_replicas{namespace=~"rhods-notebooks", statefulset=~"jupyter-nb-.*"}) + record: rhods_total_users + labels: + instance: jupyter-notebooks + - expr: count(kube_statefulset_replicas{namespace=~"rhods-notebooks", statefulset=~"jupyter-nb-.*"} ==1) + record: rhods_active_users + labels: + instance: jupyter-notebooks + - record: cluster:usage:consumption:rhods:cpu:seconds:rate1h + expr: sum(rate(container_cpu_usage_seconds_total{container="",pod=~"jupyter-nb.*",namespace="rhods-notebooks"}[1h])) + labels: + instance: jupyter-notebooks + - record: cluster:usage:consumption:rhods:pod:up + expr: count(kube_pod_container_status_ready{namespace="rhods-notebooks", pod=~"jupyter-nb.*",container=~"jupyter-nb-.*"}==1) + labels: + instance: jupyter-notebooks + - record: cluster:usage:consumption:rhods:active_users + expr: count(kube_statefulset_replicas{namespace=~"rhods-notebooks", statefulset=~"jupyter-nb-.*"} ==1) + labels: + instance: jupyter-notebooks + - record: cluster:usage:consumption:rhods:cpu_requests_runtime + expr: sum(kube_pod_container_resource_requests{namespace="rhods-notebooks",resource="cpu", container=~"jupyter-nb-.*"} * on(pod) kube_pod_status_phase{phase="Running", namespace="rhods-notebooks"}) + labels: + instance: jupyter-notebooks + - record: cluster:usage:consumption:rhods:cpu_limits_runtime + expr: sum(kube_pod_container_resource_limits{namespace="rhods-notebooks",resource="cpu", container=~"jupyter-nb-.*"} * on(pod) kube_pod_status_phase{phase="Running", namespace="rhods-notebooks"}) + labels: + instance: jupyter-notebooks + + - name: Availability Metrics + rules: + - expr: ((min(probe_success{name=~"rhods-dashboard|notebook-spawner"}) by (name) or on() vector(0)) or label_replace(min(probe_success{name=~"rhods-dashboard|notebook-spawner"}), "name", "combined", "name", ".*")) + record: rhods_aggregate_availability + labels: + instance: jupyter-notebooks + workbenches-alerting.rules: | groups: - name: RHODS-PVC-Usage @@ -1141,24 +1212,31 @@ data: labels: severity: warning route: user-notifications - - - name: Usage Metrics + + - name: RHODS Notebook controllers rules: - - expr: count(kube_statefulset_replicas{namespace=~"rhods-notebooks", statefulset=~"jupyter-nb-.*"}) - record: rhods_total_users + - alert: Kubeflow notebook controller pod is not running + annotations: + message: 'Kubeflow Notebook controller is down!' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-kfnbc-notebook-controller-alert.md" + summary: Kubeflow notebook controller pod is not running + expr: absent(up{job=~'Kubeflow Notebook Controller Service Metrics'}) + for: 5m labels: - instance: jupyter-notebooks - - expr: count(kube_statefulset_replicas{namespace=~"rhods-notebooks", statefulset=~"jupyter-nb-.*"} ==1) - record: rhods_active_users + severity: warning + namespace: redhat-ods-applications + - alert: ODH notebook controller pod is not running + annotations: + message: 'ODH notebook controller is down!' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-odh-notebook-controller-alert.md" + summary: ODH notebook controller pod is not running + expr: absent(up{job=~'ODH Notebook Controller Service Metrics'}) + for: 5m labels: - instance: jupyter-notebooks - - - name: Availability Metrics - rules: - - expr: ((min(probe_success{name=~"rhods-dashboard|notebook-spawner"}) by (name) or on() vector(0)) or label_replace(min(probe_success{name=~"rhods-dashboard|notebook-spawner"}), "name", "combined", "name", ".*")) - record: rhods_aggregate_availability + severity: warning + namespace: redhat-ods-applications - - name: SLOs-probe_success + - name: SLOs-probe_success_workbench rules: - alert: RHODS Jupyter Probe Success Burn Rate annotations: @@ -1172,6 +1250,7 @@ data: for: 2m labels: severity: critical + instance: notebook-spawner - alert: RHODS Jupyter Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -1184,6 +1263,7 @@ data: for: 15m labels: severity: critical + instance: notebook-spawner - alert: RHODS Jupyter Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -1196,6 +1276,7 @@ data: for: 1h labels: severity: warning + instance: notebook-spawner - alert: RHODS Jupyter Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -1208,36 +1289,5 @@ data: for: 3h labels: severity: warning - - alert: RHODS Dashboard Probe Success Burn Rate - annotations: - message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' - triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md" - summary: RHODS Dashboard Probe Success Burn Rate - expr: | - sum(probe_success:burnrate5m{name=~"rhods-dashboard"}) by (name) > (14.40 * (1-0.99950)) - and - sum(probe_success:burnrate1h{name=~"rhods-dashboard"}) by (name) > (14.40 * (1-0.99950)) - for: 2m - labels: - severity: critical - - - name: RHODS Notebook controllers - rules: - - alert: Kubeflow notebook controller pod is not running - annotations: - message: 'Kubeflow Notebook controller is down!' - triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-kfnbc-notebook-controller-alert.md" - summary: Kubeflow notebook controller pod is not running - expr: absent(up{job=~'Kubeflow Notebook Controller Service Metrics'}) - for: 5m - labels: - severity: warning - - alert: ODH notebook controller pod is not running - annotations: - message: 'ODH notebook controller is down!' 
- triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-odh-notebook-controller-alert.md" - summary: ODH notebook controller pod is not running - expr: absent(up{job=~'ODH Notebook Controller Service Metrics'}) - for: 5m - labels: - severity: warning + instance: notebook-spawner + diff --git a/config/monitoring/segment/segment-key-config.yaml b/config/monitoring/segment/segment-key-config.yaml index f1e6472385c..dc59518c96f 100644 --- a/config/monitoring/segment/segment-key-config.yaml +++ b/config/monitoring/segment/segment-key-config.yaml @@ -2,6 +2,5 @@ apiVersion: v1 kind: ConfigMap metadata: name: odh-segment-key-config - namespace: redhat-ods-applications data: segmentKeyEnabled: "true" \ No newline at end of file diff --git a/config/monitoring/segment/segment-key-secret.yaml b/config/monitoring/segment/segment-key-secret.yaml index 9e20ad67d0b..b6f855967e3 100644 --- a/config/monitoring/segment/segment-key-secret.yaml +++ b/config/monitoring/segment/segment-key-secret.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: odh-segment-key - namespace: redhat-ods-applications type: Opaque data: segmentKey: S1JVaG9BSUVwV2xHdXo0c1dpeGFlMXZBWEtLR2xENUs= \ No newline at end of file diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index d4ec0ac7785..eb4c416e902 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -402,6 +402,7 @@ rules: verbs: - create - delete + - get - list - update - watch diff --git a/controllers/datasciencecluster/datasciencecluster_controller.go b/controllers/datasciencecluster/datasciencecluster_controller.go index f1470d0ee71..203ea0fddf0 100644 --- a/controllers/datasciencecluster/datasciencecluster_controller.go +++ b/controllers/datasciencecluster/datasciencecluster_controller.go @@ -92,9 +92,11 @@ func (r *DataScienceClusterReconciler) Reconcile(ctx context.Context, req ctrl.R // Owned objects are automatically garbage collected. For additional cleanup logic use operatorUninstall function. 
// Return and don't requeue if upgrade.HasDeleteConfigMap(r.Client) { - return reconcile.Result{}, fmt.Errorf("error while operator uninstall: %v", - upgrade.OperatorUninstall(r.Client, r.RestConfig)) + if uninstallErr := upgrade.OperatorUninstall(r.Client, r.RestConfig); uninstallErr != nil { + return ctrl.Result{}, fmt.Errorf("error while operator uninstall: %v", uninstallErr) + } } + return ctrl.Result{}, nil } @@ -198,13 +200,6 @@ func (r *DataScienceClusterReconciler) Reconcile(ctx context.Context, req ctrl.R } } - // Ensure all omitted components show up as explicitly disabled - instance, err = r.updateComponents(ctx, instance) - if err != nil { - _ = r.reportError(err, instance, "error updating list of components in the CR") - return ctrl.Result{}, err - } - // Initialize error list, instead of returning errors after every component is deployed var componentErrors *multierror.Error @@ -252,6 +247,7 @@ func (r *DataScienceClusterReconciler) reconcileSubComponent(ctx context.Context component components.ComponentInterface, ) (*dsc.DataScienceCluster, error) { componentName := component.GetComponentName() + enabled := component.GetManagementState() == v1.Managed // First set conditions to reflect a component is about to be reconciled instance, err := r.updateStatus(ctx, instance, func(saved *dsc.DataScienceCluster) { @@ -372,23 +368,6 @@ func (r *DataScienceClusterReconciler) updateStatus(ctx context.Context, origina return saved, err } -func (r *DataScienceClusterReconciler) updateComponents(ctx context.Context, original *dsc.DataScienceCluster) (*dsc.DataScienceCluster, error) { - saved := &dsc.DataScienceCluster{} - err := retry.RetryOnConflict(retry.DefaultRetry, func() error { - err := r.Client.Get(ctx, client.ObjectKeyFromObject(original), saved) - if err != nil { - return err - } - - // Try to update - err = r.Client.Update(context.TODO(), saved) - // Return err itself here (not wrapped inside another error) - // so that RetryOnConflict can identify it correctly. 
- return err - }) - return saved, err -} - func (r *DataScienceClusterReconciler) watchDataScienceClusterResources(a client.Object) (requests []reconcile.Request) { instanceList := &dsc.DataScienceClusterList{} err := r.Client.List(context.TODO(), instanceList) diff --git a/controllers/datasciencecluster/kubebuilder_rbac.go b/controllers/datasciencecluster/kubebuilder_rbac.go index c8445329b08..6dbfaba89fe 100644 --- a/controllers/datasciencecluster/kubebuilder_rbac.go +++ b/controllers/datasciencecluster/kubebuilder_rbac.go @@ -103,6 +103,18 @@ package datasciencecluster // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=podmonitors,verbs=get;create;delete;update;watch;list;patch // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheusrules,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/finalizers,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/status,verbs=get;create;patch;delete;deletecollection + +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=alertmanagers,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=alertmanagers/finalizers,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=alertmanagers/status,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=alertmanagerconfigs,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=thanosrulers,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=thanosrulers/finalizers,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=thanosrulers/status,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=probes,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheusrules,verbs=get;create;patch;delete;deletecollection //+kubebuilder:rbac:groups=trustyai.opendatahub.io.trustyai.opendatahub.io,resources=trustyaiservices,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=trustyai.opendatahub.io.trustyai.opendatahub.io,resources=trustyaiservices/status,verbs=get;update;patch @@ -171,7 +183,7 @@ package datasciencecluster // +kubebuilder:rbac:groups="core",resources=events,verbs=get;create;watch;update;list;patch;delete // +kubebuilder:rbac:groups="events.k8s.io",resources=events,verbs=list;watch;patch;delete -// +kubebuilder:rbac:groups="core",resources=endpoints,verbs=watch;list;create;update;delete +// +kubebuilder:rbac:groups="core",resources=endpoints,verbs=watch;list;get;create;update;delete // +kubebuilder:rbac:groups="core",resources=configmaps/status,verbs=get;update;patch;delete // +kubebuilder:rbac:groups="core",resources=configmaps,verbs=get;create;watch;patch;delete;list diff --git a/controllers/dscinitialization/dscinitialization_controller.go b/controllers/dscinitialization/dscinitialization_controller.go index d9a46e1bd26..eb98c44cb2d 100644 --- a/controllers/dscinitialization/dscinitialization_controller.go +++ 
b/controllers/dscinitialization/dscinitialization_controller.go @@ -185,6 +185,7 @@ func (r *DSCInitializationReconciler) Reconcile(ctx context.Context, req ctrl.Re default: // Check namespace is not exist, then create namespace := instance.Spec.ApplicationsNamespace + r.Log.Info("Standard Reconciling workflow to create namespaces") err = r.createOdhNamespace(ctx, instance, namespace) if err != nil { // no need to log error as it was already logged in createOdhNamespace diff --git a/controllers/dscinitialization/monitoring.go b/controllers/dscinitialization/monitoring.go index d9a80f326ad..ea6bd8596b5 100644 --- a/controllers/dscinitialization/monitoring.go +++ b/controllers/dscinitialization/monitoring.go @@ -47,9 +47,18 @@ func (r *DSCInitializationReconciler) configureManagedMonitoring(ctx context.Con } } if initial == "revertbackup" { + // TODO: implement with a better solution + // to have - before component name is to filter out the real rules file line + // e.g line of "workbenches-recording.rules: |" err := common.MatchLineInFile(filepath.Join(prometheusConfigPath, "prometheus-configs.yaml"), map[string]string{ - "*.rules: ": "", + "(.*)-(.*)workbenches(.*).rules": "", + "(.*)-(.*)rhods-dashboard(.*).rules": "", + "(.*)-(.*)codeflare(.*).rules": "", + "(.*)-(.*)data-science-pipelines-operator(.*).rules": "", + "(.*)-(.*)model-mesh(.*).rules": "", + "(.*)-(.*)odh-model-controller(.*).rules": "", + "(.*)-(.*)ray(.*).rules": "", }) if err != nil { r.Log.Error(err, "error to remove previous enabled component rules") diff --git a/controllers/dscinitialization/utils.go b/controllers/dscinitialization/utils.go index c944cd8ebe6..9c026fb5127 100644 --- a/controllers/dscinitialization/utils.go +++ b/controllers/dscinitialization/utils.go @@ -265,9 +265,9 @@ func (r *DSCInitializationReconciler) reconcileDefaultNetworkPolicy(ctx context. }, }, }, - { // OR logic for ROSA + { // OR logic From: []netv1.NetworkPolicyPeer{ - { // need this to access dashboard + { // need this for access dashboard NamespaceSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "kubernetes.io/metadata.name": "openshift-ingress", diff --git a/get_all_manifests.sh b/get_all_manifests.sh index fe6e0e12808..33f66021b2b 100755 --- a/get_all_manifests.sh +++ b/get_all_manifests.sh @@ -11,18 +11,16 @@ MANIFESTS_TARBALL_URL="${GITHUB_URL}/${MANIFEST_ORG}/odh-manifests/tarball/${MAN # component: notebook, dsp, kserve, dashbaord, cf/ray, trustyai, modelmesh. # in the format of "repo-org:repo-name:branch-name:source-folder:target-folder". 
declare -A COMPONENT_MANIFESTS=( - ["codeflare"]="red-hat-data-services:codeflare-operator:rhods-2.4:config:codeflare" - ["ray"]="red-hat-data-services:kuberay:rhods-2.4:ray-operator/config:ray" - ["data-science-pipelines-operator"]="red-hat-data-services:data-science-pipelines-operator:rhods-2.4:config:data-science-pipelines-operator" -# ["odh-dashboard"]="opendatahub-io:odh-dashboard:main:manifests:dashboard" -# ["kf-notebook-controller"]="red-hat-data-services:kubeflow:rhods-2.4:components/notebook-controller/config:odh-notebook-controller/kf-notebook-controller" -# ["odh-notebook-controller"]="red-hat-data-services:kubeflow:rhods-2.4:components/odh-notebook-controller/config:odh-notebook-controller/odh-notebook-controller" - ["notebooks"]="red-hat-data-services:notebooks:rhods-2.4:manifests:/jupyterhub/notebooks" - ["trustyai"]="red-hat-data-services:trustyai-service-operator:rhods-2.4:config:trustyai-service-operator" -# ["model-mesh"]="red-hat-data-services:modelmesh-serving:release-0.11.0:config:model-mesh" -# ["odh-model-controller"]="red-hat-data-services:odh-model-controller:release-0.11.0:config:odh-model-controller" - ["kserve"]="red-hat-data-services:kserve:release-v0.11.0:config:kserve" - + ["codeflare"]="red-hat-data-services:codeflare-operator:rhods-2.5:config:codeflare" + ["ray"]="red-hat-data-services:kuberay:rhods-2.5:ray-operator/config:ray" + ["data-science-pipelines-operator"]="red-hat-data-services:data-science-pipelines-operator:rhods-2.5:config:data-science-pipelines-operator" + ["kf-notebook-controller"]="red-hat-data-services:kubeflow:rhods-2.5:components/notebook-controller/config:odh-notebook-controller/kf-notebook-controller" + ["odh-notebook-controller"]="red-hat-data-services:kubeflow:rhods-2.5:components/odh-notebook-controller/config:odh-notebook-controller/odh-notebook-controller" + ["notebooks"]="red-hat-data-services:notebooks:rhods-2.5:manifests:/jupyterhub/notebooks" + ["trustyai"]="red-hat-data-services:trustyai-service-operator:rhods-2.5:config:trustyai-service-operator" + ["model-mesh"]="red-hat-data-services:modelmesh-serving:rhods-2.5:config:model-mesh" + ["odh-model-controller"]="red-hat-data-services:odh-model-controller:rhods-2.5:config:odh-model-controller" + ["kserve"]="red-hat-data-services:kserve:rhods-2.5:config:kserve" ) # Allow overwriting repo using flags component=repo @@ -54,16 +52,10 @@ rm -fr ./odh-manifests/* ./.odh-manifests-tmp/ mkdir -p ./.odh-manifests-tmp/ ./odh-manifests/ wget -q -c ${MANIFESTS_TARBALL_URL} -O - | tar -zxv -C ./.odh-manifests-tmp/ --strip-components 1 > /dev/null + # mm-monitroing cp -r ./.odh-manifests-tmp/modelmesh-monitoring/ ./odh-manifests -# modelmesh -cp -r ./.odh-manifests-tmp/model-mesh/ ./odh-manifests -cp -r ./.odh-manifests-tmp/odh-model-controller/ ./odh-manifests -cp -r ./.odh-manifests-tmp/modelmesh-monitoring/ ./odh-manifests -# Kserve -cp -r ./.odh-manifests-tmp/kserve/ ./odh-manifests -# workbench nbc -cp -r ./.odh-manifests-tmp/odh-notebook-controller/ ./odh-manifests + # Dashboard cp -r ./.odh-manifests-tmp/odh-dashboard/ ./odh-manifests/dashboard diff --git a/go.mod b/go.mod index 5687f67e33b..d1be95930f0 100644 --- a/go.mod +++ b/go.mod @@ -12,8 +12,8 @@ require ( github.com/openshift/addon-operator/apis v0.0.0-20230919043633-820afed15881 github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790 github.com/openshift/custom-resource-status v1.1.2 - github.com/operator-framework/api v0.17.6 - github.com/operator-framework/operator-lifecycle-manager v0.25.0 + 
github.com/operator-framework/api v0.18.0 + github.com/operator-framework/operator-lifecycle-manager v0.26.0 github.com/pkg/errors v0.9.1 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.69.1 github.com/stretchr/testify v1.8.3 @@ -34,7 +34,7 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/emicklei/go-restful/v3 v3.10.1 // indirect + github.com/emicklei/go-restful/v3 v3.10.2 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect github.com/evanphx/json-patch/v5 v5.6.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect @@ -53,7 +53,7 @@ require ( github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/google/uuid v1.3.1 // indirect - github.com/hashicorp/errwrap v1.0.0 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect github.com/imdario/mergo v0.3.13 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -71,7 +71,7 @@ require ( github.com/rhobs/obo-prometheus-operator/pkg/apis/monitoring v0.61.1-rhobs1 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect github.com/sergi/go-diff v1.2.0 // indirect - github.com/sirupsen/logrus v1.9.0 // indirect + github.com/sirupsen/logrus v1.9.2 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/xlab/treeprint v1.2.0 // indirect go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect diff --git a/go.sum b/go.sum index e29714b9540..860f79606a7 100644 --- a/go.sum +++ b/go.sum @@ -773,8 +773,8 @@ github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkg github.com/emicklei/go-restful v2.9.5+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/emicklei/go-restful v2.15.0+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/emicklei/go-restful/v3 v3.8.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/emicklei/go-restful/v3 v3.10.1 h1:rc42Y5YTp7Am7CS630D7JmhRjq4UlEUuEKfrDac4bSQ= -github.com/emicklei/go-restful/v3 v3.10.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.10.2 h1:hIovbnmBTLjHXkqEBUz3HGpXZdM7ZrE9fJIZIqlJLqE= +github.com/emicklei/go-restful/v3 v3.10.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/go-control-plane v0.10.3/go.mod h1:fJJn/j26vwOu972OllsvAgJJM//w9BV6Fxbg2LuVd34= github.com/envoyproxy/go-control-plane v0.11.0/go.mod h1:VnHyVMpzcLvCFt9yUz1UnCwHLhwx1WguiVDV7pTG/tI= @@ -964,8 +964,9 @@ github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU0GSB0f8Nhgmxx0VIRUvaC0w= -github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod 
h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -1068,10 +1069,10 @@ github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790 h1:e3zIxk67/kiABxGFf github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790/go.mod h1:yimSGmjsI+XF1mr+AKBs2//fSXIOhhetHGbMlBEfXbs= github.com/openshift/custom-resource-status v1.1.2 h1:C3DL44LEbvlbItfd8mT5jWrqPfHnSOQoQf/sypqA6A4= github.com/openshift/custom-resource-status v1.1.2/go.mod h1:DB/Mf2oTeiAmVVX1gN+NEqweonAPY0TKUwADizj8+ZA= -github.com/operator-framework/api v0.17.6 h1:E6+vlvYUKafvoXYtCuHlDZrXX4vl8AT+r93OxNlzjpU= -github.com/operator-framework/api v0.17.6/go.mod h1:l/cuwtPxkVUY7fzYgdust2m9tlmb8I4pOvbsUufRb24= -github.com/operator-framework/operator-lifecycle-manager v0.25.0 h1:Y/ocKKQXxmxxNMH3xIbB0kRjicYIN9cN8ka/DUgjTGQ= -github.com/operator-framework/operator-lifecycle-manager v0.25.0/go.mod h1:0DeNITwrneRQ7b5Qd6Dnp9+CpIBbv3F21RyncsK5ivU= +github.com/operator-framework/api v0.18.0 h1:6EdSNeAjin4LRu2YQnQWMJMc6HXS0AQDG+CfaEvFrAo= +github.com/operator-framework/api v0.18.0/go.mod h1:SCCslqke6AVOJ5JM+NqNE1CHuAgJLScsL66pnPaSMXs= +github.com/operator-framework/operator-lifecycle-manager v0.26.0 h1:16vEJZ5gzMXlcvNlJBnLsEfa+8h5+mvp7/xHwj+u230= +github.com/operator-framework/operator-lifecycle-manager v0.26.0/go.mod h1:uDY8iANE0neSPEYNsT/AE3fpuKS9OhgcKqKEBMhGO6A= github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/phpdave11/gofpdi v1.0.13/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= @@ -1111,8 +1112,8 @@ github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfF github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= -github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= -github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.2 h1:oxx1eChJGI6Uks2ZC4W1zpLlVgqB8ner4EuQwV4Ik1Y= +github.com/sirupsen/logrus v1.9.2/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY520V4= github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= diff --git a/main.go b/main.go index e97dbf1f35d..3a2ff135336 100644 --- a/main.go +++ b/main.go @@ -144,9 +144,10 @@ func main() { } if err = (&datascienceclustercontrollers.DataScienceClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Log: ctrl.Log.WithName("controllers").WithName("DataScienceCluster"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + RestConfig: mgr.GetConfig(), + Log: ctrl.Log.WithName("controllers").WithName("DataScienceCluster"), DataScienceCluster: &datascienceclustercontrollers.DataScienceClusterConfig{ DSCISpec: &dsci.DSCInitializationSpec{ ApplicationsNamespace: dscApplicationsNamespace, @@ -181,7 +182,7 @@ func main() { 
 	// Get operator platform
 	platform, err := deploy.GetPlatform(setupClient)
 	if err != nil {
-		setupLog.Error(err, "error getting client for setup")
+		setupLog.Error(err, "error getting platform")
 		os.Exit(1)
 	}

@@ -194,7 +195,7 @@ func main() {
 	}

 	// Apply update from legacy operator
-	if err = upgrade.UpdateFromLegacyVersion(setupClient, platform); err != nil {
+	if err = upgrade.UpdateFromLegacyVersion(setupClient, platform, dscApplicationsNamespace, dscMonitoringNamespace); err != nil {
 		setupLog.Error(err, "unable to update from legacy operator version")
 	}

diff --git a/pkg/plugins/addLabelsplugin.go b/pkg/plugins/addLabelsplugin.go
index e4cf31883eb..a440210a227 100644
--- a/pkg/plugins/addLabelsplugin.go
+++ b/pkg/plugins/addLabelsplugin.go
@@ -11,6 +11,7 @@ func ApplyAddLabelsPlugin(componentName string, resMap resmap.ResMap) error {
 	nsplug := builtins.LabelTransformerPlugin{
 		Labels: map[string]string{
 			"app.opendatahub.io/" + componentName: "true",
+			"app.kubernetes.io/part-of":           componentName,
 		},
 		FieldSpecs: []types.FieldSpec{
 			{
@@ -18,6 +19,11 @@ func ApplyAddLabelsPlugin(componentName string, resMap resmap.ResMap) error {
 				Path:               "spec/template/metadata/labels",
 				CreateIfNotPresent: true,
 			},
+			{
+				Gvk:                resid.Gvk{Kind: "Deployment"},
+				Path:               "spec/selector/matchLabels",
+				CreateIfNotPresent: true,
+			},
 			{
 				Gvk:  resid.Gvk{},
 				Path: "metadata/labels",
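
The new "spec/selector/matchLabels" FieldSpec above is what later lets the upgrade code tell v2-managed Deployments (whose selectors carry an "app.opendatahub.io/<component>" key) apart from leftover v1 ones. A minimal sketch of that check follows; the helper name is hypothetical, and the shipped code in pkg/upgrade performs the same loop inline:

package main

import (
	"fmt"
	"strings"
)

// hasV2Selector reports whether a selector already carries one of the labels
// injected by ApplyAddLabelsPlugin (for example "app.opendatahub.io/dashboard").
// Illustrative helper only; pkg/upgrade does this check inline.
func hasV2Selector(matchLabels map[string]string) bool {
	for label := range matchLabels {
		if strings.Contains(label, "app.opendatahub.io/") {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(hasV2Selector(map[string]string{"app.opendatahub.io/dashboard": "true"})) // true: v2-managed
	fmt.Println(hasV2Selector(map[string]string{"app": "rhods-dashboard"}))               // false: legacy v1 selector
}
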
diff --git a/pkg/upgrade/upgrade.go b/pkg/upgrade/upgrade.go
index a7896c93383..d930fcccaea 100644
--- a/pkg/upgrade/upgrade.go
+++ b/pkg/upgrade/upgrade.go
@@ -5,17 +5,21 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	// "reflect"
 	"strings"
 	"time"

+	"github.com/hashicorp/go-multierror"
 	operatorv1 "github.com/openshift/api/operator/v1"
 	ofapi "github.com/operator-framework/api/pkg/operators/v1alpha1"
 	olmclientset "github.com/operator-framework/operator-lifecycle-manager/pkg/api/client/clientset/versioned/typed/operators/v1alpha1"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	apierrs "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/rest"

 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -257,30 +261,69 @@ func CreateDefaultDSCI(cli client.Client, platform deploy.Platform, appNamespace
 	return nil
 }

-func UpdateFromLegacyVersion(cli client.Client, platform deploy.Platform) error {
+func UpdateFromLegacyVersion(cli client.Client, platform deploy.Platform, appNS string, montNamespace string) error {
 	// If platform is Managed, remove Kfdefs and create default dsc
 	if platform == deploy.ManagedRhods {
-		err := CreateDefaultDSC(cli, platform)
-		if err != nil {
+		fmt.Println("starting deletion of Deployments in managed cluster")
+		if err := deleteResource(cli, appNS, "deployment"); err != nil {
+			return err
+		}
+		// this is for the modelmesh monitoring part from v1 to v2
+		if err := deleteResource(cli, montNamespace, "deployment"); err != nil {
+			return err
+		}
+		if err := deleteResource(cli, montNamespace, "statefulset"); err != nil {
+			return err
+		}
+		if err := CreateDefaultDSC(cli, platform); err != nil {
 			return err
 		}
-		err = RemoveKfDefInstances(cli, platform)
-		if err != nil {
+		if err := RemoveKfDefInstances(cli, platform); err != nil {
 			return err
 		}
+		return nil
 	}

 	if platform == deploy.SelfManagedRhods {
-		kfDefList, err := getKfDefInstances(cli)
-		if err != nil {
-			return fmt.Errorf("error getting kfdef instances: %v", err)
+		fmt.Println("starting deletion of Deployments in self-managed cluster")
+		// If the KfDef CRD is not found, the v1 operator was never installed on this cluster; check whether any KfDefs are deployed
+		kfdefCrd := &apiextv1.CustomResourceDefinition{}
+		if err := cli.Get(context.TODO(), client.ObjectKey{Name: "kfdefs.kfdef.apps.kubeflow.org"}, kfdefCrd); err != nil {
+			if apierrs.IsNotFound(err) {
+				// If no CRD found, return: it's a new installation
+				// with nothing to migrate
+				return nil
+			} else {
+				return fmt.Errorf("error retrieving kfdef CRD : %v", err)
+			}
 		}
+		// If KfDef instances are found and no DSC instance exists in Self-managed, this is an upgrade path from
+		// the legacy version. Create a default DSC instance
+		kfDefList := &kfdefv1.KfDefList{}
+		err := cli.List(context.TODO(), kfDefList)
+		if err != nil {
+			if apierrs.IsNotFound(err) {
+				// If no KfDefs, do nothing and return
+				return nil
+			} else {
+				return fmt.Errorf("error getting kfdef instances: %w", err)
+			}
+		}
 		if len(kfDefList.Items) > 0 {
-			err := CreateDefaultDSC(cli, platform)
-			if err != nil {
+			if err = deleteResource(cli, appNS, "deployment"); err != nil {
+				return fmt.Errorf("error deleting deployment: %w", err)
+			}
+			// this is for the modelmesh monitoring part from v1 to v2
+			if err := deleteResource(cli, montNamespace, "deployment"); err != nil {
+				return err
+			}
+			if err := deleteResource(cli, montNamespace, "statefulset"); err != nil {
+				return err
+			}
+			if err = CreateDefaultDSC(cli, platform); err != nil {
 				return err
 			}
 		}
@@ -301,12 +344,28 @@ func GetOperatorNamespace() (string, error) {

 func RemoveKfDefInstances(cli client.Client, platform deploy.Platform) error {
 	// Check if kfdef are deployed
-	expectedKfDefList, err := getKfDefInstances(cli)
+	kfdefCrd := &apiextv1.CustomResourceDefinition{}
+
+	err := cli.Get(context.TODO(), client.ObjectKey{Name: "kfdefs.kfdef.apps.kubeflow.org"}, kfdefCrd)
 	if err != nil {
-		return err
-	}
-	// Delete kfdefs
-	if len(expectedKfDefList.Items) > 0 {
+		if apierrs.IsNotFound(err) {
+			// If no CRD found, return: it's a new installation
+			return nil
+		} else {
+			return fmt.Errorf("error retrieving kfdef CRD : %v", err)
+		}
+	} else {
+		expectedKfDefList := &kfdefv1.KfDefList{}
+		err := cli.List(context.TODO(), expectedKfDefList)
+		if err != nil {
+			if apierrs.IsNotFound(err) {
+				// If no KfDefs, do nothing and return
+				return nil
+			} else {
+				return fmt.Errorf("error getting list of kfdefs: %v", err)
+			}
+		}
+		// Delete kfdefs
 		for _, kfdef := range expectedKfDefList.Items {
 			// Remove finalizer
 			updatedKfDef := &kfdef
@@ -321,7 +380,6 @@ func RemoveKfDefInstances(cli client.Client, platform deploy.Platform) error {
 			}
 		}
 	}
-	return nil
 }
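
Both rewritten functions above open with the same probe: fetch the kfdefs.kfdef.apps.kubeflow.org CRD and treat NotFound as a fresh installation with nothing to migrate. A minimal sketch of that pattern, reusing the imports already present in pkg/upgrade/upgrade.go; the helper name is illustrative and not part of the patch:

import (
	"context"
	"fmt"

	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	apierrs "k8s.io/apimachinery/pkg/api/errors"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// legacyKfDefInstalled reports whether the v1 (KfDef-based) operator left its CRD on the cluster.
// Illustrative helper only; the shipped functions inline this check.
func legacyKfDefInstalled(ctx context.Context, cli client.Client) (bool, error) {
	kfdefCrd := &apiextv1.CustomResourceDefinition{}
	err := cli.Get(ctx, client.ObjectKey{Name: "kfdefs.kfdef.apps.kubeflow.org"}, kfdefCrd)
	switch {
	case apierrs.IsNotFound(err):
		return false, nil // fresh install: no legacy resources to clean up
	case err != nil:
		return false, fmt.Errorf("error retrieving kfdef CRD: %w", err)
	default:
		return true, nil
	}
}
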
@@ -376,30 +434,140 @@ func getClusterServiceVersion(cfg *rest.Config, watchNameSpace string) (*ofapi.C
 	return nil, nil
 }

-func getKfDefInstances(c client.Client) (*kfdefv1.KfDefList, error) {
-	// If KfDef CRD is not found, we see it as a cluster not pre-installed v1 operator // Check if kfdef are deployed
-	kfdefCrd := &apiextv1.CustomResourceDefinition{}
-	if err := c.Get(context.TODO(), client.ObjectKey{Name: "kfdefs.kfdef.apps.kubeflow.org"}, kfdefCrd); err != nil {
-		if apierrs.IsNotFound(err) {
-			// If no Crd found, return, since its a new Installation
-			// return empty list
-			return &kfdefv1.KfDefList{}, nil
+func deleteResource(cli client.Client, namespace string, resourceType string) error {
+	// In v2, Deployment selectors use a label "app.opendatahub.io/" which is
+	// not present in v1. Since label selectors are immutable, we need to delete the existing
+	// deployments and recreate them.
+	// Because we can't proceed while a deployment is not yet deleted, we use exponential backoff
+	// to retry the deletion until it succeeds
+	var err error
+	switch resourceType {
+	case "deployment":
+		err = wait.ExponentialBackoffWithContext(context.TODO(), wait.Backoff{
+			// 5, 10, 20, 40, then timeout
+			Duration: 5 * time.Second,
+			Factor:   2.0,
+			Jitter:   0.1,
+			Steps:    4,
+			Cap:      1 * time.Minute,
+		}, func(ctx context.Context) (bool, error) {
+			done, err := deleteDeploymentsAndCheck(ctx, cli, namespace)
+			return done, err
+		})
+	case "statefulset":
+		err = wait.ExponentialBackoffWithContext(context.TODO(), wait.Backoff{
+			// 10, 20, then timeout
+			Duration: 10 * time.Second,
+			Factor:   2.0,
+			Jitter:   0.1,
+			Steps:    2,
+			Cap:      1 * time.Minute,
+		}, func(ctx context.Context) (bool, error) {
+			done, err := deleteStatefulsetsAndCheck(ctx, cli, namespace)
+			return done, err
+		})
+	}
+	return err
+}
+
+func deleteDeploymentsAndCheck(ctx context.Context, cli client.Client, namespace string) (bool, error) { //nolint
+	// Delete Deployment objects
+	var multiErr *multierror.Error
+	deployments := &appsv1.DeploymentList{}
+	listOpts := &client.ListOptions{
+		Namespace: namespace,
+	}
+
+	if err := cli.List(ctx, deployments, listOpts); err != nil {
+		return false, nil
+	}
+	// skip deployments that already carry the new label so we do not delete more than necessary
+	// this logic is safe to keep even for a v2.4 to v2.5 upgrade
+	markedForDeletion := []appsv1.Deployment{}
+	for _, deployment := range deployments.Items {
+		v2 := false
+		selectorLabels := deployment.Spec.Selector.MatchLabels
+		for label := range selectorLabels {
+			if strings.Contains(label, "app.opendatahub.io/") {
+				// this deployment already has the new label: this is a v2-to-v2 upgrade
+				// and there is no need to recreate it, as the labels match
+				v2 = true
+				continue
+			}
+		}
+		if !v2 {
+			markedForDeletion = append(markedForDeletion, deployment)
+			multiErr = multierror.Append(multiErr, cli.Delete(ctx, &deployment))
+		}
+	}
+
+	for _, deployment := range markedForDeletion {
+		if e := cli.Get(ctx, client.ObjectKey{
+			Namespace: namespace,
+			Name:      deployment.Name,
+		}, &deployment); e != nil {
+			if apierrs.IsNotFound(e) {
+				// resource has been successfully deleted
+				continue
+			} else {
+				// unexpected error, report it
+				multiErr = multierror.Append(multiErr, e)
+			}
 		} else {
-			return nil, fmt.Errorf("error retrieving kfdef CRD : %v", err)
+			// resource still exists, wait for it to be deleted
+			return false, nil
+		}
 	}
-	// If KfDef Instances found, and no DSC instances are found in Self-managed, that means this is an upgrade path from
-	// legacy version. Create a default DSC instance
-	kfDefList := &kfdefv1.KfDefList{}
-	err := c.List(context.TODO(), kfDefList)
-	if err != nil {
-		if apierrs.IsNotFound(err) {
-			// If no KfDefs, do nothing and return
-			return nil, nil
+	return true, multiErr.ErrorOrNil()
+}
+
+func deleteStatefulsetsAndCheck(ctx context.Context, cli client.Client, namespace string) (bool, error) { //nolint
+	// Delete StatefulSet objects
+	var multiErr *multierror.Error
+	statefulsets := &appsv1.StatefulSetList{}
+	listOpts := &client.ListOptions{
+		Namespace: namespace,
+	}
+
+	if err := cli.List(ctx, statefulsets, listOpts); err != nil {
+		return false, nil
+	}
+
+	// even if there is only one item to delete, range over the list to avoid a nil pointer dereference
+	markedForDeletion := []appsv1.StatefulSet{}
+	for _, statefulset := range statefulsets.Items {
+		v2 := false
+		selectorLabels := statefulset.Spec.Selector.MatchLabels
+		for label := range selectorLabels {
+			if strings.Contains(label, "app.opendatahub.io/") {
+				v2 = true
+				continue
+			}
+		}
+		if !v2 {
+			markedForDeletion = append(markedForDeletion, statefulset)
+			multiErr = multierror.Append(multiErr, cli.Delete(ctx, &statefulset))
+		}
+	}
+
+	for _, statefulset := range markedForDeletion {
+		if e := cli.Get(ctx, client.ObjectKey{
+			Namespace: namespace,
+			Name:      statefulset.Name,
+		}, &statefulset); e != nil {
+			if apierrs.IsNotFound(e) {
+				// resource has been successfully deleted
+				continue
+			} else {
+				// unexpected error, report it
+				multiErr = multierror.Append(multiErr, e)
+			}
 		} else {
-			return nil, fmt.Errorf("error getting list of kfdefs: %v", err)
+			// resource still exists, wait for it to be deleted
+			return false, nil
 		}
 	}
-	return kfDefList, nil
+
+	return true, multiErr.ErrorOrNil()
 }
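
The retry behaviour in deleteResource comes straight from the k8s.io/apimachinery wait package. A standalone sketch of the same schedule, assuming the apimachinery version this patch builds against (the stubbed condition stands in for deleteDeploymentsAndCheck and is not part of the patch):

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	attempt := 0
	// Same Backoff as the "deployment" case above: exponentially growing waits
	// starting at 5s, doubling each step with a little jitter, capped at one minute;
	// once the steps are exhausted wait returns a timeout error.
	err := wait.ExponentialBackoffWithContext(context.TODO(), wait.Backoff{
		Duration: 5 * time.Second,
		Factor:   2.0,
		Jitter:   0.1,
		Steps:    4,
		Cap:      1 * time.Minute,
	}, func(ctx context.Context) (bool, error) {
		attempt++
		fmt.Println("attempt", attempt)
		return attempt >= 3, nil // stand-in condition: "done" on the third try
	})
	fmt.Println("finished with err:", err)
}
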