Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: add labels to metrics #1433

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions controllers/secretproviderclasspodstatus_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type SecretProviderClassPodStatusReconciler struct {
writer client.Writer
eventRecorder record.EventRecorder
driverName string
reporter StatsReporter
}

// New creates a new SecretProviderClassPodStatusReconciler
Expand All @@ -73,6 +74,10 @@ func New(driverName string, mgr manager.Manager, nodeID string) (*SecretProvider
kubeClient := kubernetes.NewForConfigOrDie(mgr.GetConfig())
eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "csi-secrets-store-controller"})
sr, err := newStatsReporter()
if err != nil {
return nil, err
}

return &SecretProviderClassPodStatusReconciler{
Client: mgr.GetClient(),
Expand All @@ -83,6 +88,7 @@ func New(driverName string, mgr manager.Manager, nodeID string) (*SecretProvider
writer: mgr.GetClient(),
eventRecorder: recorder,
driverName: driverName,
reporter: sr,
}, nil
}

Expand Down Expand Up @@ -266,6 +272,9 @@ func (r *SecretProviderClassPodStatusReconciler) Reconcile(ctx context.Context,
return ctrl.Result{}, nil
}

// if SecretObjects defined in the SPC, record the time to report sync_k8s_secret_duration_sec metric
begin := time.Now()

// determine which pod volume this is associated with
podVol := k8sutil.SPCVolume(pod, r.driverName, spc.Name)
if podVol == nil {
Expand Down Expand Up @@ -365,6 +374,9 @@ func (r *SecretProviderClassPodStatusReconciler) Reconcile(ctx context.Context,
return ctrl.Result{Requeue: true}, nil
}

r.reporter.ReportSyncSecretCtMetric(ctx, string(spc.Spec.Provider), spcPodStatus.Namespace, spc.Name)
r.reporter.ReportSyncSecretDuration(ctx, time.Since(begin).Seconds())

klog.InfoS("reconcile complete", "spc", klog.KObj(spc), "pod", klog.KObj(pod), "spcps", klog.KObj(spcPodStatus))
// requeue the spc pod status again after 5mins to check if secret and ownerRef exists
// and haven't been modified. If secret doesn't exist, then this requeue will ensure it's
Expand Down
80 changes: 80 additions & 0 deletions controllers/stats_reporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"context"
"runtime"

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/metric/global"
)

// scope is the instrumentation scope name passed to the global meter provider
// when creating the instruments in this file.
const scope = "sigs.k8s.io/secrets-store-csi-driver"

// Attribute keys (and the fixed os_type value) attached to the metrics
// recorded by this file. They are declared as constants because nothing
// reassigns them; runtime.GOOS is itself a compile-time constant.
const (
	// providerKey labels a metric with the SecretProviderClass provider name.
	providerKey = "provider"
	// osTypeKey labels a metric with the operating system the binary was built for.
	osTypeKey = "os_type"
	// runtimeOS is the value recorded under osTypeKey.
	runtimeOS = runtime.GOOS
	// namespaceKey labels a metric with a Kubernetes namespace.
	namespaceKey = "namespace"
	// spcKey labels a metric with the SecretProviderClass name.
	spcKey = "secret_provider_class"
)

// reporter holds the metric instruments created by newStatsReporter and is
// the value that function returns as a StatsReporter.
type reporter struct {
	// syncK8sSecretTotal is the counter registered as "sync_k8s_secret".
	syncK8sSecretTotal metric.Int64Counter
	// syncK8sSecretDuration is the histogram registered as
	// "sync_k8s_secret_duration_sec".
	syncK8sSecretDuration metric.Float64Histogram
}

// StatsReporter records metrics about syncing Kubernetes secrets.
type StatsReporter interface {
	// ReportSyncSecretCtMetric records one secret sync for the given provider,
	// namespace and SecretProviderClass name.
	ReportSyncSecretCtMetric(ctx context.Context, provider string, namespace string, spc string)
	// ReportSyncSecretDuration records how long a secret sync took. The
	// reconciler passes the elapsed time in seconds.
	ReportSyncSecretDuration(ctx context.Context, duration float64)
}

// newStatsReporter creates the sync-secret counter and duration histogram on
// the global meter for this package's scope and returns them wrapped in a
// StatsReporter. If either instrument cannot be created, the error from the
// meter is returned unchanged and the reporter is nil.
func newStatsReporter() (StatsReporter, error) {
	meter := global.Meter(scope)

	total, err := meter.Int64Counter(
		"sync_k8s_secret",
		metric.WithDescription("Total number of k8s secrets synced"),
	)
	if err != nil {
		return nil, err
	}

	duration, err := meter.Float64Histogram(
		"sync_k8s_secret_duration_sec",
		metric.WithDescription("Distribution of how long it took to sync k8s secret"),
	)
	if err != nil {
		return nil, err
	}

	return &reporter{
		syncK8sSecretTotal:    total,
		syncK8sSecretDuration: duration,
	}, nil
}

func (r reporter) ReportSyncSecretCtMetric(ctx context.Context, provider, namespace, spc string) {
opt := metric.WithAttributes(
attribute.Key(providerKey).String(provider),
attribute.Key(osTypeKey).String(runtimeOS),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
attribute.Key(osTypeKey).String(runtimeOS),

The os key attribute is not relevant for the sync controller because it's a Kubernetes API object. The only reason it's used in driver metrics is to show the os runtime for the mount operation.

attribute.Key(namespaceKey).String(namespace),
attribute.Key(spcKey).String(spc),
)
r.syncK8sSecretTotal.Add(ctx, 1, opt)
}

func (r reporter) ReportSyncSecretDuration(ctx context.Context, duration float64) {
opt := metric.WithAttributes(
attribute.Key(osTypeKey).String(runtimeOS),
Comment on lines +76 to +77
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as above

)
r.syncK8sSecretDuration.Record(ctx, duration, opt)
}
76 changes: 51 additions & 25 deletions docs/book/src/topics/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,17 @@ Prometheus is the only exporter that's currently supported with the driver.

## List of metrics provided by the driver

| Metric | Description | Tags |
| ------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
| total_node_publish | Total number of successful volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>` |
| total_node_unpublish | Total number of successful volume unmount requests | `os_type=<runtime os>` |
| total_node_publish_error | Total number of errors with volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`error_type=<error code>` |
| total_node_unpublish_error | Total number of errors with volume unmount requests | `os_type=<runtime os>` |
| total_sync_k8s_secret | Total number of k8s secrets synced | `os_type=<runtime os>`<br>`provider=<provider name>` |
| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=<runtime os>` |
| total_rotation_reconcile | Total number of rotation reconciles | `os_type=<runtime os>`<br>`rotated=<true or false>` |
| total_rotation_reconcile_error | Total number of rotation reconciles with error | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`error_type=<error code>` |
| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=<runtime os>` |
| Metric | Description | Tags |
|---------------------------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| node_publish_total | Total number of successful volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| node_unpublish_total | Total number of successful volume unmount requests | `os_type=<runtime os>` |
| node_publish_error_total | Total number of errors with volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`error_type=<error code>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| node_unpublish_error_total | Total number of errors with volume unmount requests | `os_type=<runtime os>` |
| sync_k8s_secret_total | Total number of k8s secrets synced | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`namespace=<namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=<runtime os>` |
| rotation_reconcile_total | Total number of rotation reconciles | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| rotation_reconcile_error_total | Total number of rotation reconciles with error | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`error_type=<error code>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=<runtime os>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |

Metrics are served from port 8095, but this port is not exposed outside the pod by default. Use kubectl port-forward to access the metrics over localhost:

Expand Down Expand Up @@ -47,17 +47,43 @@ sync_k8s_secret_duration_sec_bucket{os_type="linux",le="30"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="+Inf"} 1
sync_k8s_secret_duration_sec_sum{os_type="linux"} 0.3115892
sync_k8s_secret_duration_sec_count{os_type="linux"} 1
# HELP total_node_publish Total number of node publish calls
# TYPE total_node_publish counter
total_node_publish{os_type="linux",provider="azure"} 1
# HELP total_node_publish_error Total number of node publish calls with error
# TYPE total_node_publish_error counter
total_node_publish_error{error_type="ProviderBinaryNotFound",os_type="linux",provider="azure"} 2
total_node_publish_error{error_type="SecretProviderClassNotFound",os_type="linux",provider=""} 4
# HELP total_node_unpublish Total number of node unpublish calls
# TYPE total_node_unpublish counter
total_node_unpublish{os_type="linux"} 1
# HELP total_sync_k8s_secret Total number of k8s secrets synced
# TYPE total_sync_k8s_secret counter
total_sync_k8s_secret{os_type="linux",provider="azure"} 1
```

# HELP sync_k8s_secret_total Total number of k8s secrets synced
# TYPE sync_k8s_secret_total counter
sync_k8s_secret_total{namespace="csi-test-secret-ns",os_type="linux",provider="azure",secret_provider_class="csi-test-spc"} 1

# HELP rotation_reconcile_duration_sec Distribution of how long it took to rotate secrets-store content for pods
# TYPE rotation_reconcile_duration_sec histogram
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.1"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.2"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.3"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.4"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="1"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="1.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="2"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="2.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="3"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="10"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="15"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="30"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="+Inf"} 1
rotation_reconcile_duration_sec_sum{os_type="linux"} 0.3115892
rotation_reconcile_duration_sec_count{os_type="linux"} 1
# HELP rotation_reconcile_total Total number of rotation reconciles
# TYPE rotation_reconcile_total counter
rotation_reconcile_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 1
# HELP rotation_reconcile_error_total Total number of rotation reconciles with error
# TYPE rotation_reconcile_error_total counter
rotation_reconcile_error_total{error_type="GRPCProviderError",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 12
# HELP node_publish_total Total number of node publish calls
# TYPE node_publish_total counter
node_publish_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 1
# HELP node_publish_error_total Total number of node publish calls with error
# TYPE node_publish_error_total counter
node_publish_error_total{error_type="ProviderBinaryNotFound",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 7
# HELP node_unpublish_total Total number of node unpublish calls
# TYPE node_unpublish_total counter
node_unpublish_total{os_type="linux"} 1
```
33 changes: 18 additions & 15 deletions pkg/rotation/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,13 +251,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
// after the provider mount request is complete
var requiresUpdate bool
var providerName string
podName := spcps.Status.PodName
Suraiya-Hameed marked this conversation as resolved.
Show resolved Hide resolved
podNamespace := spcps.Namespace
secretProviderClass := spcps.Status.SecretProviderClassName
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
secretProviderClass := spcps.Status.SecretProviderClassName
secretProviderClassName := spcps.Status.SecretProviderClassName


defer func() {
if err != nil {
r.reporter.reportRotationErrorCtMetric(ctx, providerName, errorReason, requiresUpdate)
r.reporter.reportRotationErrorCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, errorReason, requiresUpdate)
return
}
r.reporter.reportRotationCtMetric(ctx, providerName, requiresUpdate)
r.reporter.reportRotationCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, requiresUpdate)
r.reporter.reportRotationDuration(ctx, time.Since(begin).Seconds())
}()

Expand All @@ -266,14 +269,14 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
err = r.cache.Get(
ctx,
client.ObjectKey{
Namespace: spcps.Namespace,
Name: spcps.Status.PodName,
Namespace: podNamespace,
Name: podName,
},
pod,
)
if err != nil {
errorReason = internalerrors.PodNotFound
return fmt.Errorf("failed to get pod %s/%s, err: %w", spcps.Namespace, spcps.Status.PodName, err)
return fmt.Errorf("failed to get pod %s/%s, err: %w", podNamespace, podName, err)
}
// skip rotation if the pod is being terminated
// or the pod is in succeeded state (for jobs that complete aren't gc yet)
Expand All @@ -289,14 +292,14 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
err = r.cache.Get(
ctx,
client.ObjectKey{
Namespace: spcps.Namespace,
Name: spcps.Status.SecretProviderClassName,
Namespace: podNamespace,
Name: secretProviderClass,
},
spc,
)
if err != nil {
errorReason = internalerrors.SecretProviderClassNotFound
return fmt.Errorf("failed to get secret provider class %s/%s, err: %w", spcps.Namespace, spcps.Status.SecretProviderClassName, err)
return fmt.Errorf("failed to get secret provider class %s/%s, err: %w", podNamespace, secretProviderClass, err)
}

// determine which pod volume this is associated with
Expand Down Expand Up @@ -359,16 +362,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
// This comprises the secret parameter in the MountRequest to the provider
if nodePublishSecretRef != nil {
// read secret from the informer cache
secret, err := r.secretStore.GetNodePublishSecretRefSecret(nodePublishSecretRef.Name, spcps.Namespace)
secret, err := r.secretStore.GetNodePublishSecretRefSecret(nodePublishSecretRef.Name, podNamespace)
if err != nil {
if apierrors.IsNotFound(err) {
klog.ErrorS(err,
fmt.Sprintf("nodePublishSecretRef not found. If the secret with name exists in namespace, label the secret by running 'kubectl label secret %s %s=true -n %s", nodePublishSecretRef.Name, controllers.SecretUsedLabel, spcps.Namespace),
"name", nodePublishSecretRef.Name, "namespace", spcps.Namespace)
fmt.Sprintf("nodePublishSecretRef not found. If the secret with name exists in namespace, label the secret by running 'kubectl label secret %s %s=true -n %s", nodePublishSecretRef.Name, controllers.SecretUsedLabel, podNamespace),
"name", nodePublishSecretRef.Name, "namespace", podNamespace)
}
errorReason = internalerrors.NodePublishSecretRefNotFound
r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("failed to get node publish secret %s/%s, err: %+v", spcps.Namespace, nodePublishSecretRef.Name, err))
return fmt.Errorf("failed to get node publish secret %s/%s, err: %w", spcps.Namespace, nodePublishSecretRef.Name, err)
r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("failed to get node publish secret %s/%s, err: %+v", podNamespace, nodePublishSecretRef.Name, err))
return fmt.Errorf("failed to get node publish secret %s/%s, err: %w", podNamespace, nodePublishSecretRef.Name, err)
}

for k, v := range secret.Data {
Expand Down Expand Up @@ -401,7 +404,7 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
newObjectVersions, errorReason, err := secretsstore.MountContent(ctx, providerClient, string(paramsJSON), string(secretsJSON), spcps.Status.TargetPath, string(permissionJSON), oldObjectVersions)
if err != nil {
r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("provider mount err: %+v", err))
return fmt.Errorf("failed to rotate objects for pod %s/%s, err: %w", spcps.Namespace, spcps.Status.PodName, err)
return fmt.Errorf("failed to rotate objects for pod %s/%s, err: %w", podNamespace, podName, err)
}

// compare the old object versions and new object versions to check if any of the objects
Expand Down Expand Up @@ -488,7 +491,7 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret

patchFn := func() (bool, error) {
// patch secret data with the new contents
if err := r.patchSecret(ctx, secretObj.SecretName, spcps.Namespace, datamap); err != nil {
if err := r.patchSecret(ctx, secretObj.SecretName, podNamespace, datamap); err != nil {
// syncSecret.enabled is set to false by default in the helm chart for installing the driver in v0.0.23+
// that would result in a forbidden error, so generate a warning that can be helpful for debugging
if apierrors.IsForbidden(err) {
Expand Down
Loading
Loading