Add maintenance job
Signed-off-by: Ming Qiu <mqiu@vmware.com>
qiuming-best committed Feb 21, 2024
1 parent 2a1ae0e commit d6fd309
Showing 9 changed files with 581 additions and 41 deletions.
145 changes: 145 additions & 0 deletions pkg/cmd/server/repomaintenance/repo_maintenance.go
@@ -0,0 +1,145 @@
package repomaintenance

import (
	"context"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/sirupsen/logrus"
	"github.com/spf13/cobra"
	"github.com/spf13/pflag"
	"github.com/vmware-tanzu/velero/internal/credentials"
	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	velerocli "github.com/vmware-tanzu/velero/pkg/client"
	"github.com/vmware-tanzu/velero/pkg/repository"
	"github.com/vmware-tanzu/velero/pkg/repository/provider"
	"github.com/vmware-tanzu/velero/pkg/util/filesystem"
	"github.com/vmware-tanzu/velero/pkg/util/logging"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

type Options struct {
	VolumeNamespace          string
	BackupStorageLocation    string
	RepoType                 string
	KeepLatestMaitenanceJobs int
	LogLevelFlag             *logging.LevelFlag
	FormatFlag               *logging.FormatFlag
}

func (o *Options) BindFlags(flags *pflag.FlagSet) {
	flags.StringVar(&o.VolumeNamespace, "repo-name", "", "namespace of the pod/volume that the snapshot is for")
	flags.StringVar(&o.BackupStorageLocation, "backup-storage-location", "", "backup's storage location name")
	flags.StringVar(&o.RepoType, "repo-type", velerov1api.BackupRepositoryTypeKopia, "type of the repository where the snapshot is stored")
	flags.Var(o.LogLevelFlag, "log-level", fmt.Sprintf("The level at which to log. Valid values are %s.", strings.Join(o.LogLevelFlag.AllowedValues(), ", ")))
	flags.Var(o.FormatFlag, "log-format", fmt.Sprintf("The format for log output. Valid values are %s.", strings.Join(o.FormatFlag.AllowedValues(), ", ")))
}

func NewCommand(f velerocli.Factory) *cobra.Command {
	o := &Options{
		LogLevelFlag: logging.LogLevelFlag(logrus.InfoLevel),
		FormatFlag:   logging.NewFormatFlag(),
	}
	cmd := &cobra.Command{
		Use:    "repo-maintenance",
		Hidden: true,
		Short:  "VELERO SERVER COMMAND ONLY - not intended to be run directly by users",
		Run: func(c *cobra.Command, args []string) {
			o.Run(f)
		},
	}

	o.BindFlags(cmd.Flags())
	return cmd
}

func checkError(err error, log logrus.FieldLogger) {
	if err != nil {
		if err != context.Canceled {
			log.Errorf("An error occurred: %v", err)
		}
		os.Exit(1)
	}
}

func (o *Options) Run(f velerocli.Factory) {
	log.SetOutput(os.Stdout)
	logrus.SetOutput(os.Stdout)
	logger := logging.DefaultLogger(o.LogLevelFlag.Parse(), o.FormatFlag.Parse())

	// Write fatal errors to the pod's termination log so they can be surfaced from the Job.
	errorFile, err := os.Create("/dev/termination-log")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to create termination log file: %v\n", err)
		return
	}
	defer errorFile.Close()

	logger.AddHook(&logging.FileHook{File: errorFile})

	scheme := runtime.NewScheme()
	err = velerov1api.AddToScheme(scheme)
	checkError(err, logger)

	err = v1.AddToScheme(scheme)
	checkError(err, logger)

	config, err := f.ClientConfig()
	checkError(err, logger)

	cli, err := client.New(config, client.Options{
		Scheme: scheme,
	})
	checkError(err, logger)

	credentialFileStore, err := credentials.NewNamespacedFileStore(
		cli,
		f.Namespace(),
		"/tmp/credentials",
		filesystem.NewFileSystem(),
	)
	checkError(err, logger)

	credentialSecretStore, err := credentials.NewNamespacedSecretStore(cli, f.Namespace())
	checkError(err, logger)

	var repoProvider provider.Provider
	if o.RepoType == velerov1api.BackupRepositoryTypeRestic {
		repoProvider = provider.NewResticRepositoryProvider(credentialFileStore, filesystem.NewFileSystem(), logger)
	} else {
		repoProvider = provider.NewUnifiedRepoProvider(
			credentials.CredentialGetter{
				FromFile:   credentialFileStore,
				FromSecret: credentialSecretStore,
			}, o.RepoType, cli, logger)
	}

	// Look up the BackupRepository CR for the given volume namespace, storage location and repository type.
	repo, err := repository.GetBackupRepository(context.Background(), cli, f.Namespace(),
		repository.BackupRepositoryKey{
			VolumeNamespace: o.VolumeNamespace,
			BackupLocation:  o.BackupStorageLocation,
			RepositoryType:  o.RepoType,
		}, true)
	checkError(err, logger)

	// Fetch the BackupStorageLocation referenced by the repository.
	bsl := &velerov1api.BackupStorageLocation{}
	err = cli.Get(context.Background(), client.ObjectKey{Namespace: f.Namespace(), Name: repo.Spec.BackupStorageLocation}, bsl)
	checkError(err, logger)

	para := provider.RepoParam{
		BackupRepo:     repo,
		BackupLocation: bsl,
	}

	err = repoProvider.BoostRepoConnect(context.Background(), para)
	checkError(err, logger)

	err = repoProvider.PruneRepo(context.Background(), para)
	checkError(err, logger)
}
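
Note: this hidden repo-maintenance subcommand is intended to be invoked by the Velero server from inside a maintenance Job's pod, not by users. As a rough illustration of that wiring (the actual Job construction lives in pkg/repository and is not among the files shown here), the container arguments such a Job would pass could be built as in the hypothetical sketch below; the flag names come from Options.BindFlags above, and --repo-name carries the repository's volume namespace, exactly as BindFlags binds it.

package repomaintenance

import (
	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
)

// buildMaintenanceJobArgs is an illustrative helper only (not part of this commit):
// it shows the argument list the hidden subcommand expects. Flag names are taken
// from Options.BindFlags above; the function name and the fixed log level are assumptions.
func buildMaintenanceJobArgs(repo *velerov1api.BackupRepository) []string {
	return []string{
		"repo-maintenance",
		// --repo-name carries the repository's volume namespace, matching BindFlags.
		"--repo-name=" + repo.Spec.VolumeNamespace,
		"--repo-type=" + repo.Spec.RepositoryType,
		"--backup-storage-location=" + repo.Spec.BackupStorageLocation,
		"--log-level=info",
	}
}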
62 changes: 43 additions & 19 deletions pkg/cmd/server/server.go
@@ -32,24 +32,6 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
corev1api "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/discovery"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/utils/clock"
ctrl "sigs.k8s.io/controller-runtime"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"

"github.com/vmware-tanzu/velero/internal/credentials"
"github.com/vmware-tanzu/velero/internal/storage"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
@@ -58,6 +40,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/buildinfo"
"github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/cmd"
"github.com/vmware-tanzu/velero/pkg/cmd/server/repomaintenance"
"github.com/vmware-tanzu/velero/pkg/cmd/util/flag"
"github.com/vmware-tanzu/velero/pkg/cmd/util/signals"
"github.com/vmware-tanzu/velero/pkg/controller"
@@ -77,6 +60,25 @@ import (
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/logging"
appsv1 "k8s.io/api/apps/v1"
batchv1api "k8s.io/api/batch/v1"
corev1api "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/discovery"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/utils/clock"
ctrl "sigs.k8s.io/controller-runtime"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
)

const (
@@ -135,6 +137,7 @@ type serverConfig struct {
	defaultSnapshotMoveData bool
	disableInformerCache    bool
	scheduleSkipImmediately bool
	maintenanceCfg          repository.MaintenanceConfig
}

func NewCommand(f client.Factory) *cobra.Command {
@@ -166,6 +169,9 @@ func NewCommand(f client.Factory) *cobra.Command {
			defaultSnapshotMoveData: false,
			disableInformerCache:    defaultDisableInformerCache,
			scheduleSkipImmediately: false,
			maintenanceCfg: repository.MaintenanceConfig{
				KeepLatestMaitenanceJobs: 3,
			},
		}
	)

@@ -239,7 +245,17 @@
	command.Flags().BoolVar(&config.defaultSnapshotMoveData, "default-snapshot-move-data", config.defaultSnapshotMoveData, "Move data by default for all snapshots supporting data movement.")
	command.Flags().BoolVar(&config.disableInformerCache, "disable-informer-cache", config.disableInformerCache, "Disable informer cache for Get calls on restore. With this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable).")
	command.Flags().BoolVar(&config.scheduleSkipImmediately, "schedule-skip-immediately", config.scheduleSkipImmediately, "Skip the first scheduled backup immediately after creating a schedule. Default is false (don't skip).")
	command.Flags().IntVar(&config.maintenanceCfg.KeepLatestMaitenanceJobs, "keep-latest-maintenance-jobs", config.maintenanceCfg.KeepLatestMaitenanceJobs, "Number of latest maintenance jobs to keep for each repository. Default is 3.")
	command.Flags().StringVar(&config.maintenanceCfg.CPURequest, "maintenance-job-cpu-request", config.maintenanceCfg.CPURequest, "CPU request for maintenance job. Default is empty.")
	command.Flags().StringVar(&config.maintenanceCfg.MemoryRequest, "maintenance-job-memory-request", config.maintenanceCfg.MemoryRequest, "Memory request for maintenance job. Default is empty.")
	command.Flags().StringVar(&config.maintenanceCfg.CPULimit, "maintenance-job-cpu-limit", config.maintenanceCfg.CPULimit, "CPU limit for maintenance job. Default is empty.")
	command.Flags().StringVar(&config.maintenanceCfg.MemoryLimit, "maintenance-job-memory-limit", config.maintenanceCfg.MemoryLimit, "Memory limit for maintenance job. Default is empty.")

	// inherited from server command
	config.maintenanceCfg.FormatFlag = config.formatFlag
	config.maintenanceCfg.LogLevelFlag = logLevelFlag

	command.AddCommand(repomaintenance.NewCommand(f))
return command
}
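
repository.MaintenanceConfig is defined in pkg/repository and is not shown on this page. Inferred from the fields the flags above bind, and from the FormatFlag/LogLevelFlag assignments above, it looks approximately like the sketch that follows; the field names come from this diff, everything else is an assumption and the authoritative definition may differ.

package repository

import "github.com/vmware-tanzu/velero/pkg/util/logging"

// Approximate shape of MaintenanceConfig, inferred from the server flags above;
// the authoritative definition lives in pkg/repository and may differ.
type MaintenanceConfig struct {
	KeepLatestMaitenanceJobs int    // --keep-latest-maintenance-jobs
	CPURequest               string // --maintenance-job-cpu-request
	MemoryRequest            string // --maintenance-job-memory-request
	CPULimit                 string // --maintenance-job-cpu-limit
	MemoryLimit              string // --maintenance-job-memory-limit
	LogLevelFlag             *logging.LevelFlag
	FormatFlag               *logging.FormatFlag
}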

@@ -346,6 +362,14 @@ func newServer(f client.Factory, config serverConfig, logger *logrus.Logger) (*s
		cancelFunc()
		return nil, err
	}
	if err := batchv1api.AddToScheme(scheme); err != nil {
		cancelFunc()
		return nil, err
	}
	if err := appsv1.AddToScheme(scheme); err != nil {
		cancelFunc()
		return nil, err
	}

	ctrl.SetLogger(logrusr.New(logger))

@@ -642,7 +666,7 @@ func (s *server) initRepoManager() error {
	s.repoLocker = repository.NewRepoLocker()
	s.repoEnsurer = repository.NewEnsurer(s.mgr.GetClient(), s.logger, s.config.resourceTimeout)

	s.repoManager = repository.NewManager(s.namespace, s.mgr.GetClient(), s.repoLocker, s.repoEnsurer, s.credentialFileStore, s.credentialSecretStore, s.logger)
	s.repoManager = repository.NewManager(s.namespace, s.mgr.GetClient(), s.repoLocker, s.repoEnsurer, s.credentialFileStore, s.credentialSecretStore, s.config.maintenanceCfg, s.logger)

	return nil
}
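
With the maintenance config threaded into repository.NewManager, PruneRepo is expected to run repository maintenance in a dedicated Kubernetes Job rather than in the server process, which is also why batchv1 and appsv1 are registered in the scheme above. The Job-building code itself is not among the files shown here; the sketch below only illustrates, under that assumption and using the MaintenanceConfig fields from the sketch above, how the CPU/memory flag values could be mapped onto the maintenance container's resources.

package repository

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// buildMaintenanceResources is a sketch only: it maps the optional CPU/memory flag
// values onto a container ResourceRequirements, skipping any value left empty.
// The real implementation in pkg/repository may differ.
func buildMaintenanceResources(cfg MaintenanceConfig) (corev1.ResourceRequirements, error) {
	res := corev1.ResourceRequirements{
		Requests: corev1.ResourceList{},
		Limits:   corev1.ResourceList{},
	}
	set := func(list corev1.ResourceList, name corev1.ResourceName, val string) error {
		if val == "" {
			return nil // flag not provided, leave no request/limit for this resource
		}
		q, err := resource.ParseQuantity(val)
		if err != nil {
			return err
		}
		list[name] = q
		return nil
	}
	if err := set(res.Requests, corev1.ResourceCPU, cfg.CPURequest); err != nil {
		return res, err
	}
	if err := set(res.Requests, corev1.ResourceMemory, cfg.MemoryRequest); err != nil {
		return res, err
	}
	if err := set(res.Limits, corev1.ResourceCPU, cfg.CPULimit); err != nil {
		return res, err
	}
	if err := set(res.Limits, corev1.ResourceMemory, cfg.MemoryLimit); err != nil {
		return res, err
	}
	return res, nil
}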
36 changes: 32 additions & 4 deletions pkg/controller/backup_repository_controller.go
@@ -42,8 +42,9 @@ import (
)

const (
	repoSyncPeriod = 5 * time.Minute
	defaultMaintainFrequency = 7 * 24 * time.Hour
	repoSyncPeriod                = 5 * time.Minute
	defaultMaintainFrequency      = 7 * 24 * time.Hour
	defaultMaintainRetryFrequency = 1 * time.Hour
)

type BackupRepoReconciler struct {
@@ -277,16 +278,42 @@ func ensureRepo(repo *velerov1api.BackupRepository, repoManager repository.Manag
	return repoManager.PrepareRepo(repo)
}

func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error {
	log.Debug("backupRepositoryController.runMaintenanceIfDue")
func (r *BackupRepoReconciler) shouldRetryMaintenance(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) bool {
	log.Debug("Checking if maintenance should be retried")

	if req.Status.Message == "" {
		return true
	}

	job, err := repository.GetLatestMaintenanceJob(r.Client, req.Name)
	if err != nil {
		log.WithError(err).Error("error getting latest maintenance job")
		return false
	}

	if job != nil && job.Status.Failed > 0 && job.CreationTimestamp.Add(defaultMaintainRetryFrequency).Before(r.clock.Now()) {
		log.Debug("Latest maintenance job failed and is older than the retry frequency, retrying maintenance")
		return true
	} else if job != nil {
		log.Debugf("Latest maintenance job %s has not failed or is not yet older than the retry frequency, not retrying maintenance", job.Name)
	}

	return false
}

func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error {
	now := r.clock.Now()

	if !dueForMaintenance(req, now) {
		log.Debug("not due for maintenance")
		return nil
	}

	if !r.shouldRetryMaintenance(ctx, req, log) {
		log.Debug("not retrying maintenance")
		return nil
	}

	log.Info("Running maintenance on backup repository")

	// prune failures should be displayed in the `.status.message` field but
@@ -300,6 +327,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
	}

	return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
		rr.Status.Message = ""
		rr.Status.LastMaintenanceTime = &metav1.Time{Time: now}
	})
}
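
repository.GetLatestMaintenanceJob is likewise defined outside the files shown on this page. From the call site above it takes a controller-runtime client plus the BackupRepository name and returns the newest maintenance Job, or nil when none exists. Below is a minimal sketch under those assumptions; the label key used to associate Jobs with a repository is purely hypothetical.

package repository

import (
	"context"

	batchv1 "k8s.io/api/batch/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// GetLatestMaintenanceJob sketch: list maintenance Jobs labeled with the repository
// name and return the most recently created one, or nil if none exist.
// The "velero.io/repo-name" label key is an assumption for illustration only.
func GetLatestMaintenanceJob(cli client.Client, repoName string) (*batchv1.Job, error) {
	jobList := &batchv1.JobList{}
	if err := cli.List(context.Background(), jobList,
		client.MatchingLabels{"velero.io/repo-name": repoName}); err != nil {
		return nil, err
	}
	if len(jobList.Items) == 0 {
		return nil, nil
	}
	latest := jobList.Items[0]
	for _, job := range jobList.Items[1:] {
		if job.CreationTimestamp.After(latest.CreationTimestamp.Time) {
			latest = job
		}
	}
	return &latest, nil
}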
2 changes: 1 addition & 1 deletion pkg/datapath/file_system.go
@@ -184,7 +184,7 @@ func (fs *fileSystemBR) Cancel() {

func (fs *fileSystemBR) boostRepoConnect(ctx context.Context, repositoryType string, credentialGetter *credentials.CredentialGetter) error {
	if repositoryType == velerov1api.BackupRepositoryTypeKopia {
		if err := repoProvider.NewUnifiedRepoProvider(*credentialGetter, repositoryType, fs.log).BoostRepoConnect(ctx, repoProvider.RepoParam{BackupLocation: fs.backupLocation, BackupRepo: fs.backupRepo}); err != nil {
		if err := repoProvider.NewUnifiedRepoProvider(*credentialGetter, repositoryType, fs.client, fs.log).BoostRepoConnect(ctx, repoProvider.RepoParam{BackupLocation: fs.backupLocation, BackupRepo: fs.backupRepo}); err != nil {
			return err
		}
	} else {