Add repository maintenance job #7451

Merged · 1 commit · Mar 28, 2024
1 change: 1 addition & 0 deletions changelogs/unreleased/7451-qiuming-best
@@ -0,0 +1 @@
Add repository maintenance job
11 changes: 11 additions & 0 deletions pkg/cmd/cli/install/install.go
@@ -23,6 +23,7 @@ import (
"strings"
"time"

"github.com/vmware-tanzu/velero/pkg/repository"
"github.com/vmware-tanzu/velero/pkg/uploader"

"github.com/pkg/errors"
@@ -84,6 +85,7 @@ type Options struct {
DefaultSnapshotMoveData bool
DisableInformerCache bool
ScheduleSkipImmediately bool
MaintenanceCfg repository.MaintenanceConfig
}

// BindFlags adds command line values to the options struct.
@@ -128,6 +130,11 @@ func (o *Options) BindFlags(flags *pflag.FlagSet) {
flags.BoolVar(&o.DefaultSnapshotMoveData, "default-snapshot-move-data", o.DefaultSnapshotMoveData, "Bool flag to configure Velero server to move data by default for all snapshots supporting data movement. Optional.")
flags.BoolVar(&o.DisableInformerCache, "disable-informer-cache", o.DisableInformerCache, "Disable informer cache for Get calls on restore. With this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable). Optional.")
flags.BoolVar(&o.ScheduleSkipImmediately, "schedule-skip-immediately", o.ScheduleSkipImmediately, "Skip the first scheduled backup immediately after creating a schedule. Default is false (don't skip).")
flags.IntVar(&o.MaintenanceCfg.KeepLatestMaitenanceJobs, "keep-latest-maintenance-jobs", o.MaintenanceCfg.KeepLatestMaitenanceJobs, "Number of latest maintenance jobs to keep for each repository. Optional.")
flags.StringVar(&o.MaintenanceCfg.CPURequest, "maintenance-job-cpu-request", o.MaintenanceCfg.CPURequest, "CPU request for maintenance jobs. Default is no limit.")
flags.StringVar(&o.MaintenanceCfg.MemRequest, "maintenance-job-mem-request", o.MaintenanceCfg.MemRequest, "Memory request for maintenance jobs. Default is no limit.")
flags.StringVar(&o.MaintenanceCfg.CPULimit, "maintenance-job-cpu-limit", o.MaintenanceCfg.CPULimit, "CPU limit for maintenance jobs. Default is no limit.")
flags.StringVar(&o.MaintenanceCfg.MemLimit, "maintenance-job-mem-limit", o.MaintenanceCfg.MemLimit, "Memory limit for maintenance jobs. Default is no limit.")
}

// NewInstallOptions instantiates a new, default InstallOptions struct.
@@ -157,6 +164,9 @@ func NewInstallOptions() *Options {
DefaultSnapshotMoveData: false,
DisableInformerCache: false,
ScheduleSkipImmediately: false,
MaintenanceCfg: repository.MaintenanceConfig{
KeepLatestMaitenanceJobs: repository.DefaultKeepLatestMaitenanceJobs,
},
}
}

@@ -224,6 +234,7 @@ func (o *Options) AsVeleroOptions() (*install.VeleroOptions, error) {
DefaultSnapshotMoveData: o.DefaultSnapshotMoveData,
DisableInformerCache: o.DisableInformerCache,
ScheduleSkipImmediately: o.ScheduleSkipImmediately,
MaintenanceCfg: o.MaintenanceCfg,
}, nil
}

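The flags above all populate `repository.MaintenanceConfig`, which is not shown in this diff. Below is a minimal sketch of what that struct plausibly looks like, inferred purely from the fields referenced in install.go and server.go (the `Maitenance` spelling follows the identifiers used in this PR; the actual field types and default value in pkg/repository may differ):

```go
// Sketch only — inferred from how the struct is populated in this PR,
// not copied from pkg/repository.
package repository

import "github.com/vmware-tanzu/velero/pkg/util/logging"

// DefaultKeepLatestMaitenanceJobs is assumed to be a small constant such as 3.
const DefaultKeepLatestMaitenanceJobs = 3

type MaintenanceConfig struct {
	// Number of completed maintenance jobs retained per repository.
	KeepLatestMaitenanceJobs int
	// Container resource requests/limits for the maintenance job, expressed as
	// Kubernetes quantity strings (e.g. "500m", "128Mi"); empty means unset.
	CPURequest string
	MemRequest string
	CPULimit   string
	MemLimit   string
	// Log settings inherited from the Velero server (see server.go below).
	LogLevelFlag *logging.LevelFlag
	FormatFlag   *logging.FormatFlag
}
```

At install time these surface as flags such as `--keep-latest-maintenance-jobs=3 --maintenance-job-cpu-limit=500m --maintenance-job-mem-limit=512Mi` (names as bound above).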
179 changes: 179 additions & 0 deletions pkg/cmd/cli/repomantenance/maintenance.go
@@ -0,0 +1,179 @@
package repomantenance

import (
"context"
"fmt"
"os"
"strings"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/vmware-tanzu/velero/internal/credentials"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
velerocli "github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/repository"
"github.com/vmware-tanzu/velero/pkg/repository/provider"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/logging"
)

type Options struct {
RepoName string
BackupStorageLocation string
RepoType string
LogLevelFlag *logging.LevelFlag
FormatFlag *logging.FormatFlag
}

func (o *Options) BindFlags(flags *pflag.FlagSet) {
flags.StringVar(&o.RepoName, "repo-name", "", "namespace of the pod/volume that the snapshot is for")
flags.StringVar(&o.BackupStorageLocation, "backup-storage-location", "", "backup's storage location name")
flags.StringVar(&o.RepoType, "repo-type", velerov1api.BackupRepositoryTypeKopia, "type of the repository where the snapshot is stored")
flags.Var(o.LogLevelFlag, "log-level", fmt.Sprintf("The level at which to log. Valid values are %s.", strings.Join(o.LogLevelFlag.AllowedValues(), ", ")))
flags.Var(o.FormatFlag, "log-format", fmt.Sprintf("The format for log output. Valid values are %s.", strings.Join(o.FormatFlag.AllowedValues(), ", ")))
}

func NewCommand(f velerocli.Factory) *cobra.Command {
o := &Options{
LogLevelFlag: logging.LogLevelFlag(logrus.InfoLevel),
FormatFlag: logging.NewFormatFlag(),
}
cmd := &cobra.Command{
Use: "repo-maintenance",
Hidden: true,
Short: "VELERO INTERNAL COMMAND ONLY - not intended to be run directly by users",
Run: func(c *cobra.Command, args []string) {
o.Run(f)
},
}

o.BindFlags(cmd.Flags())
return cmd
}

func (o *Options) Run(f velerocli.Factory) {
logger := logging.DefaultLogger(o.LogLevelFlag.Parse(), o.FormatFlag.Parse())
logger.SetOutput(os.Stdout)

pruneError := o.runRepoPrune(f, f.Namespace(), logger)
defer func() {
if pruneError != nil {
os.Exit(1)
}
}()

if pruneError != nil {
logger.WithError(pruneError).Error("An error occurred when running repo prune")
terminationLogFile, err := os.Create("/dev/termination-log")
if err != nil {
logger.WithError(err).Error("Failed to create termination log file")
return
}
defer terminationLogFile.Close()

if _, errWrite := terminationLogFile.WriteString(fmt.Sprintf("An error occurred: %v", pruneError)); errWrite != nil {
logger.WithError(errWrite).Error("Failed to write error to termination log file")
}
}
}

func (o *Options) initClient(f velerocli.Factory) (client.Client, error) {
scheme := runtime.NewScheme()
err := velerov1api.AddToScheme(scheme)
if err != nil {
return nil, errors.Wrap(err, "failed to add velero scheme")
}

err = v1.AddToScheme(scheme)
if err != nil {
return nil, errors.Wrap(err, "failed to add api core scheme")
}

config, err := f.ClientConfig()
if err != nil {
return nil, errors.Wrap(err, "failed to get client config")
}

cli, err := client.New(config, client.Options{
Scheme: scheme,
})
if err != nil {
return nil, errors.Wrap(err, "failed to create client")
}

return cli, nil
}

func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger logrus.FieldLogger) error {
cli, err := o.initClient(f)
if err != nil {
return err
}

credentialFileStore, err := credentials.NewNamespacedFileStore(
cli,
namespace,
"/tmp/credentials",
filesystem.NewFileSystem(),
)
if err != nil {
return errors.Wrap(err, "failed to create namespaced file store")
}

credentialSecretStore, err := credentials.NewNamespacedSecretStore(cli, namespace)
if err != nil {
return errors.Wrap(err, "failed to create namespaced secret store")
}

var repoProvider provider.Provider
if o.RepoType == velerov1api.BackupRepositoryTypeRestic {
repoProvider = provider.NewResticRepositoryProvider(credentialFileStore, filesystem.NewFileSystem(), logger)
} else {
repoProvider = provider.NewUnifiedRepoProvider(
credentials.CredentialGetter{
FromFile: credentialFileStore,
FromSecret: credentialSecretStore,
}, o.RepoType, cli, logger)
}

// backupRepository
repo, err := repository.GetBackupRepository(context.Background(), cli, namespace,
repository.BackupRepositoryKey{
VolumeNamespace: o.RepoName,
BackupLocation: o.BackupStorageLocation,
RepositoryType: o.RepoType,
}, true)

if err != nil {
return errors.Wrap(err, "failed to get backup repository")
}

// bsl
bsl := &velerov1api.BackupStorageLocation{}
err = cli.Get(context.Background(), client.ObjectKey{Namespace: namespace, Name: repo.Spec.BackupStorageLocation}, bsl)
if err != nil {
return errors.Wrap(err, "failed to get backup storage location")
}

para := provider.RepoParam{
BackupRepo: repo,
BackupLocation: bsl,
}

err = repoProvider.BoostRepoConnect(context.Background(), para)
if err != nil {
return errors.Wrap(err, "failed to boost repo connect")
}

err = repoProvider.PruneRepo(context.Background(), para)
if err != nil {
return errors.Wrap(err, "failed to prune repo")
}
return nil
}
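One detail worth calling out: on failure the command writes the error to `/dev/termination-log`, the default `terminationMessagePath` for Kubernetes containers, so the kubelet copies it into the terminated container's status. Whatever launches the maintenance Job can then surface the prune failure without scraping pod logs. A hypothetical helper (not part of this PR) showing how a caller could read that message back from the finished pod:

```go
import corev1 "k8s.io/api/core/v1"

// terminationMessage is a hypothetical helper, not part of this PR: it returns
// the message the kubelet copied from /dev/termination-log into the terminated
// container's status, or "" if no container reported one.
func terminationMessage(pod *corev1.Pod) string {
	for _, cs := range pod.Status.ContainerStatuses {
		if cs.State.Terminated != nil && cs.State.Terminated.Message != "" {
			return cs.State.Terminated.Message
		}
	}
	return ""
}
```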
26 changes: 24 additions & 2 deletions pkg/cmd/server/server.go
@@ -32,6 +32,8 @@
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
appsv1 "k8s.io/api/apps/v1"
batchv1api "k8s.io/api/batch/v1"
corev1api "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -136,6 +138,7 @@
defaultSnapshotMoveData bool
disableInformerCache bool
scheduleSkipImmediately bool
maintenanceCfg repository.MaintenanceConfig
}

func NewCommand(f client.Factory) *cobra.Command {
@@ -167,6 +170,9 @@
defaultSnapshotMoveData: false,
disableInformerCache: defaultDisableInformerCache,
scheduleSkipImmediately: false,
maintenanceCfg: repository.MaintenanceConfig{
KeepLatestMaitenanceJobs: repository.DefaultKeepLatestMaitenanceJobs,
},
}
)

@@ -240,7 +246,15 @@
command.Flags().BoolVar(&config.defaultSnapshotMoveData, "default-snapshot-move-data", config.defaultSnapshotMoveData, "Move data by default for all snapshots supporting data movement.")
command.Flags().BoolVar(&config.disableInformerCache, "disable-informer-cache", config.disableInformerCache, "Disable informer cache for Get calls on restore. With this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable).")
command.Flags().BoolVar(&config.scheduleSkipImmediately, "schedule-skip-immediately", config.scheduleSkipImmediately, "Skip the first scheduled backup immediately after creating a schedule. Default is false (don't skip).")

command.Flags().IntVar(&config.maintenanceCfg.KeepLatestMaitenanceJobs, "keep-latest-maintenance-jobs", config.maintenanceCfg.KeepLatestMaitenanceJobs, "Number of latest maintenance jobs to keep for each repository. Optional.")
command.Flags().StringVar(&config.maintenanceCfg.CPURequest, "maintenance-job-cpu-request", config.maintenanceCfg.CPURequest, "CPU request for maintenance job. Default is no limit.")
command.Flags().StringVar(&config.maintenanceCfg.MemRequest, "maintenance-job-mem-request", config.maintenanceCfg.MemRequest, "Memory request for maintenance job. Default is no limit.")
command.Flags().StringVar(&config.maintenanceCfg.CPULimit, "maintenance-job-cpu-limit", config.maintenanceCfg.CPULimit, "CPU limit for maintenance job. Default is no limit.")
command.Flags().StringVar(&config.maintenanceCfg.MemLimit, "maintenance-job-mem-limit", config.maintenanceCfg.MemLimit, "Memory limit for maintenance job. Default is no limit.")

// maintenance job log setting inherited from velero server
config.maintenanceCfg.FormatFlag = config.formatFlag
config.maintenanceCfg.LogLevelFlag = logLevelFlag
return command
}

@@ -347,6 +361,14 @@
cancelFunc()
return nil, err
}
if err := batchv1api.AddToScheme(scheme); err != nil {
cancelFunc()
return nil, err
}
if err := appsv1.AddToScheme(scheme); err != nil {
cancelFunc()
return nil, err
}

ctrl.SetLogger(logrusr.New(logger))

@@ -647,7 +669,7 @@
s.repoLocker = repository.NewRepoLocker()
s.repoEnsurer = repository.NewEnsurer(s.mgr.GetClient(), s.logger, s.config.resourceTimeout)

s.repoManager = repository.NewManager(s.namespace, s.mgr.GetClient(), s.repoLocker, s.repoEnsurer, s.credentialFileStore, s.credentialSecretStore, s.logger)
s.repoManager = repository.NewManager(s.namespace, s.mgr.GetClient(), s.repoLocker, s.repoEnsurer, s.credentialFileStore, s.credentialSecretStore, s.config.maintenanceCfg, s.logger)

return nil
}
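The new `batchv1`/`appsv1` scheme registrations and the extra `maintenanceCfg` argument to `repository.NewManager` indicate that repository maintenance now runs as a Kubernetes Job executing the hidden `velero repo-maintenance` command shown earlier. The Job-building code lives in `pkg/repository` and is not part of these hunks; the sketch below is only an assumption of roughly how such a Job could be assembled from `MaintenanceConfig` (the function name, image/binary path, naming scheme, and backoff policy are all hypothetical):

```go
// Sketch under stated assumptions; the real implementation lives in pkg/repository.
package repository

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func buildMaintenanceJob(ns, repoName, bsl, repoType, image string, cfg MaintenanceConfig) (*batchv1.Job, error) {
	// Translate the optional string quantities into ResourceRequirements;
	// empty strings mean "leave unset".
	res := corev1.ResourceRequirements{
		Requests: corev1.ResourceList{},
		Limits:   corev1.ResourceList{},
	}
	set := func(list corev1.ResourceList, name corev1.ResourceName, val string) error {
		if val == "" {
			return nil
		}
		q, err := resource.ParseQuantity(val)
		if err != nil {
			return err
		}
		list[name] = q
		return nil
	}
	if err := set(res.Requests, corev1.ResourceCPU, cfg.CPURequest); err != nil {
		return nil, err
	}
	if err := set(res.Requests, corev1.ResourceMemory, cfg.MemRequest); err != nil {
		return nil, err
	}
	if err := set(res.Limits, corev1.ResourceCPU, cfg.CPULimit); err != nil {
		return nil, err
	}
	if err := set(res.Limits, corev1.ResourceMemory, cfg.MemLimit); err != nil {
		return nil, err
	}

	backoff := int32(0) // fail fast; the controller records the error and retries on the next due cycle
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: repoName + "-maintain-", // naming scheme is assumed
			Namespace:    ns,
		},
		Spec: batchv1.JobSpec{
			BackoffLimit: &backoff,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{{
						Name:    "velero-repo-maintenance",
						Image:   image,
						Command: []string{"/velero"}, // binary path assumed
						Args: []string{
							"repo-maintenance",
							"--repo-name=" + repoName,
							"--repo-type=" + repoType,
							"--backup-storage-location=" + bsl,
						},
						Resources: res,
					}},
				},
			},
		},
	}, nil
}
```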
2 changes: 2 additions & 0 deletions pkg/cmd/velero/velero.go
@@ -26,6 +26,7 @@ import (
"k8s.io/klog/v2"

"github.com/vmware-tanzu/velero/pkg/cmd/cli/debug"
"github.com/vmware-tanzu/velero/pkg/cmd/cli/repomantenance"

"github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/cmd/cli/backup"
@@ -122,6 +123,7 @@ operations can also be performed as 'velero backup get' and 'velero schedule cre
backuplocation.NewCommand(f),
snapshotlocation.NewCommand(f),
debug.NewCommand(f),
repomantenance.NewCommand(f),
)

// init and add the klog flags
28 changes: 19 additions & 9 deletions pkg/controller/backup_repository_controller.go
@@ -189,10 +189,16 @@
}

switch backupRepo.Status.Phase {
case velerov1api.BackupRepositoryPhaseNotReady:
ready, err := r.checkNotReadyRepo(ctx, backupRepo, log)
if err != nil {
return ctrl.Result{}, err
} else if !ready {
return ctrl.Result{}, nil
}
fallthrough

case velerov1api.BackupRepositoryPhaseReady:
return ctrl.Result{}, r.runMaintenanceIfDue(ctx, backupRepo, log)
case velerov1api.BackupRepositoryPhaseNotReady:
return ctrl.Result{}, r.checkNotReadyRepo(ctx, backupRepo, log)
}

return ctrl.Result{}, nil
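The reworked switch leans on Go's `fallthrough`, which unconditionally enters the next case body: a repository that has just passed its readiness check proceeds straight into the maintenance check within the same reconcile, instead of waiting for the next requeue. A minimal standalone illustration of that control flow (phase strings and helpers are placeholders, not Velero code):

```go
package main

import "fmt"

// reconcile mimics the phase handling above; placeholders only, not Velero code.
func reconcile(phase string, becameReady bool) string {
	switch phase {
	case "NotReady":
		if !becameReady {
			return "still not ready; skip maintenance this cycle"
		}
		fallthrough // repo just turned Ready: handle maintenance in the same pass
	case "Ready":
		return "run maintenance if due"
	}
	return "no-op"
}

func main() {
	fmt.Println(reconcile("NotReady", true))  // run maintenance if due
	fmt.Println(reconcile("NotReady", false)) // still not ready; skip maintenance this cycle
	fmt.Println(reconcile("Ready", false))    // run maintenance if due
}
```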
@@ -277,8 +283,6 @@
}

func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error {
log.Debug("backupRepositoryController.runMaintenanceIfDue")

now := r.clock.Now()

if !dueForMaintenance(req, now) {
@@ -291,6 +295,7 @@
// prune failures should be displayed in the `.status.message` field but
// should not cause the repo to move to `NotReady`.
log.Debug("Pruning repo")

if err := r.repositoryManager.PruneRepo(req); err != nil {
log.WithError(err).Warn("error pruning repository")
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
@@ -299,6 +304,7 @@
}

return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
rr.Status.Message = ""
rr.Status.LastMaintenanceTime = &metav1.Time{Time: now}
})
}
@@ -307,28 +313,32 @@
return req.Status.LastMaintenanceTime == nil || req.Status.LastMaintenanceTime.Add(req.Spec.MaintenanceFrequency.Duration).Before(now)
}

func (r *BackupRepoReconciler) checkNotReadyRepo(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error {
func (r *BackupRepoReconciler) checkNotReadyRepo(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) (bool, error) {
log.Info("Checking backup repository for readiness")

repoIdentifier, err := r.getIdentiferByBSL(ctx, req)
if err != nil {
return r.patchBackupRepository(ctx, req, repoNotReady(err.Error()))
return false, r.patchBackupRepository(ctx, req, repoNotReady(err.Error()))

}

if repoIdentifier != req.Spec.ResticIdentifier {
if err := r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
rr.Spec.ResticIdentifier = repoIdentifier
}); err != nil {
return err
return false, err

}
}

// we need to ensure it (first check, if check fails, attempt to init)
// because we don't know if it's been successfully initialized yet.
if err := ensureRepo(req, r.repositoryManager); err != nil {
return r.patchBackupRepository(ctx, req, repoNotReady(err.Error()))
return false, r.patchBackupRepository(ctx, req, repoNotReady(err.Error()))
}

err = r.patchBackupRepository(ctx, req, repoReady())
if err != nil {
return false, err

}
return r.patchBackupRepository(ctx, req, repoReady())
return true, nil
}

func repoNotReady(msg string) func(*velerov1api.BackupRepository) {