diff --git a/changelogs/unreleased/1708-skriss b/changelogs/unreleased/1708-skriss new file mode 100644 index 0000000000..a7bd811cea --- /dev/null +++ b/changelogs/unreleased/1708-skriss @@ -0,0 +1 @@ +remove any stale locks from restic repositories every 5m diff --git a/pkg/controller/restic_repository_controller.go b/pkg/controller/restic_repository_controller.go index b74814dc98..9d6f369603 100644 --- a/pkg/controller/restic_repository_controller.go +++ b/pkg/controller/restic_repository_controller.go @@ -18,6 +18,7 @@ package controller import ( "encoding/json" + "strings" "time" jsonpatch "github.com/evanphx/json-patch" @@ -120,9 +121,19 @@ func (c *resticRepositoryController) processQueueItem(key string) error { // Don't mutate the shared cache reqCopy := req.DeepCopy() - switch req.Status.Phase { - case "", v1.ResticRepositoryPhaseNew: + if req.Status.Phase == "" || req.Status.Phase == v1.ResticRepositoryPhaseNew { return c.initializeRepo(reqCopy, log) + } + + // If the repository is ready or not-ready, check it for stale locks, but if + // this fails for any reason, it's non-critical so we still continue on to the + // rest of the "process" logic. + log.Debug("Checking repository for stale locks") + if err := c.repositoryManager.UnlockRepo(reqCopy); err != nil { + log.WithError(err).Error("Error checking repository for stale locks") + } + + switch req.Status.Phase { case v1.ResticRepositoryPhaseReady: return c.runMaintenanceIfDue(reqCopy, log) case v1.ResticRepositoryPhaseNotReady: @@ -162,14 +173,23 @@ func (c *resticRepositoryController) initializeRepo(req *v1.ResticRepository, lo }) } -// ensureRepo first tries to connect to the repo, and returns if it succeeds. If it fails, -// it attempts to init the repo, and returns the result. +// ensureRepo checks to see if a repository exists, and attempts to initialize it if +// it does not exist. An error is returned if the repository can't be connected to +// or initialized. func ensureRepo(repo *v1.ResticRepository, repoManager restic.RepositoryManager) error { - if repoManager.ConnectToRepo(repo) == nil { - return nil + if err := repoManager.ConnectToRepo(repo); err != nil { + // If the repository has not yet been initialized, the error message will always include + // the following string. This is the only scenario where we should try to initialize it. + // Other errors (e.g. "already locked") should be returned as-is since the repository + // does already exist, but it can't be connected to. + if strings.Contains(err.Error(), "Is there a repository at the following location?") { + return repoManager.InitRepo(repo) + } + + return err } - return repoManager.InitRepo(repo) + return nil } func (c *resticRepositoryController) runMaintenanceIfDue(req *v1.ResticRepository, log logrus.FieldLogger) error { diff --git a/pkg/restic/command_factory.go b/pkg/restic/command_factory.go index da75f313d6..fd913125f3 100644 --- a/pkg/restic/command_factory.go +++ b/pkg/restic/command_factory.go @@ -1,5 +1,5 @@ /* -Copyright 2018 the Velero contributors. +Copyright 2018, 2019 the Velero contributors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -112,3 +112,10 @@ func ForgetCommand(repoIdentifier, snapshotID string) *Command { Args: []string{snapshotID}, } } + +func UnlockCommand(repoIdentifier string) *Command { + return &Command{ + Command: "unlock", + RepoIdentifier: repoIdentifier, + } +} diff --git a/pkg/restic/repository_manager.go b/pkg/restic/repository_manager.go index fe7b32a970..31925b7369 100644 --- a/pkg/restic/repository_manager.go +++ b/pkg/restic/repository_manager.go @@ -1,5 +1,5 @@ /* -Copyright 2018 the Velero contributors. +Copyright 2018, 2019 the Velero contributors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -55,6 +55,9 @@ type RepositoryManager interface { // PruneRepo deletes unused data from a repo. PruneRepo(repo *velerov1api.ResticRepository) error + // UnlockRepo removes stale locks from a repo. + UnlockRepo(repo *velerov1api.ResticRepository) error + // Forget removes a snapshot from the list of // available snapshots in a repo. Forget(context.Context, SnapshotIdentifier) error @@ -213,6 +216,14 @@ func (rm *repositoryManager) PruneRepo(repo *velerov1api.ResticRepository) error return rm.exec(PruneCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation) } +func (rm *repositoryManager) UnlockRepo(repo *velerov1api.ResticRepository) error { + // restic unlock requires a non-exclusive lock + rm.repoLocker.Lock(repo.Name) + defer rm.repoLocker.Unlock(repo.Name) + + return rm.exec(UnlockCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation) +} + func (rm *repositoryManager) Forget(ctx context.Context, snapshot SnapshotIdentifier) error { // We can't wait for this in the constructor, because this informer is coming // from the shared informer factory, which isn't started until *after* the repo