Skip to content

Commit

Permalink
NewSystemd handles UnitExists when starting units
Browse files Browse the repository at this point in the history
As done in this runc PR
opencontainers/runc#3782,
UnitExists errors are no longer ignored when starting units. If it
already exists, we attempt to reset the failed unit and retry once.
Otherwise, we fail. See containerd#279
for more context.

Signed-off-by: Matt Merkes <matt.merkes@gmail.com>
  • Loading branch information
mmerkes committed Apr 23, 2023
1 parent fb1932a commit 7a75a95
Showing 1 changed file with 53 additions and 8 deletions.
61 changes: 53 additions & 8 deletions cgroup2/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -865,14 +865,7 @@ func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, e
newSystemdProperty("TasksMax", uint64(resources.Pids.Max)))
}

statusChan := make(chan string, 1)
if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err == nil {
select {
case <-statusChan:
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
}
} else if !isUnitExists(err) {
if err := startUnit(conn, group, properties); err != nil {
return &Manager{}, err
}

Expand All @@ -881,6 +874,58 @@ func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, e
}, nil
}

func startUnit(conn *systemdDbus.Conn, group string, properties []systemdDbus.Property) error {
ctx := context.TODO()
statusChan := make(chan string, 1)
retry := true

attemptStart:
if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err != nil {
if !isUnitExists(err) {
return err
}

// TODO: In runc, it has some scenarios where it doesn't retry and just returns happily.
// https://github.com/kolyshkin/runc/blob/82bc89cd10eb5fa5fdb5d77e41b7ea3c07f9d834/libcontainer/cgroups/systemd/common.go#L140-L145
// Need to figure out if/when that would be appropriate.
if retry {
retry = false
// See https://github.com/opencontainers/runc/pull/3782 for more context and the original change
// made in runc. When a unit of the same name already exists, it may be a leftover failed unit.
// If we reset it once, systemd can try to remove it.
attemptFailedUnitReset(conn, group)
goto attemptStart
}

return err
}

select {
case s := <-statusChan:
close(statusChan)

if s != "done" {
attemptFailedUnitReset(conn, group)
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", group, s)
}
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
// TODO: Question: runc errors out in this case. However, it waits for 30s before timing out rather than 1s. Should
// we do similar here?
// https://github.com/kolyshkin/runc/blob/82bc89cd10eb5fa5fdb5d77e41b7ea3c07f9d834/libcontainer/cgroups/systemd/common.go#L170
}

return nil
}

func attemptFailedUnitReset(conn *systemdDbus.Conn, group string) {
err := conn.ResetFailedUnitContext(context.TODO(), group)

if err != nil {
logrus.Warnf("Unable to reset failed unit: %v", err)
}
}

func LoadSystemd(slice, group string) (*Manager, error) {
if slice == "" {
slice = defaultSlice
Expand Down

0 comments on commit 7a75a95

Please sign in to comment.